diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_neon.asm | 356 | ||||
-rw-r--r-- | vp9/common/vp9_alloccommon.c | 5 | ||||
-rw-r--r-- | vp9/common/vp9_common_data.c | 43 | ||||
-rw-r--r-- | vp9/common/vp9_common_data.h | 8 | ||||
-rw-r--r-- | vp9/common/vp9_enums.h | 4 | ||||
-rw-r--r-- | vp9/common/vp9_loopfilter.c | 2 | ||||
-rw-r--r-- | vp9/common/vp9_loopfilter.h | 2 | ||||
-rw-r--r-- | vp9/common/vp9_onyx.h | 8 | ||||
-rw-r--r-- | vp9/common/vp9_onyxc_int.h | 3 | ||||
-rw-r--r-- | vp9/common/vp9_pred_common.c | 2 | ||||
-rw-r--r-- | vp9/common/vp9_seg_common.h | 12 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodemv.c | 2 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodframe.c | 4 | ||||
-rw-r--r-- | vp9/encoder/vp9_bitstream.c | 44 | ||||
-rw-r--r-- | vp9/encoder/vp9_block.h | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 195 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodemb.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodemv.c | 18 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 75 | ||||
-rw-r--r-- | vp9/encoder/vp9_onyx_if.c | 72 | ||||
-rw-r--r-- | vp9/encoder/vp9_onyx_int.h | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 125 | ||||
-rw-r--r-- | vp9/encoder/vp9_segmentation.c | 8 |
23 files changed, 587 insertions, 417 deletions
diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_neon.asm index 4fe1a6ac6..8b4fe5dcc 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_neon.asm +++ b/vp9/common/arm/neon/vp9_loopfilter_neon.asm @@ -37,13 +37,14 @@ |vp9_loop_filter_horizontal_edge_neon| PROC push {lr} - ldr r12, [sp,#8] ; load count + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + ldr r2, [sp, #4] ; load thresh add r1, r1, r1 ; double pitch + cmp r12, #0 beq end_vp9_lf_h_edge - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #4] ; load thresh vld1.8 {d1[]}, [r3] ; duplicate *limit vld1.8 {d2[]}, [r2] ; duplicate *thresh @@ -99,18 +100,18 @@ end_vp9_lf_h_edge |vp9_loop_filter_vertical_edge_neon| PROC push {lr} - ldr r12, [sp,#8] ; load count + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #4] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns cmp r12, #0 beq end_vp9_lf_v_edge - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #4] ; load thresh - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh + vld1.8 {d2[]}, [r3] ; duplicate *thresh count_lf_v_loop - sub r2, r0, #4 ; move s pointer down by 4 columns - vld1.u8 {d3}, [r2], r1 ; load s data vld1.u8 {d4}, [r2], r1 vld1.u8 {d5}, [r2], r1 @@ -152,6 +153,7 @@ count_lf_v_loop add r0, r0, r1, lsl #3 ; s += pitch * 8 subs r12, r12, #1 + subne r2, r0, #4 ; move s pointer down by 4 columns bne count_lf_v_loop end_vp9_lf_v_edge @@ -163,6 +165,7 @@ end_vp9_lf_v_edge ; necessary load, transpose (if necessary) and store. The function does not use ; registers d8-d15. ; +; Inputs: ; r0-r3, r12 PRESERVE ; d0 blimit ; d1 limit @@ -175,39 +178,48 @@ end_vp9_lf_v_edge ; d16 q1 ; d17 q2 ; d18 q3 +; +; Outputs: +; d4 op1 +; d5 op0 +; d6 oq0 +; d7 oq1 |vp9_loop_filter_neon| PROC ; filter_mask - vabd.u8 d19, d3, d4 ; abs(p3 - p2) - vabd.u8 d20, d4, d5 ; abs(p2 - p1) - vabd.u8 d21, d5, d6 ; abs(p1 - p0) - vabd.u8 d22, d16, d7 ; abs(q1 - q0) - vabd.u8 d3, d17, d16 ; abs(q2 - q1) - vabd.u8 d4, d18, d17 ; abs(q3 - q2) + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) ; only compare the largest value to limit - vmax.u8 d19, d19, d20 - vmax.u8 d20, d21, d22 - vmax.u8 d3, d3, d4 - vmax.u8 d23, d19, d20 + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) vabd.u8 d17, d6, d7 ; abs(p0 - q0) + vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) + + vmov.u8 d18, #0x80 + + vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) + ; hevmask vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 d23, d23, d3 - - vmov.u8 d18, #0x80 + vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 - ; abs () > limit - vcge.u8 d23, d1, d23 + veor d7, d7, d18 ; qs0 + + vcge.u8 d23, d1, d23 ; abs(m1) > limit ; filter() function ; convert to signed - veor d7, d7, d18 ; qs0 + vshr.u8 d28, d28, #1 ; a = a / 2 veor d6, d6, d18 ; ps0 @@ -244,19 +256,20 @@ end_vp9_lf_v_edge vshr.s8 d28, d28, #3 ; filter2 >>= 3 vshr.s8 d27, d27, #3 ; filter1 >>= 3 - vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) - ; outer tap adjustments: ++filter >> 1 - vrshr.s8 d27, d27, #1 + ; outer tap adjustments + vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 + + veor d6, d26, d18 ; *oq0 = u^0x80 + vbic d27, d27, d22 ; filter &= ~hev vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) veor d5, d19, d18 ; *op0 = u^0x80 - veor d6, d26, d18 ; *oq0 = u^0x80 veor d4, d21, d18 ; *op1 = u^0x80 veor d7, d20, d18 ; *oq1 = u^0x80 @@ -277,13 +290,14 @@ end_vp9_lf_v_edge |vp9_mbloop_filter_horizontal_edge_neon| PROC push {r4-r5, lr} - ldr r12, [sp,#16] ; load count + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #16] ; load count + ldr r2, [sp, #12] ; load thresh add r1, r1, r1 ; double pitch + cmp r12, #0 beq end_vp9_mblf_h_edge - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #12] ; load thresh vld1.8 {d1[]}, [r3] ; duplicate *limit vld1.8 {d2[]}, [r2] ; duplicate *thresh @@ -305,12 +319,12 @@ count_mblf_h_loop bl vp9_mbloop_filter_neon - vst1.u8 {d2}, [r2@64], r1 ; store op2 - vst1.u8 {d3}, [r3@64], r1 ; store op1 - vst1.u8 {d4}, [r2@64], r1 ; store op0 - vst1.u8 {d5}, [r3@64], r1 ; store oq0 - vst1.u8 {d6}, [r2@64], r1 ; store oq1 - vst1.u8 {d7}, [r3@64], r1 ; store oq2 + vst1.u8 {d0}, [r2@64], r1 ; store op2 + vst1.u8 {d1}, [r3@64], r1 ; store op1 + vst1.u8 {d2}, [r2@64], r1 ; store op0 + vst1.u8 {d3}, [r3@64], r1 ; store oq0 + vst1.u8 {d4}, [r2@64], r1 ; store oq1 + vst1.u8 {d5}, [r3@64], r1 ; store oq2 add r0, r0, #8 subs r12, r12, #1 @@ -337,18 +351,18 @@ end_vp9_mblf_h_edge |vp9_mbloop_filter_vertical_edge_neon| PROC push {r4-r5, lr} - ldr r12, [sp,#16] ; load count + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #16] ; load count + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #12] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns cmp r12, #0 beq end_vp9_mblf_v_edge - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r2, [sp, #12] ; load thresh - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh + vld1.8 {d2[]}, [r3] ; duplicate *thresh count_mblf_v_loop - sub r2, r0, #4 ; move s pointer down by 4 columns - vld1.u8 {d3}, [r2], r1 ; load s data vld1.u8 {d4}, [r2], r1 vld1.u8 {d5}, [r2], r1 @@ -380,27 +394,28 @@ count_mblf_v_loop bl vp9_mbloop_filter_neon ;store op2, op1, op0, oq0 - vst4.8 {d2[0], d3[0], d4[0], d5[0]}, [r2], r1 - vst4.8 {d2[1], d3[1], d4[1], d5[1]}, [r2], r1 - vst4.8 {d2[2], d3[2], d4[2], d5[2]}, [r2], r1 - vst4.8 {d2[3], d3[3], d4[3], d5[3]}, [r2], r1 - vst4.8 {d2[4], d3[4], d4[4], d5[4]}, [r2], r1 - vst4.8 {d2[5], d3[5], d4[5], d5[5]}, [r2], r1 - vst4.8 {d2[6], d3[6], d4[6], d5[6]}, [r2], r1 - vst4.8 {d2[7], d3[7], d4[7], d5[7]}, [r2] + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r2], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r2], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r2], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r2], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r2], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r2], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r2], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r2] ;store oq1, oq2 - vst2.8 {d6[0], d7[0]}, [r3], r1 - vst2.8 {d6[1], d7[1]}, [r3], r1 - vst2.8 {d6[2], d7[2]}, [r3], r1 - vst2.8 {d6[3], d7[3]}, [r3], r1 - vst2.8 {d6[4], d7[4]}, [r3], r1 - vst2.8 {d6[5], d7[5]}, [r3], r1 - vst2.8 {d6[6], d7[6]}, [r3], r1 - vst2.8 {d6[7], d7[7]}, [r3] + vst2.8 {d4[0], d5[0]}, [r3], r1 + vst2.8 {d4[1], d5[1]}, [r3], r1 + vst2.8 {d4[2], d5[2]}, [r3], r1 + vst2.8 {d4[3], d5[3]}, [r3], r1 + vst2.8 {d4[4], d5[4]}, [r3], r1 + vst2.8 {d4[5], d5[5]}, [r3], r1 + vst2.8 {d4[6], d5[6]}, [r3], r1 + vst2.8 {d4[7], d5[7]}, [r3] add r0, r0, r1, lsl #3 ; s += pitch * 8 subs r12, r12, #1 + subne r2, r0, #4 ; move s pointer down by 4 columns bne count_mblf_v_loop end_vp9_mblf_v_edge @@ -412,6 +427,7 @@ end_vp9_mblf_v_edge ; necessary load, transpose (if necessary) and store. The function does not use ; registers d8-d15. ; +; Inputs: ; r0-r3, r12 PRESERVE ; d0 blimit ; d1 limit @@ -424,22 +440,38 @@ end_vp9_mblf_v_edge ; d16 q1 ; d17 q2 ; d18 q3 +; +; Outputs: +; d0 op2 +; d1 op1 +; d2 op0 +; d3 oq0 +; d4 oq1 +; d5 oq2 |vp9_mbloop_filter_neon| PROC ; filter_mask - vabd.u8 d19, d3, d4 ; abs(p3 - p2) - vabd.u8 d20, d4, d5 ; abs(p2 - p1) - vabd.u8 d21, d5, d6 ; abs(p1 - p0) - vabd.u8 d22, d16, d7 ; abs(q1 - q0) - vabd.u8 d23, d17, d16 ; abs(q2 - q1) - vabd.u8 d24, d18, d17 ; abs(q3 - q2) + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d23, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d24, d18, d17 ; m6 = abs(q3 - q2) ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) - vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) - vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d25, d6, d4 ; m7 = abs(p0 - p2) + + vmax.u8 d23, d23, d24 ; m3 = max(m5, m6) + + vabd.u8 d26, d7, d17 ; m8 = abs(q0 - q2) + vmax.u8 d19, d19, d20 - vabd.u8 d24, d6, d7 ; abs(p0 - q0) + vabd.u8 d24, d6, d7 ; m9 = abs(p0 - q0) + vabd.u8 d27, d3, d6 ; m10 = abs(p3 - p0) + vabd.u8 d28, d18, d7 ; m11 = abs(q3 - q0) vmax.u8 d19, d19, d23 @@ -449,30 +481,35 @@ end_vp9_mblf_v_edge ; abs () > limit vcge.u8 d19, d1, d19 - ; flatmask4 - vabd.u8 d25, d6, d4 ; abs(p0 - p2) - vabd.u8 d26, d7, d17 ; abs(q0 - q2) - vabd.u8 d27, d3, d6 ; abs(p3 - p0) - vabd.u8 d28, d18, d7 ; abs(q3 - q0) - ; only compare the largest value to thresh - vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) - vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) - vmax.u8 d25, d25, d26 - vmax.u8 d20, d20, d25 + vmax.u8 d25, d25, d26 ; m4 = max(m7, m8) + vmax.u8 d26, d27, d28 ; m5 = max(m10, m11) vshr.u8 d23, d23, #1 ; a = a / 2 + + vmax.u8 d25, d25, d26 ; m4 = max(m4, m5) + vqadd.u8 d24, d24, d23 ; a = b + a + vmax.u8 d20, d20, d25 ; m2 = max(m2, m4) + vmov.u8 d23, #1 vcge.u8 d24, d0, d24 ; a > blimit + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + vcge.u8 d20, d23, d20 ; flat vand d19, d19, d24 ; mask + vcgt.u8 d23, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + vand d20, d20, d19 ; flat & mask + vmov.u8 d22, #0x80 + + vorr d23, d21, d23 ; hev + ; This instruction will truncate the "flat & mask" masks down to 4 bits ; each to fit into one 32 bit arm register. The values are stored in ; q10.64[0]. @@ -480,35 +517,30 @@ end_vp9_mblf_v_edge vmov.u32 r4, d30[0] ; flat & mask 4bits adds r5, r4, #1 ; Check for all 1's + + ; If mask and flat are 1's for all vectors, then we only need to execute + ; the power branch for all vectors. beq power_branch_only cmp r4, #0 ; Check for 0, set flag for later - ; hevmask - vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - vorr d21, d21, d22 ; hev - - vmov.u8 d22, #0x80 - ; mbfilter() function - ; filter() function ; convert to signed - veor d23, d7, d22 ; qs0 + veor d21, d7, d22 ; qs0 veor d24, d6, d22 ; ps0 veor d25, d5, d22 ; ps1 veor d26, d16, d22 ; qs1 vmov.u8 d27, #3 - vsub.s8 d28, d23, d24 ; ( qs0 - ps0) + vsub.s8 d28, d21, d24 ; ( qs0 - ps0) vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) - vand d29, d29, d21 ; filter &= hev + vand d29, d29, d23 ; filter &= hev vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) @@ -525,80 +557,96 @@ end_vp9_mblf_v_edge vshr.s8 d29, d29, #3 ; filter1 >>= 3 vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) - vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) + vqsub.s8 d21, d21, d29 ; oq0 = clamp(qs0 - filter1) ; outer tap adjustments: ++filter1 >> 1 vrshr.s8 d29, d29, #1 - vbic d29, d29, d21 ; filter &= ~hev + vbic d29, d29, d23 ; filter &= ~hev vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + ; If mask and flat are 0's for all vectors, then we only need to execute + ; the filter branch for all vectors. beq filter_branch_only + ; If mask and flat are mixed then we must perform both branches and + ; combine the data. veor d24, d24, d22 ; *f_op0 = u^0x80 - veor d23, d23, d22 ; *f_oq0 = u^0x80 + veor d21, d21, d22 ; *f_oq0 = u^0x80 veor d25, d25, d22 ; *f_op1 = u^0x80 veor d26, d26, d22 ; *f_oq1 = u^0x80 - ; mbfilter flat && mask branch - ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's - ; and using vibt on the q's? - vmov.u8 d21, #2 - vaddl.u8 q14, d6, d7 ; op2 = p0 + q0 - vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 - vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 - vaddw.u8 q14, d5 ; op2 += p1 + ; At this point we have already executed the filter branch. The filter + ; branch does not set op2 or oq2, so use p2 and q2. Execute the power + ; branch and combine the data. + vmov.u8 d23, #2 + vaddl.u8 q14, d6, d7 ; r_op2 = p0 + q0 + vmlal.u8 q14, d3, d27 ; r_op2 += p3 * 3 + vmlal.u8 q14, d4, d23 ; r_op2 += p2 * 2 + + vbif d0, d4, d20 ; op2 |= p2 & ~(flat & mask) + + vaddw.u8 q14, d5 ; r_op2 += p1 + + vbif d1, d25, d20 ; op1 |= f_op1 & ~(flat & mask) + vqrshrn.u16 d30, q14, #3 ; r_op2 - vsubw.u8 q14, d3 ; op1 = op2 - p3 - vsubw.u8 q14, d4 ; op1 -= p2 - vaddw.u8 q14, d5 ; op1 += p1 - vaddw.u8 q14, d16 ; op1 += q1 + vsubw.u8 q14, d3 ; r_op1 = r_op2 - p3 + vsubw.u8 q14, d4 ; r_op1 -= p2 + vaddw.u8 q14, d5 ; r_op1 += p1 + vaddw.u8 q14, d16 ; r_op1 += q1 + + vbif d2, d24, d20 ; op0 |= f_op0 & ~(flat & mask) + vqrshrn.u16 d31, q14, #3 ; r_op1 - vsubw.u8 q14, d3 ; op0 = op1 - p3 - vsubw.u8 q14, d5 ; op0 -= p1 - vaddw.u8 q14, d6 ; op0 += p0 - vaddw.u8 q14, d17 ; op0 += q2 - vqrshrn.u16 d21, q14, #3 ; r_op0 + vsubw.u8 q14, d3 ; r_op0 = r_op1 - p3 + vsubw.u8 q14, d5 ; r_op0 -= p1 + vaddw.u8 q14, d6 ; r_op0 += p0 + vaddw.u8 q14, d17 ; r_op0 += q2 + + vbit d0, d30, d20 ; op2 |= r_op2 & (flat & mask) + + vqrshrn.u16 d23, q14, #3 ; r_op0 + + vsubw.u8 q14, d3 ; r_oq0 = r_op0 - p3 + vsubw.u8 q14, d6 ; r_oq0 -= p0 + vaddw.u8 q14, d7 ; r_oq0 += q0 + + vbit d1, d31, d20 ; op1 |= r_op1 & (flat & mask) - vsubw.u8 q14, d3 ; oq0 = op0 - p3 - vsubw.u8 q14, d6 ; oq0 -= p0 - vaddw.u8 q14, d7 ; oq0 += q0 vaddw.u8 q14, d18 ; oq0 += q3 + + vbit d2, d23, d20 ; op0 |= r_op0 & (flat & mask) + vqrshrn.u16 d22, q14, #3 ; r_oq0 - vsubw.u8 q14, d4 ; oq1 = oq0 - p2 - vsubw.u8 q14, d7 ; oq1 -= q0 - vaddw.u8 q14, d16 ; oq1 += q1 - vaddw.u8 q14, d18 ; oq1 += q3 - vqrshrn.u16 d0, q14, #3 ; r_oq1 + vsubw.u8 q14, d4 ; r_oq1 = r_oq0 - p2 + vsubw.u8 q14, d7 ; r_oq1 -= q0 + vaddw.u8 q14, d16 ; r_oq1 += q1 - vsubw.u8 q14, d5 ; oq2 = oq0 - p1 - vsubw.u8 q14, d16 ; oq2 -= q1 - vaddw.u8 q14, d17 ; oq2 += q2 - vaddw.u8 q14, d18 ; oq2 += q3 - vqrshrn.u16 d1, q14, #3 ; r_oq2 + vbif d3, d21, d20 ; oq0 |= f_oq0 & ~(flat & mask) + + vaddw.u8 q14, d18 ; r_oq1 += q3 - ; Filter does not set op2 or oq2, so use p2 and q2. - vbit d2, d30, d20 ; op2 |= r_op2 & (flat & mask) - vbif d2, d4, d20 ; op2 |= op2 & ~(flat & mask) + vbif d4, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) - vbit d3, d31, d20 ; op1 |= r_op1 & (flat & mask) - vbif d3, d25, d20 ; op1 |= f_op1 & ~(flat & mask) + vqrshrn.u16 d6, q14, #3 ; r_oq1 - vbit d4, d21, d20 ; op0 |= r_op0 & (flat & mask) - vbif d4, d24, d20 ; op0 |= f_op0 & ~(flat & mask) + vsubw.u8 q14, d5 ; r_oq2 = r_oq1 - p1 + vsubw.u8 q14, d16 ; r_oq2 -= q1 + vaddw.u8 q14, d17 ; r_oq2 += q2 + vaddw.u8 q14, d18 ; r_oq2 += q3 - vbit d5, d22, d20 ; oq0 |= r_oq0 & (flat & mask) - vbif d5, d23, d20 ; oq0 |= f_oq0 & ~(flat & mask) + vbif d5, d17, d20 ; oq2 |= q2 & ~(flat & mask) - vbit d6, d0, d20 ; oq1 |= r_oq1 & (flat & mask) - vbif d6, d26, d20 ; oq1 |= f_oq1 & ~(flat & mask) + vqrshrn.u16 d7, q14, #3 ; r_oq2 - vbit d7, d1, d20 ; oq2 |= r_oq2 & (flat & mask) - vbif d7, d17, d20 ; oq2 |= oq2 & ~(flat & mask) + vbit d3, d22, d20 ; oq0 |= r_oq0 & (flat & mask) + vbit d4, d6, d20 ; oq1 |= r_oq1 & (flat & mask) + vbit d5, d7, d20 ; oq2 |= r_oq2 & (flat & mask) bx lr @@ -609,53 +657,49 @@ power_branch_only vmlal.u8 q14, d3, d27 ; op2 += p3 * 3 vmlal.u8 q14, d4, d21 ; op2 += p2 * 2 vaddw.u8 q14, d5 ; op2 += p1 - vqrshrn.u16 d2, q14, #3 ; op2 + vqrshrn.u16 d0, q14, #3 ; op2 vsubw.u8 q14, d3 ; op1 = op2 - p3 vsubw.u8 q14, d4 ; op1 -= p2 vaddw.u8 q14, d5 ; op1 += p1 vaddw.u8 q14, d16 ; op1 += q1 - vqrshrn.u16 d31, q14, #3 ; op1 + vqrshrn.u16 d1, q14, #3 ; op1 vsubw.u8 q14, d3 ; op0 = op1 - p3 vsubw.u8 q14, d5 ; op0 -= p1 vaddw.u8 q14, d6 ; op0 += p0 vaddw.u8 q14, d17 ; op0 += q2 - vqrshrn.u16 d21, q14, #3 ; op0 + vqrshrn.u16 d2, q14, #3 ; op0 vsubw.u8 q14, d3 ; oq0 = op0 - p3 vsubw.u8 q14, d6 ; oq0 -= p0 vaddw.u8 q14, d7 ; oq0 += q0 vaddw.u8 q14, d18 ; oq0 += q3 - vqrshrn.u16 d22, q14, #3 ; oq0 + vqrshrn.u16 d3, q14, #3 ; oq0 vsubw.u8 q14, d4 ; oq1 = oq0 - p2 vsubw.u8 q14, d7 ; oq1 -= q0 vaddw.u8 q14, d16 ; oq1 += q1 vaddw.u8 q14, d18 ; oq1 += q3 - vqrshrn.u16 d6, q14, #3 ; oq1 + vqrshrn.u16 d4, q14, #3 ; oq1 - vsubw.u8 q14, d5 ; oq2 = oq0 - p1 + vsubw.u8 q14, d5 ; oq2 = oq1 - p1 vsubw.u8 q14, d16 ; oq2 -= q1 vaddw.u8 q14, d17 ; oq2 += q2 vaddw.u8 q14, d18 ; oq2 += q3 - vqrshrn.u16 d7, q14, #3 ; oq2 - - vswp d3, d31 - vswp d4, d21 - vswp d5, d22 + vqrshrn.u16 d5, q14, #3 ; oq2 bx lr filter_branch_only ; TODO(fgalligan): See if we can rearange registers so we do not need to ; do the 2 vswp. - vswp d2, d4 ; op2 - vswp d7, d17 ; oq2 - veor d4, d24, d22 ; *op0 = u^0x80 - veor d5, d23, d22 ; *oq0 = u^0x80 - veor d3, d25, d22 ; *op1 = u^0x80 - veor d6, d26, d22 ; *oq1 = u^0x80 + vswp d0, d4 ; op2 + vswp d5, d17 ; oq2 + veor d2, d24, d22 ; *op0 = u^0x80 + veor d3, d21, d22 ; *oq0 = u^0x80 + veor d1, d25, d22 ; *op1 = u^0x80 + veor d4, d26, d22 ; *oq1 = u^0x80 bx lr diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index 0a4f921c2..554a31730 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -53,7 +53,6 @@ void vp9_free_frame_buffers(VP9_COMMON *oci) { for (i = 0; i < NUM_YV12_BUFFERS; i++) vp9_free_frame_buffer(&oci->yv12_fb[i]); - vp9_free_frame_buffer(&oci->temp_scale_frame); vp9_free_frame_buffer(&oci->post_proc_buffer); vpx_free(oci->mip); @@ -121,10 +120,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { oci->fb_idx_ref_cnt[i] = 1; } - if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) - goto fail; - if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y, VP9BORDERINPIXELS) < 0) goto fail; diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c index d5b51e89d..dee44ec63 100644 --- a/vp9/common/vp9_common_data.c +++ b/vp9/common/vp9_common_data.c @@ -17,11 +17,54 @@ const int b_width_log2_lookup[BLOCK_SIZE_TYPES] = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4}; const int b_height_log2_lookup[BLOCK_SIZE_TYPES] = {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4}; +const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES] = + {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16}; +const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES] = + {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16}; // Log 2 conversion lookup tables for modeinfo width and height const int mi_width_log2_lookup[BLOCK_SIZE_TYPES] = {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; +const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES] = + {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] = {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3}; +const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] = + {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; + +const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { + { // 4X4 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID + }, { // 8X8 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID + }, { // 16X16 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID + }, { // 32X32 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, + PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, + PARTITION_INVALID, PARTITION_INVALID + }, { // 64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, + PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, + PARTITION_NONE + } +}; const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = { { // PARTITION_NONE diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h index 52c314897..8b0f8a500 100644 --- a/vp9/common/vp9_common_data.h +++ b/vp9/common/vp9_common_data.h @@ -17,6 +17,14 @@ extern const int b_width_log2_lookup[BLOCK_SIZE_TYPES]; extern const int b_height_log2_lookup[BLOCK_SIZE_TYPES]; extern const int mi_width_log2_lookup[BLOCK_SIZE_TYPES]; extern const int mi_height_log2_lookup[BLOCK_SIZE_TYPES]; +extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES]; +extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES]; +extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES]; +extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES]; +extern const PARTITION_TYPE + partition_lookup[][BLOCK_SIZE_TYPES]; + + extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES]; extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES]; extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES]; diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 855c5e3de..86f0d0bfd 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -35,7 +35,7 @@ typedef enum BLOCK_SIZE_TYPE { BLOCK_SIZE_SB32X64, BLOCK_32X64 = BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32, BLOCK_64X32 = BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64, BLOCK_64X64 = BLOCK_SIZE_SB64X64, - BLOCK_SIZE_TYPES, BLOCK_MAX_SB_SEGMENTS = BLOCK_SIZE_TYPES, + BLOCK_SIZE_TYPES, BLOCK_MAX_SB_SEGMENTS = BLOCK_SIZE_TYPES } BLOCK_SIZE_TYPE; typedef enum PARTITION_TYPE { @@ -43,7 +43,7 @@ typedef enum PARTITION_TYPE { PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT, - PARTITION_TYPES + PARTITION_TYPES, PARTITION_INVALID = PARTITION_TYPES } PARTITION_TYPE; #define PARTITION_PLOFFSET 4 // number of probability models per block size diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index d8be8765a..5498b1717 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -87,7 +87,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, lf->last_sharpness_level = lf->sharpness_level; } - for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) { + for (seg = 0; seg < MAX_SEGMENTS; seg++) { int lvl_seg = default_filt_lvl, ref, mode, intra_lvl; // Set the baseline filter values for each segment diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index fddf2ce82..e59cc6485 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -31,7 +31,7 @@ typedef struct { lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[4][SIMD_WIDTH]); - uint8_t lvl[MAX_MB_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; + uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS]; uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h index fe8122b46..152046f6f 100644 --- a/vp9/common/vp9_onyx.h +++ b/vp9/common/vp9_onyx.h @@ -22,7 +22,7 @@ extern "C" #include "vpx_scale/yv12config.h" #include "vp9/common/vp9_ppflags.h" -#define MAX_MB_SEGMENTS 8 +#define MAX_SEGMENTS 8 typedef int *VP9_PTR; @@ -200,9 +200,9 @@ extern "C" int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, - int delta_q[MAX_MB_SEGMENTS], - int delta_lf[MAX_MB_SEGMENTS], - unsigned int threshold[MAX_MB_SEGMENTS]); + int delta_q[MAX_SEGMENTS], + int delta_lf[MAX_SEGMENTS], + unsigned int threshold[MAX_SEGMENTS]); int vp9_set_active_map(VP9_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols); diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 2efdf8fa3..8b76ac711 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -130,10 +130,7 @@ typedef struct VP9Common { struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; int new_fb_idx; - YV12_BUFFER_CONFIG post_proc_buffer; - YV12_BUFFER_CONFIG temp_scale_frame; - FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ FRAME_TYPE frame_type; diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index ea2b0f418..71fca4cb9 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -445,6 +445,6 @@ int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids, segment_id = MIN(segment_id, segment_ids[mi_offset + y * cm->mi_cols + x]); - assert(segment_id >= 0 && segment_id < MAX_MB_SEGMENTS); + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); return segment_id; } diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h index f072a518d..f22239b92 100644 --- a/vp9/common/vp9_seg_common.h +++ b/vp9/common/vp9_seg_common.h @@ -16,8 +16,8 @@ #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 -#define MAX_MB_SEGMENTS 8 -#define MB_SEG_TREE_PROBS (MAX_MB_SEGMENTS-1) +#define MAX_SEGMENTS 8 +#define SEG_TREE_PROBS (MAX_SEGMENTS-1) #define PREDICTION_PROBS 3 @@ -27,7 +27,7 @@ typedef enum { SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode - SEG_LVL_MAX = 4 // Number of MB level features supported + SEG_LVL_MAX = 4 // Number of features supported } SEG_LVL_FEATURES; @@ -38,11 +38,11 @@ struct segmentation { uint8_t abs_delta; uint8_t temporal_update; - vp9_prob tree_probs[MB_SEG_TREE_PROBS]; + vp9_prob tree_probs[SEG_TREE_PROBS]; vp9_prob pred_probs[PREDICTION_PROBS]; - int16_t feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX]; - unsigned int feature_mask[MAX_MB_SEGMENTS]; + int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX]; + unsigned int feature_mask[MAX_SEGMENTS]; }; int vp9_segfeature_active(const struct segmentation *seg, diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 6660f5b8e..0fdba805d 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -86,7 +86,7 @@ static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE_TYPE bsize, const int ymis = MIN(cm->mi_rows - mi_row, bh); int x, y; - assert(segment_id >= 0 && segment_id < MAX_MB_SEGMENTS); + assert(segment_id >= 0 && segment_id < MAX_SEGMENTS); for (y = 0; y < ymis; y++) for (x = 0; x < xmis; x++) diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 61c14b8dc..6f7908ffc 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -400,7 +400,7 @@ static void setup_segmentation(struct segmentation *seg, // Segmentation map update seg->update_map = vp9_rb_read_bit(rb); if (seg->update_map) { - for (i = 0; i < MB_SEG_TREE_PROBS; i++) + for (i = 0; i < SEG_TREE_PROBS; i++) seg->tree_probs[i] = vp9_rb_read_bit(rb) ? vp9_rb_read_literal(rb, 8) : MAX_PROB; @@ -422,7 +422,7 @@ static void setup_segmentation(struct segmentation *seg, vp9_clearall_segfeatures(seg); - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { int data = 0; const int feature_enabled = vp9_rb_read_bit(rb); diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 07cb2b83e..2fede1580 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -459,10 +459,10 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, write_intra_mode(bc, mode, pc->fc.y_mode_prob[MIN(3, bsl)]); } else { int idx, idy; - int bw = 1 << b_width_log2(mi->sb_type); - int bh = 1 << b_height_log2(mi->sb_type); - for (idy = 0; idy < 2; idy += bh) - for (idx = 0; idx < 2; idx += bw) { + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mi->sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mi->sb_type]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode; write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]); } @@ -498,11 +498,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, int j; MB_PREDICTION_MODE blockmode; int_mv blockmv; - int bwl = b_width_log2(mi->sb_type), bw = 1 << bwl; - int bhl = b_height_log2(mi->sb_type), bh = 1 << bhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mi->sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mi->sb_type]; int idx, idy; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { j = idy * 2 + idx; blockmode = cpi->mb.partition_info->bmi[j].mode; blockmv = m->bmi[j].as_mv[0]; @@ -563,10 +563,10 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]); } else { int idx, idy; - int bw = 1 << b_width_log2(m->mbmi.sb_type); - int bh = 1 << b_height_log2(m->mbmi.sb_type); - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[m->mbmi.sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { int i = idy * 2 + idx; const MB_PREDICTION_MODE A = above_block_mode(m, i, mis); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? @@ -619,7 +619,6 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; - int bwl, bhl; int bsl = b_width_log2(bsize); int bs = (1 << bsl) / 4; // mode_info step for subsize int n; @@ -629,20 +628,7 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - bwl = b_width_log2(m->mbmi.sb_type); - bhl = b_height_log2(m->mbmi.sb_type); - - // parse the partition type - if ((bwl == bsl) && (bhl == bsl)) - partition = PARTITION_NONE; - else if ((bwl == bsl) && (bhl < bsl)) - partition = PARTITION_HORZ; - else if ((bwl < bsl) && (bhl == bsl)) - partition = PARTITION_VERT; - else if ((bwl < bsl) && (bhl < bsl)) - partition = PARTITION_SPLIT; - else - assert(0); + partition = partition_lookup[bsl][m->mbmi.sb_type]; if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index > 0) @@ -1011,7 +997,7 @@ static void encode_segmentation(VP9_COMP *cpi, // Select the coding strategy (temporal or spatial) vp9_choose_segmap_coding_method(cpi); // Write out probabilities used to decode unpredicted macro-block segments - for (i = 0; i < MB_SEG_TREE_PROBS; i++) { + for (i = 0; i < SEG_TREE_PROBS; i++) { const int prob = seg->tree_probs[i]; const int update = prob != MAX_PROB; vp9_wb_write_bit(wb, update); @@ -1037,7 +1023,7 @@ static void encode_segmentation(VP9_COMP *cpi, if (seg->update_data) { vp9_wb_write_bit(wb, seg->abs_delta); - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { for (j = 0; j < SEG_LVL_MAX; j++) { const int active = vp9_segfeature_active(seg, i, j); vp9_wb_write_bit(wb, active); diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index d6882d585..4b49b17a2 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -96,6 +96,7 @@ struct macroblock { signed int act_zbin_adj; int mv_best_ref_index[MAX_REF_FRAMES]; + unsigned int max_mv_context[MAX_REF_FRAMES]; int nmvjointcost[MV_JOINTS]; int nmvcosts[2][MV_VALS]; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 2e7cb291d..502308766 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -323,7 +323,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, int mb_mode_index = ctx->best_mode_index; const int mis = cpi->common.mode_info_stride; - const int bh = 1 << mi_height_log2(bsize), bw = 1 << mi_width_log2(bsize); + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; assert(mi->mbmi.mode < MB_MODE_COUNT); assert(mb_mode_index < MAX_MODES); @@ -333,10 +334,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, // Restore the coding context of the MB to that that was in place // when the mode was picked for it - for (y = 0; y < bh; y++) { - for (x_idx = 0; x_idx < bw; x_idx++) { - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx - && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) { + for (y = 0; y < mi_height; y++) { + for (x_idx = 0; x_idx < mi_width; x_idx++) { + if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > x_idx + && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > y) { MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis; *mi_addr = *mi; } @@ -412,10 +413,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, if (bsize > BLOCK_SIZE_SB8X8 && mbmi->mode == NEWMV) { int i, j; - for (j = 0; j < bh; ++j) - for (i = 0; i < bw; ++i) - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i - && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j) + for (j = 0; j < mi_height; ++j) + for (i = 0; i < mi_width; ++i) + if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + mi_width > i + && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + mi_height > j) xd->mode_info_context[mis * j + i].mbmi = *mbmi; } @@ -459,7 +460,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, MB_MODE_INFO *mbmi; const int dst_fb_idx = cm->new_fb_idx; const int idx_str = xd->mode_info_stride * mi_row + mi_col; - const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize); + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mb_row = mi_row >> 1; const int mb_col = mi_col >> 1; const int idx_map = mb_row * cm->mb_cols + mb_col; @@ -496,13 +498,13 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE - + (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND)); + + (VP9BORDERINPIXELS - MI_SIZE * mi_height - VP9_INTERP_EXTEND)); x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE - + (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND)); + + (VP9BORDERINPIXELS - MI_SIZE * mi_width - VP9_INTERP_EXTEND)); // Set up distance of MB to edge of frame in 1/8th pel units - assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1))); - set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw); + assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width); /* set up source buffers */ vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); @@ -676,23 +678,27 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; int p; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; - int mwl = mi_width_log2(bsize), mw = 1 << mwl; - int mhl = mi_height_log2(bsize), mh = 1 << mhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; for (p = 0; p < MAX_MB_PLANE; p++) { vpx_memcpy( cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), - a + bw * p, sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); + a + num_4x4_blocks_wide * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); vpx_memcpy( cm->left_context[p] - + ((mi_row & MI_MASK)* 2 >> xd->plane[p].subsampling_y),l + bh * p, - sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); - } + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + l + num_4x4_blocks_high * p, + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } vpx_memcpy(cm->above_seg_context + mi_col, sa, - sizeof(PARTITION_CONTEXT) * mw); + sizeof(PARTITION_CONTEXT) * mi_width); vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl, - sizeof(PARTITION_CONTEXT) * mh); + sizeof(PARTITION_CONTEXT) * mi_height); } static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], @@ -703,27 +709,30 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; int p; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; - int mwl = mi_width_log2(bsize), mw = 1 << mwl; - int mhl = mi_height_log2(bsize), mh = 1 << mhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; // buffer the above/left context information of the block in search. for (p = 0; p < MAX_MB_PLANE; ++p) { vpx_memcpy( - a + bw * p, + a + num_4x4_blocks_wide * p, cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >> + xd->plane[p].subsampling_x); vpx_memcpy( - l + bh * p, + l + num_4x4_blocks_high * p, cm->left_context[p] - + ((mi_row & MI_MASK)* 2 >> xd->plane[p].subsampling_y),sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); - } + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >> + xd->plane[p].subsampling_y); + } vpx_memcpy(sa, cm->above_seg_context + mi_col, - sizeof(PARTITION_CONTEXT) * mw); + sizeof(PARTITION_CONTEXT) * mi_width); vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK), - sizeof(PARTITION_CONTEXT) * mh) - ;} + sizeof(PARTITION_CONTEXT) * mi_height); +} static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) { @@ -759,8 +768,10 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, MACROBLOCKD * const xd = &x->e_mbd; BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8; const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4; - int bwl, bhl; int UNINITIALIZED_IS_SAFE(pl); + PARTITION_TYPE partition; + BLOCK_SIZE_TYPE subsize; + int i; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -771,44 +782,46 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, pl = partition_plane_context(xd, bsize); c1 = *(get_sb_partitioning(x, bsize)); } + partition = partition_lookup[bsl][c1]; - bwl = b_width_log2(c1), bhl = b_height_log2(c1); - - if (bsl == bwl && bsl == bhl) { - if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) - cpi->partition_count[pl][PARTITION_NONE]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); - } else if (bsl == bhl && bsl > bwl) { - if (output_enabled) - cpi->partition_count[pl][PARTITION_VERT]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); - } else if (bsl == bwl && bsl > bhl) { - if (output_enabled) - cpi->partition_count[pl][PARTITION_HORZ]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); - encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); - } else { - BLOCK_SIZE_TYPE subsize; - int i; - - assert(bwl < bsl && bhl < bsl); - subsize = get_subsize(bsize, PARTITION_SPLIT); + switch (partition) { + case PARTITION_NONE: + if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) + cpi->partition_count[pl][PARTITION_NONE]++; + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); + break; + case PARTITION_VERT: + if (output_enabled) + cpi->partition_count[pl][PARTITION_VERT]++; + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); + break; + case PARTITION_HORZ: + if (output_enabled) + cpi->partition_count[pl][PARTITION_HORZ]++; + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); + break; + case PARTITION_SPLIT: + subsize = get_subsize(bsize, PARTITION_SPLIT); - if (output_enabled) - cpi->partition_count[pl][PARTITION_SPLIT]++; + if (output_enabled) + cpi->partition_count[pl][PARTITION_SPLIT]++; - for (i = 0; i < 4; i++) { - const int x_idx = i & 1, y_idx = i >> 1; + for (i = 0; i < 4; i++) { + const int x_idx = i & 1, y_idx = i >> 1; - *(get_sb_index(xd, subsize)) = i; - encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, - output_enabled, subsize); - } + *(get_sb_index(xd, subsize)) = i; + encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, + output_enabled, subsize); + } + break; + default: + assert(0); + break; } - if (bsize >= BLOCK_SIZE_SB8X8 - && (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) { + if (partition != PARTITION_SPLIT || bsize == BLOCK_SIZE_SB8X8) { set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, c1, bsize); } @@ -1159,13 +1172,11 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; - int bwl = b_width_log2(m->mbmi.sb_type); - int bhl = b_height_log2(m->mbmi.sb_type); int bsl = b_width_log2(bsize); - int bs = (1 << bsl); - int bh = (1 << bhl); - int ms = bs / 2; - int mh = bh / 2; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + int ms = num_4x4_blocks_wide / 2; + int mh = num_4x4_blocks_high / 2; int bss = (1 << bsl) / 4; int i, pl; PARTITION_TYPE partition = PARTITION_NONE; @@ -1187,17 +1198,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - // parse the partition type - if ((bwl == bsl) && (bhl == bsl)) - partition = PARTITION_NONE; - else if ((bwl == bsl) && (bhl < bsl)) - partition = PARTITION_HORZ; - else if ((bwl < bsl) && (bhl == bsl)) - partition = PARTITION_VERT; - else if ((bwl < bsl) && (bhl < bsl)) - partition = PARTITION_SPLIT; - else - assert(0); + partition = partition_lookup[bsl][bs_type]; subsize = get_subsize(bsize, partition); @@ -1340,8 +1341,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, // Split partition. for (i = 0; i < 4; i++) { - int x_idx = (i & 1) * (bs >> 2); - int y_idx = (i >> 1) * (bs >> 2); + int x_idx = (i & 1) * (num_4x4_blocks_wide >> 2); + int y_idx = (i >> 1) * (num_4x4_blocks_wide >> 2); int rt = 0; int64_t dt = 0; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; @@ -2468,10 +2469,12 @@ static void sum_intra_stats(VP9_COMP *cpi, MACROBLOCK *x) { ++cpi->y_mode_count[MIN(bsl, 3)][m]; } else { int idx, idy; - int bw = 1 << b_width_log2(xd->mode_info_context->mbmi.sb_type); - int bh = 1 << b_height_log2(xd->mode_info_context->mbmi.sb_type); - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[ + xd->mode_info_context->mbmi.sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[ + xd->mode_info_context->mbmi.sb_type]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { int m = xd->mode_info_context->bmi[idy * 2 + idx].as_mode; ++cpi->y_mode_count[0][m]; } @@ -2509,8 +2512,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, MB_MODE_INFO *mbmi = &mi->mbmi; unsigned int segment_id = mbmi->segment_id; const int mis = cm->mode_info_stride; - const int bwl = mi_width_log2(bsize); - const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize); + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; x->rd_search = 0; x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH); @@ -2635,8 +2638,8 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, sz = TX_4X4; } - for (y = 0; y < bh; y++) { - for (x = 0; x < bw; x++) { + for (y = 0; y < mi_height; y++) { + for (x = 0; x < mi_width; x++) { if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows) { mi[mis * y + x].mbmi.txfm_size = sz; } diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 6a918926d..710417948 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -441,7 +441,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, TX_TYPE tx_type; const int16_t *scan, *iscan; uint16_t *eob = &pd->eobs[block]; - const int bwl = b_width_log2(bsize) - pd->subsampling_x, bw = 1 << bwl; + const int bwl = plane_block_width_log2by4(bsize, pd), bw = 1 << bwl; const int twl = bwl - tx_size, twmask = (1 << twl) - 1; int xoff, yoff; int16_t *src_diff; @@ -533,6 +533,8 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, if (x->skip_encode) return; + if (pd->eobs[block] == 0) + return; switch (ss_txfrm_size / 2) { case TX_32X32: @@ -657,7 +659,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); - if (!x->skip_encode) + if (!x->skip_encode && *eob) vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); break; case TX_16X16: @@ -682,7 +684,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); - if (!x->skip_encode) { + if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); else @@ -711,7 +713,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); - if (!x->skip_encode) { + if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); else @@ -743,7 +745,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); - if (!x->skip_encode) { + if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index 500f57442..f0c34b373 100644 --- a/vp9/encoder/vp9_encodemv.c +++ b/vp9/encoder/vp9_encodemv.c @@ -486,11 +486,11 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, if (mv_joint_horizontal(j)) encode_mv_component(w, diff.col, &mvctx->comps[1], usehp); - // If auto_mv_step_size is enabled and it is an arf/non shown frame - // then keep track of the largest motion vector component used. - if (cpi->sf.auto_mv_step_size && !cpi->common.show_frame) { - cpi->max_mv_magnitude = MAX((MAX(abs(mv->row), abs(mv->col)) >> 3), - cpi->max_mv_magnitude); + // If auto_mv_step_size is enabled then keep track of the largest + // motion vector component used. + if (!cpi->dummy_packing && cpi->sf.auto_mv_step_size) { + unsigned int maxv = MAX(abs(mv->row), abs(mv->col)) >> 3; + cpi->max_mv_magnitude = MAX(maxv, cpi->max_mv_magnitude); } } @@ -513,14 +513,14 @@ void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, MODE_INFO *mi = x->e_mbd.mode_info_context; MB_MODE_INFO *const mbmi = &mi->mbmi; MV diff; - const int bw = 1 << b_width_log2(mbmi->sb_type); - const int bh = 1 << b_height_log2(mbmi->sb_type); + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; int idx, idy; if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { PARTITION_INFO *pi = x->partition_info; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const int i = idy * 2 + idx; if (pi->bmi[i].mode == NEWMV) { diff.row = mi->bmi[i].as_mv[0].as_mv.row - best_ref_mv->as_mv.row; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 5b7bed463..0be98913e 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -19,11 +19,13 @@ #include "vp9/common/vp9_findnearmv.h" #include "vp9/common/vp9_common.h" +// #define NEW_DIAMOND_SEARCH + void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.col & 7) ? 1 : 0); + ((ref_mv->as_mv.col & 7) ? 1 : 0); int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.row & 7) ? 1 : 0); + ((ref_mv->as_mv.row & 7) ? 1 : 0); int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL; int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL; @@ -1511,12 +1513,13 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, this_row_offset = best_mv->as_mv.row + ss[i].mv.row; this_col_offset = best_mv->as_mv.col + ss[i].mv.col; - if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max)) - - { + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { check_here = ss[i].offset + best_address; - thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); if (thissad < bestsad) { this_mv.as_mv.row = this_row_offset; @@ -1539,6 +1542,34 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, best_mv->as_mv.col += ss[best_site].mv.col; best_address += ss[best_site].offset; last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row; + this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col; + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = ss[best_site].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + mvjsadcost, mvsadcost, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_mv->as_mv.row += ss[best_site].mv.row; + best_mv->as_mv.col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + continue; + } + } + } + break; + }; +#endif } else if (best_address == in_what) (*num00)++; } @@ -1680,12 +1711,39 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, i++; } } - if (best_site != last_site) { best_mv->as_mv.row += ss[best_site].mv.row; best_mv->as_mv.col += ss[best_site].mv.col; best_address += ss[best_site].offset; last_site = best_site; +#if defined(NEW_DIAMOND_SEARCH) + while (1) { + this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row; + this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col; + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = ss[best_site].offset + best_address; + thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, + bestsad); + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, + mvjsadcost, mvsadcost, sad_per_bit); + if (thissad < bestsad) { + bestsad = thissad; + best_mv->as_mv.row += ss[best_site].mv.row; + best_mv->as_mv.col += ss[best_site].mv.col; + best_address += ss[best_site].offset; + continue; + } + } + } + break; + }; +#endif } else if (best_address == in_what) (*num00)++; } @@ -1706,6 +1764,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ + int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x, int_mv *mvp_full, int step_param, int sadpb, int further_steps, diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index fe276fa6b..7b50e076e 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -706,12 +706,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mode_chosen_counts[i] = 0; } - // Initialize cpi->max_mv_magnitude if appropriate. - if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only || - (cpi->common.show_frame == 0)) { - cpi->max_mv_magnitude = 0; - } - // best quality defaults sf->RD = 1; sf->search_method = NSTEP; @@ -773,7 +767,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { #else sf->static_segmentation = 0; #endif - sf->auto_mv_step_size = 1; sf->use_avoid_tested_higherror = 1; sf->adaptive_rd_thresh = 1; sf->last_chroma_intra_mode = TM_PRED; @@ -798,6 +791,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->last_chroma_intra_mode = H_PRED; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; + sf->auto_mv_step_size = 1; } if (speed == 2) { sf->adjust_thresholds_by_speed = 1; @@ -824,6 +818,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->using_small_partition_info = 1; sf->disable_splitmv = (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; + sf->auto_mv_step_size = 1; } if (speed == 3) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; @@ -840,6 +835,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; sf->disable_splitmv = 1; + sf->auto_mv_step_size = 1; } if (speed == 4) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; @@ -856,6 +852,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_SKIP_COMP_REFMISMATCH; sf->use_rd_breakout = 1; sf->optimize_coefficients = 0; + sf->auto_mv_step_size = 1; // sf->reduce_first_step_size = 1; // sf->reference_masking = 1; @@ -1222,7 +1219,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { { int i; - for (i = 0; i < MAX_MB_SEGMENTS; i++) + for (i = 0; i < MAX_SEGMENTS; i++) cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } @@ -2515,6 +2512,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, int undershoot_seen = 0; SPEED_FEATURES *sf = &cpi->sf; + unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); #if RESET_FOREACH_FILTER int q_low0; int q_high0; @@ -2587,6 +2585,24 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Set default state for segment based loop filter update flags xd->lf.mode_ref_delta_update = 0; + // Initialize cpi->mv_step_param to default based on max resolution + cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); + // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. + if (sf->auto_mv_step_size) { + if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) { + // initialize max_mv_magnitude for use in the first INTER frame + // after a key/intra-only frame + cpi->max_mv_magnitude = max_mv_def; + } else { + if (cm->show_frame) + // allow mv_steps to correspond to twice the max mv magnitude found + // in the previous frame, capped by the default max_mv_magnitude based + // on resolution + cpi->mv_step_param = vp9_init_search_range( + cpi, MIN(max_mv_def, 2 * cpi->max_mv_magnitude)); + cpi->max_mv_magnitude = 0; + } + } // Set various flags etc to special state if it is a key frame if (cm->frame_type == KEY_FRAME) { @@ -3444,15 +3460,24 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->last_width = cm->width; cm->last_height = cm->height; - // Don't increment frame counters if this was an altref buffer - // update not a real frame + // reset to normal state now that we are done. cm->last_show_frame = cm->show_frame; if (cm->show_frame) { + // current mip will be the prev_mip for the next frame + MODE_INFO *temp = cm->prev_mip; + cm->prev_mip = cm->mip; + cm->mip = temp; + + // update the upper left visible macroblock ptrs + cm->mi = cm->mip + cm->mode_info_stride + 1; + + // Don't increment frame counters if this was an altref buffer + // update not a real frame ++cm->current_video_frame; ++cpi->frames_since_key; } - - // reset to normal state now that we are done. + // restore prev_mi + cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; #if 0 { @@ -3470,17 +3495,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_write_yuv_rec_frame(cm); #endif - if (cm->show_frame) { - vpx_memcpy(cm->prev_mip, cm->mip, - cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE) * - sizeof(MODE_INFO)); - } else { - vpx_memset(cm->prev_mip, 0, - cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE) * - sizeof(MODE_INFO)); - } - // restore prev_mi - cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; } static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, @@ -3973,11 +3987,11 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, } int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[MAX_MB_SEGMENTS], - int delta_lf[MAX_MB_SEGMENTS], - unsigned int threshold[MAX_MB_SEGMENTS]) { + unsigned int cols, int delta_q[MAX_SEGMENTS], + int delta_lf[MAX_SEGMENTS], + unsigned int threshold[MAX_SEGMENTS]) { VP9_COMP *cpi = (VP9_COMP *) comp; - signed char feature_data[SEG_LVL_MAX][MAX_MB_SEGMENTS]; + signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS]; MACROBLOCKD *xd = &cpi->mb.e_mbd; int i; @@ -3996,14 +4010,14 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, vp9_enable_segmentation((VP9_PTR)cpi); // Set up the quan, LF and breakout threshold segment data - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { feature_data[SEG_LVL_ALT_Q][i] = delta_q[i]; feature_data[SEG_LVL_ALT_LF][i] = delta_lf[i]; cpi->segment_encode_breakout[i] = threshold[i]; } // Enable the loop and quant changes in the feature mask - for (i = 0; i < MAX_MB_SEGMENTS; i++) { + for (i = 0; i < MAX_SEGMENTS; i++) { if (delta_q[i]) vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q); else diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 2c65fecd1..0798927bd 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -504,6 +504,7 @@ typedef struct VP9_COMP { int error_bins[1024]; unsigned int max_mv_magnitude; + int mv_step_param; // Data used for real time conferencing mode to help determine if it would be good to update the gf int inter_zz_count; @@ -513,7 +514,7 @@ typedef struct VP9_COMP { unsigned char *segmentation_map; // segment threashold for encode breakout - int segment_encode_breakout[MAX_MB_SEGMENTS]; + int segment_encode_breakout[MAX_SEGMENTS]; unsigned char *active_map; unsigned int active_map_enabled; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index d52091c70..9c6f9f8db 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -463,10 +463,8 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, BLOCK_SIZE_TYPE bs = BLOCK_SIZE_AB4X4; struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &xd->plane[0]; - const int bwl = plane_block_width_log2by4(bsize, pd); - const int bhl = plane_block_height_log2by4(bsize, pd); - const int bw = 4 << bwl; - const int bh = 4 << bhl; + const int width = plane_block_width(bsize, pd); + const int height = plane_block_height(bsize, pd); int rate_sum = 0; int64_t dist_sum = 0; @@ -485,10 +483,9 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, } else { assert(0); } - assert(bs <= get_block_size(bwl, bhl)); *out_skip = 1; - for (j = 0; j < bh; j+=t) { - for (k = 0; k < bw; k+=t) { + for (j = 0; j < height; j += t) { + for (k = 0; k < width; k += t) { int rate; int64_t dist; unsigned int sse; @@ -711,8 +708,8 @@ static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize, static int rdcost_plane(VP9_COMMON * const cm, MACROBLOCK *x, int plane, BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { MACROBLOCKD * const xd = &x->e_mbd; - const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; + const int bwl = plane_block_width_log2by4(bsize, &xd->plane[plane]); + const int bhl = plane_block_height_log2by4(bsize, &xd->plane[plane]); const int bw = 1 << bwl, bh = 1 << bhl; struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, 0, 0, 0, INT64_MAX, 0 }; @@ -802,8 +799,8 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[0]; - const int bwl = b_width_log2(bsize) - xd->plane[0].subsampling_x; - const int bhl = b_height_log2(bsize) - xd->plane[0].subsampling_y; + const int bwl = plane_block_width_log2by4(bsize, pd); + const int bhl = plane_block_height_log2by4(bsize, pd); const int bw = 1 << bwl, bh = 1 << bhl; struct rdcost_block_args args = { cm, x, { 0 }, { 0 }, tx_size, bw, bh, 0, 0, 0, ref_best_rd, 0 }; @@ -1185,8 +1182,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT tl[2], templ[2]; TX_TYPE tx_type = DCT_DCT; TX_TYPE best_tx_type = DCT_DCT; - int bw = 1 << b_width_log2(bsize); - int bh = 1 << b_height_log2(bsize); + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy, block; DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]); @@ -1212,8 +1209,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vpx_memcpy(tempa, ta, sizeof(ta)); vpx_memcpy(templ, tl, sizeof(tl)); - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { int64_t ssz; block = ib + idy * 2 + idx; @@ -1270,8 +1267,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, best_tx_type = tx_type; vpx_memcpy(a, tempa, sizeof(tempa)); vpx_memcpy(l, templ, sizeof(templ)); - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { block = ib + idy * 2 + idx; vpx_memcpy(best_dqcoeff[idy * 2 + idx], BLOCK_OFFSET(pd->dqcoeff, block, 16), @@ -1284,8 +1281,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, if (x->skip_encode) return best_rd; - for (idy = 0; idy < bh; ++idy) { - for (idx = 0; idx < bw; ++idx) { + for (idy = 0; idy < num_4x4_blocks_high; ++idy) { + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { block = ib + idy * 2 + idx; xd->mode_info_context->bmi[block].as_mode = *best_mode; src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, @@ -1317,8 +1314,8 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int i, j; MACROBLOCKD *const xd = &mb->e_mbd; BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - int bw = 1 << b_width_log2(bsize); - int bh = 1 << b_height_log2(bsize); + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; int cost = 0; int64_t distortion = 0; @@ -1333,8 +1330,8 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, bmode_costs = mb->mbmode_cost; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const int mis = xd->mode_info_stride; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry); @@ -1357,9 +1354,9 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, tot_rate_y += ry; mic->bmi[i].as_mode = best_mode; - for (j = 1; j < bh; ++j) + for (j = 1; j < num_4x4_blocks_high; ++j) mic->bmi[i + j * 2].as_mode = best_mode; - for (j = 1; j < bw; ++j) + for (j = 1; j < num_4x4_blocks_wide; ++j) mic->bmi[i + j].as_mode = best_mode; if (total_rd >= best_rd) @@ -1599,8 +1596,8 @@ static int labels2mode(MACROBLOCK *x, int i, MB_MODE_INFO * mbmi = &mic->mbmi; int cost = 0, thismvcost = 0; int idx, idy; - int bw = 1 << b_width_log2(mbmi->sb_type); - int bh = 1 << b_height_log2(mbmi->sb_type); + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; /* We have to be careful retrieving previously-encoded motion vectors. Ones from this macroblock have to be pulled from the BLOCKD array @@ -1650,8 +1647,8 @@ static int labels2mode(MACROBLOCK *x, int i, mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int; x->partition_info->bmi[i].mode = m; - for (idy = 0; idy < bh; ++idy) - for (idx = 0; idx < bw; ++idx) + for (idy = 0; idy < num_4x4_blocks_high; ++idy) + for (idx = 0; idx < num_4x4_blocks_wide; ++idx) vpx_memcpy(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i])); @@ -1671,10 +1668,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - const int bwl = plane_block_width_log2by4(bsize, &xd->plane[0]); - const int bhl = plane_block_height_log2by4(bsize, &xd->plane[0]); - const int bw = 4 << bwl; - const int bh = 4 << bhl; + const int width = plane_block_width(bsize, &xd->plane[0]); + const int height = plane_block_height(bsize, &xd->plane[0]); int idx, idy; const int src_stride = x->plane[0].src.stride; uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, @@ -1698,7 +1693,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, xd->plane[0].dst.stride, &xd->mode_info_context->bmi[i].as_mv[0], &xd->scale_factor[0], - bw, bh, 0 /* no avg */, &xd->subpix, + width, height, 0, &xd->subpix, MV_PRECISION_Q3); if (xd->mode_info_context->mbmi.ref_frame[1] > 0) { @@ -1709,17 +1704,18 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride, dst, xd->plane[0].dst.stride, &xd->mode_info_context->bmi[i].as_mv[1], - &xd->scale_factor[1], bw, bh, 1, + &xd->scale_factor[1], + width, height, 1, &xd->subpix, MV_PRECISION_Q3); } - vp9_subtract_block(bh, bw, src_diff, 8, + vp9_subtract_block(height, width, src_diff, 8, src, src_stride, dst, xd->plane[0].dst.stride); k = i; - for (idy = 0; idy < bh / 4; ++idy) { - for (idx = 0; idx < bw / 4; ++idx) { + for (idy = 0; idy < height / 4; ++idy) { + for (idx = 0; idx < width / 4; ++idx) { int64_t ssz, rd, rd1, rd2; k += (idy * 2 + idx); @@ -1825,8 +1821,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int label_mv_thresh; int segmentyrate = 0; BLOCK_SIZE_TYPE bsize = mbmi->sb_type; - int bwl = b_width_log2(bsize), bw = 1 << bwl; - int bhl = b_height_log2(bsize), bh = 1 << bhl; + int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; vp9_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT t_above[2], t_left[2]; BEST_SEG_INFO *bsi = bsi_buf + filter_idx; @@ -1836,7 +1832,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above)); vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left)); - v_fn_ptr = &cpi->fn_ptr[get_block_size(bwl, bhl)]; + v_fn_ptr = &cpi->fn_ptr[bsize]; // 64 makes this threshold really big effectively // making it so that we very rarely check mvs on @@ -1845,8 +1841,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, label_mv_thresh = 1 * bsi->mvthresh / label_count; // Segmentation method overheads - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { + for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { // TODO(jingning,rbultje): rewrite the rate-distortion optimization // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT]; @@ -1940,9 +1936,24 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, if (i == 2) bsi->mvp.as_int = x->e_mbd.mode_info_context->bmi[i - 2].as_mv[0].as_int; - step_param = 2; } } + if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and the best ref mvs of the current block for + // the given reference. + if (i == 0) + step_param = (vp9_init_search_range( + cpi, x->max_mv_context[mbmi->ref_frame[0]]) + + cpi->mv_step_param) >> 1; + else + step_param = (vp9_init_search_range( + cpi, MAX(abs(bsi->mvp.as_mv.row), + abs(bsi->mvp.as_mv.col)) >> 3) + + cpi->mv_step_param) >> 1; + } else { + step_param = cpi->mv_step_param; + } further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -2023,19 +2034,19 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, x->mvcost, cpi); bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int; - if (bw > 1) + if (num_4x4_blocks_wide > 1) bsi->rdstat[i + 1][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int; - if (bh > 1) + if (num_4x4_blocks_high > 1) bsi->rdstat[i + 2][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int; if (mbmi->ref_frame[1] > 0) { bsi->rdstat[i][mode_idx].mvs[1].as_int = second_mode_mv[this_mode].as_int; - if (bw > 1) + if (num_4x4_blocks_wide > 1) bsi->rdstat[i + 1][mode_idx].mvs[1].as_int = second_mode_mv[this_mode].as_int; - if (bh > 1) + if (num_4x4_blocks_high > 1) bsi->rdstat[i + 2][mode_idx].mvs[1].as_int = second_mode_mv[this_mode].as_int; } @@ -2136,11 +2147,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, return; } - for (j = 1; j < bh; ++j) + for (j = 1; j < num_4x4_blocks_high; ++j) vpx_memcpy(&x->partition_info->bmi[i + j * 2], &x->partition_info->bmi[i], sizeof(x->partition_info->bmi[i])); - for (j = 1; j < bw; ++j) + for (j = 1; j < num_4x4_blocks_wide; ++j) vpx_memcpy(&x->partition_info->bmi[i + j], &x->partition_info->bmi[i], sizeof(x->partition_info->bmi[i])); @@ -2227,6 +2238,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, int best_index = 0; int best_sad = INT_MAX; int this_sad = INT_MAX; + unsigned int max_mv = 0; uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; @@ -2236,6 +2248,8 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MV_REF_CANDIDATES; i++) { this_mv.as_int = mbmi->ref_mvs[ref_frame][i].as_int; + max_mv = MAX(max_mv, + MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3); // The list is at an end if we see 0 for a second time. if (!this_mv.as_int && zero_seen) break; @@ -2259,6 +2273,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x, // Note the index of the mv that worked best in the reference list. x->mv_best_ref_index[ref_frame] = best_index; + x->max_mv_context[ref_frame] = max_mv; } static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id, @@ -2505,12 +2520,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // Work out the size of the first step in the mv step search. // 0 here is maximum length first step. 1 is MAX >> 1 etc. if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) { - step_param = vp9_init_search_range(cpi, cpi->max_mv_magnitude); + // Take wtd average of the step_params based on the last frame's + // max mv magnitude and that based on the best ref mvs of the current + // block for the given reference. + step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) + + cpi->mv_step_param) >> 1; } else { - step_param = vp9_init_search_range( - cpi, MIN(cpi->common.width, cpi->common.height)); + step_param = cpi->mv_step_param; } - // mvp_full.as_int = ref_mv[0].as_int; mvp_full.as_int = mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int; diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 8d5b3860c..ef84cc5c0 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -219,11 +219,11 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { int i, tile_col, mi_row, mi_col; int temporal_predictor_count[PREDICTION_PROBS][2]; - int no_pred_segcounts[MAX_MB_SEGMENTS]; - int t_unpred_seg_counts[MAX_MB_SEGMENTS]; + int no_pred_segcounts[MAX_SEGMENTS]; + int t_unpred_seg_counts[MAX_SEGMENTS]; - vp9_prob no_pred_tree[MB_SEG_TREE_PROBS]; - vp9_prob t_pred_tree[MB_SEG_TREE_PROBS]; + vp9_prob no_pred_tree[SEG_TREE_PROBS]; + vp9_prob t_pred_tree[SEG_TREE_PROBS]; vp9_prob t_nopred_prob[PREDICTION_PROBS]; const int mis = cm->mode_info_stride; |