diff options
Diffstat (limited to 'vp8')
63 files changed, 2073 insertions, 2533 deletions
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c index 2b45afe4b..29288519f 100644 --- a/vp8/common/arm/arm_systemdependent.c +++ b/vp8/common/arm/arm_systemdependent.c @@ -45,7 +45,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6; rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6; - rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6; rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual; rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6; @@ -64,9 +63,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6; rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6; rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6; - rtcd->recon.recon = vp8_recon_b_armv6; - rtcd->recon.recon2 = vp8_recon2b_armv6; - rtcd->recon.recon4 = vp8_recon4b_armv6; } #endif @@ -82,7 +78,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon; rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon; - rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon; rtcd->idct.idct16 = vp8_short_idct4x4llm_neon; rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon; @@ -99,10 +94,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx) rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon; rtcd->recon.copy8x8 = vp8_copy_mem8x8_neon; rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon; - rtcd->recon.recon = vp8_recon_b_neon; - rtcd->recon.recon2 = vp8_recon2b_neon; - rtcd->recon.recon4 = vp8_recon4b_neon; - rtcd->recon.recon_mb = vp8_recon_mb_neon; rtcd->recon.build_intra_predictors_mby = vp8_build_intra_predictors_mby_neon; rtcd->recon.build_intra_predictors_mby_s = diff --git a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm index e0660e9fd..9aa659fa7 100644 --- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm +++ 
b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm @@ -11,25 +11,27 @@ AREA |.text|, CODE, READONLY -;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr, -; unsigned char *dst_ptr, int pitch, int stride) +;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, +; int pred_stride, unsigned char *dst_ptr, +; int dst_stride) ; r0 input_dc ; r1 pred_ptr -; r2 dest_ptr -; r3 pitch -; sp stride +; r2 pred_stride +; r3 dst_ptr +; sp dst_stride |vp8_dc_only_idct_add_v6| PROC - stmdb sp!, {r4 - r7, lr} + stmdb sp!, {r4 - r7} add r0, r0, #4 ; input_dc += 4 ldr r12, c0x0000FFFF - ldr r4, [r1], r3 - ldr r6, [r1], r3 + ldr r4, [r1], r2 and r0, r12, r0, asr #3 ; input_dc >> 3 + mask - ldr lr, [sp, #20] + ldr r6, [r1], r2 orr r0, r0, r0, lsl #16 ; a1 | a1 + ldr r12, [sp, #16] ; dst stride + uxtab16 r5, r0, r4 ; a1+2 | a1+0 uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1 uxtab16 r7, r0, r6 @@ -40,10 +42,10 @@ usat16 r6, #8, r6 orr r5, r5, r4, lsl #8 orr r7, r7, r6, lsl #8 - ldr r4, [r1], r3 + ldr r4, [r1], r2 + str r5, [r3], r12 ldr r6, [r1] - str r5, [r2], lr - str r7, [r2], lr + str r7, [r3], r12 uxtab16 r5, r0, r4 uxtab16 r4, r0, r4, ror #8 @@ -55,10 +57,11 @@ usat16 r6, #8, r6 orr r5, r5, r4, lsl #8 orr r7, r7, r6, lsl #8 - str r5, [r2], lr - str r7, [r2] + str r5, [r3], r12 + str r7, [r3] - ldmia sp!, {r4 - r7, pc} + ldmia sp!, {r4 - r7} + bx lr ENDP ; |vp8_dc_only_idct_add_v6| diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm index 27215afcd..b4d44cbeb 100644 --- a/vp8/common/arm/armv6/idct_v6.asm +++ b/vp8/common/arm/armv6/idct_v6.asm @@ -9,337 +9,194 @@ ; -; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14 - EXPORT |vp8_short_idct4x4llm_1_v6| - EXPORT |vp8_short_idct4x4llm_v6| - EXPORT |vp8_short_idct4x4llm_v6_scott| EXPORT |vp8_short_idct4x4llm_v6_dual| AREA |.text|, CODE, READONLY -;******************************************************************************** -;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, 
INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: 3/5 -;******************************************************************************** - -|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit - ; - ldrsh r0, [r0] ; load input[0] 1, r0 un 2 - add r0, r0, #4 ; 1 +4 - stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup - mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3 - pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack - mov r5, r4 ; expand expand - - strd r4, [r1], r2 ; *output = r0, post inc 1 - strd r4, [r1], r2 ; 1 - strd r4, [r1], r2 ; 1 - strd r4, [r1] ; 1 - ; - ldmia sp!, {r4, r5, pc} ; replace vars, return restore - ENDP ; |vp8_short_idct4x4llm_1_v6| -;******************************************************************************** -;******************************************************************************** -;******************************************************************************** - -;******************************************************************************** -;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: -;******************************************************************************** - -|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit - ; - stmdb sp!, {r4-r11, lr} ; backup registers 1 backup - ; - mov r4, #0x00004E00 ; 1 cst - orr r4, r4, #0x0000007B ; cospi8sqrt2minus1 - mov r5, #0x00008A00 ; 1 cst - orr r5, r5, #0x0000008C ; sinpi8sqrt2 - ; - mov r6, #4 ; i=4 1 i -loop1 ; - ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4] - ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12] - ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8] - ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0] - smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1 - smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2 - add r9, r7, r8 ; a1 = 
[0] + [8] 1 a1 - sub r7, r7, r8 ; b1 = [0] - [8] 1 b1 - add r11, r3, r11 ; temp2 1 - rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1 - smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2 - smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1 - add r8, r7, r11 ; b1 + c1 1 b+c - strh r8, [r1, r2] ; out[pitch] = b1+c1 1 - sub r7, r7, r11 ; b1 - c1 1 b-c - add r10, r12, r10 ; temp1 1 - add r3, r10, r3 ; d1 = temp1 + temp2 1 d1 - add r10, r9, r3 ; a1 + d1 1 a+d - sub r3, r9, r3 ; a1 - d1 1 a-d - add r8, r2, r2 ; pitch * 2 1 p*2 - strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1 - add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3 - strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1 - subs r6, r6, #1 ; i-- 1 -- - strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++ - bne loop1 ; if i>0, continue - ; - sub r1, r1, #8 ; set up out for next loop 1 -4 - ; for this iteration, input=prev output - mov r6, #4 ; i=4 1 i -; b returnfull -loop2 ; - ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1] - ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3] - ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2] - ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0] - smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1 - smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2 - add r7, r0, r3 ; a1 = [0] + [2] 1 a1 - sub r0, r0, r3 ; b1 = [0] - [2] 1 b1 - add r10, r8, r10 ; temp2 1 - rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1 - smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2 - smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1 - add r3, r0, r9 ; b1+c1 1 b+c - add r3, r3, #4 ; b1+c1+4 1 +4 - add r10, r11, r10 ; temp1 1 - mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3 - strh r3, [r1, #2] ; out[1] = b1+c1 1 - add r10, r10, r8 ; d1 = temp1 + temp2 1 d1 - add r3, r7, r10 ; a1+d1 1 a+d - add r3, r3, #4 ; a1+d1+4 1 +4 - sub r7, r7, r10 ; a1-d1 1 a-d - add r7, r7, #4 ; a1-d1+4 1 +4 - mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 
>>3 - mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3 - strh r7, [r1, #6] ; out[3] = a1-d1 1 - sub r0, r0, r9 ; b1-c1 1 b-c - add r0, r0, #4 ; b1-c1+4 1 +4 - subs r6, r6, #1 ; i-- 1 -- - mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3 - strh r0, [r1, #4] ; out[2] = b1-c1 1 - strh r3, [r1], r2 ; out[0] = a1+d1 1 -; add r1, r1, r2 ; out += pitch 1 ++ - bne loop2 ; if i>0, continue -returnfull ; - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore - ENDP -;******************************************************************************** -;******************************************************************************** -;******************************************************************************** - -;******************************************************************************** -;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: -;******************************************************************************** - -|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit -; mov r0, #0 ; -; ldr r0, [r0] ; - stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup - ; - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - ; - mov r5, #0x2 ; i i - ; -short_idct4x4llm_v6_scott_loop1 ; - ldr r10, [r0, #(4*2)] ; i5 | i4 5,4 - ldr r11, [r0, #(12*2)] ; i13 | i12 13,12 - ; - smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1 - smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2 - ; - smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2 - smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1 - ; - add r6, r6, r7 ; partial c1 lt1-lt2 - add r12, r12, r14 ; partial d1 l2t2+l2t1 - ; - smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1 - smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2 - ; - smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) 
h2t1 - smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2 - ; - add r7, r14, r7 ; partial c1_2 ht1+ht2 - sub r8, r8, r9 ; partial d1_2 h2t1-h2t2 - ; - pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack - pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack - ; - usub16 r6, r6, r10 ; c1_2 | c1_1 c - uadd16 r12, r12, r11 ; d1_2 | d1_1 d - ; - ldr r10, [r0, #0] ; i1 | i0 1,0 - ldr r11, [r0, #(8*2)] ; i9 | i10 9,10 - ; -;;;;;; add r0, r0, #0x4 ; +4 -;;;;;; add r1, r1, #0x4 ; +4 - ; - uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a - usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b - ; - uadd16 r7, r8, r12 ; a1 + d1 pair a+d - usub16 r14, r8, r12 ; a1 - d1 pair a-d - ; - str r7, [r1] ; op[0] = a1 + d1 - str r14, [r1, r2] ; op[pitch*3] = a1 - d1 - ; - add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++ - add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++ - ; - subs r5, r5, #0x1 ; -- - bne short_idct4x4llm_v6_scott_loop1 ; - ; - sub r1, r1, #16 ; reset output ptr - mov r5, #0x4 ; - mov r0, r1 ; input = output - ; -short_idct4x4llm_v6_scott_loop2 ; - ; - subs r5, r5, #0x1 ; - bne short_idct4x4llm_v6_scott_loop2 ; - ; - ldmia sp!, {r4 - r11, pc} ; - ENDP ; - ; -;******************************************************************************** -;******************************************************************************** -;******************************************************************************** - -;******************************************************************************** -;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: -;******************************************************************************** - -|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit - ; - stmdb sp!, {r4-r11, lr} ; backup registers 1 backup - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; 
sinpi8sqrt2 - mov r5, #0x2 ; i=2 i +; void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch, +; unsigned char *dst, int stride) +; r0 short* input +; r1 unsigned char* pred +; r2 int pitch +; r3 unsigned char* dst +; sp int stride + +|vp8_short_idct4x4llm_v6_dual| PROC + stmdb sp!, {r4-r11, lr} + + sub sp, sp, #4 + + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + + mov r5, #0x00004E00 ; cos + orr r5, r5, #0x0000007B ; cospi8sqrt2minus1 + orr r5, r5, #1<<31 ; loop counter on top bit + loop1_dual - ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 - ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 - ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 - - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s - pkhbt r7, r7, r9, lsl #16 ; 5c | 4c - smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c + ldr r6, [r0, #(4*2)] ; i5 | i4 + ldr r12, [r0, #(12*2)] ; i13|i12 + ldr r14, [r0, #(8*2)] ; i9 | i8 + + smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 + smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 + smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 + + smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 + pkhtb r7, r9, r7, asr #16 ; 5c | 4c pkhbt r8, r8, r10, lsl #16 ; 5s | 4s - uadd16 r6, r6, r7 ; 5c+5 | 4c+4 - smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s - smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c - smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s - subs r5, r5, #0x1 ; i-- -- - pkhbt r9, r9, r11, lsl #16 ; 13c | 12c - ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 + uadd16 r6, r6, r7 ; 5c+5 | 4c+4 + + smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 + smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 + smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 + + subs r5, r5, #1<<31 ; i-- + + pkhtb r9, 
r11, r9, asr #16 ; 13c | 12c + ldr r11, [r0] ; i1 | i0 pkhbt r10, r10, r7, lsl #16 ; 13s | 12s - uadd16 r7, r12, r9 ; 13c+13 | 12c+12 - usub16 r7, r8, r7 ; c c - uadd16 r6, r6, r10 ; d d - uadd16 r10, r11, r14 ; a a - usub16 r8, r11, r14 ; b b - uadd16 r9, r10, r6 ; a+d a+d - usub16 r10, r10, r6 ; a-d a-d - uadd16 r6, r8, r7 ; b+c b+c - usub16 r7, r8, r7 ; b-c b-c - str r6, [r1, r2] ; o5 | o4 - add r6, r2, r2 ; pitch * 2 p2 - str r7, [r1, r6] ; o9 | o8 - add r6, r6, r2 ; pitch * 3 p3 - str r10, [r1, r6] ; o13 | o12 - str r9, [r1], #0x4 ; o1 | o0 ++ - bne loop1_dual ; - mov r5, #0x2 ; i=2 i - sub r0, r1, #8 ; reset input/output i/o + uadd16 r7, r12, r9 ; 13c+13 | 12c+12 + + usub16 r7, r8, r7 ; c + uadd16 r6, r6, r10 ; d + uadd16 r10, r11, r14 ; a + usub16 r8, r11, r14 ; b + + uadd16 r9, r10, r6 ; a+d + usub16 r10, r10, r6 ; a-d + uadd16 r6, r8, r7 ; b+c + usub16 r7, r8, r7 ; b-c + + ; use input buffer to store intermediate results + str r6, [r0, #(4*2)] ; o5 | o4 + str r7, [r0, #(8*2)] ; o9 | o8 + str r10,[r0, #(12*2)] ; o13|o12 + str r9, [r0], #4 ; o1 | o0 + + bcs loop1_dual + + sub r0, r0, #8 ; reset input/output + str r0, [sp] + loop2_dual - ldr r6, [r0, r2] ; i5 | i4 5|4 - ldr r1, [r0] ; i1 | i0 1|0 - ldr r12, [r0, #0x4] ; i3 | i2 3|2 - add r14, r2, #0x4 ; pitch + 2 p+2 - ldr r14, [r0, r14] ; i7 | i6 7|6 - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s - pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 - pkhbt r7, r9, r7, lsl #16 ; 1c | 5c - pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1 - pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 - uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 - pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 - uadd16 r10, r11, r9 ; a a - usub16 r9, r11, r9 ; b b - pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 - subs r5, r5, #0x1 ; i-- -- - smulwt r7, r3, r6 ; (ip[3] * 
cospi8sqrt2minus1) >> 16 3c - smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s - smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c - smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s - - pkhbt r7, r12, r7, lsl #16 ; 3c | 7c - pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 - uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 - usub16 r12, r8, r6 ; c (o1 | o5) c - uadd16 r6, r11, r1 ; d (o3 | o7) d - uadd16 r7, r10, r6 ; a+d a+d - mov r8, #0x4 ; set up 4's 4 - orr r8, r8, #0x40000 ; 4|4 - usub16 r6, r10, r6 ; a-d a-d - uadd16 r6, r6, r8 ; a-d+4 3|7 - uadd16 r7, r7, r8 ; a+d+4 0|4 - uadd16 r10, r9, r12 ; b+c b+c - usub16 r1, r9, r12 ; b-c b-c - uadd16 r10, r10, r8 ; b+c+4 1|5 - uadd16 r1, r1, r8 ; b-c+4 2|6 - mov r8, r10, asr #19 ; o1 >> 3 - strh r8, [r0, #2] ; o1 - mov r8, r1, asr #19 ; o2 >> 3 - strh r8, [r0, #4] ; o2 - mov r8, r6, asr #19 ; o3 >> 3 - strh r8, [r0, #6] ; o3 - mov r8, r7, asr #19 ; o0 >> 3 - strh r8, [r0], r2 ; o0 +p - sxth r10, r10 ; - mov r8, r10, asr #3 ; o5 >> 3 - strh r8, [r0, #2] ; o5 - sxth r1, r1 ; - mov r8, r1, asr #3 ; o6 >> 3 - strh r8, [r0, #4] ; o6 - sxth r6, r6 ; - mov r8, r6, asr #3 ; o7 >> 3 - strh r8, [r0, #6] ; o7 - sxth r7, r7 ; - mov r8, r7, asr #3 ; o4 >> 3 - strh r8, [r0], r2 ; o4 +p -;;;;; subs r5, r5, #0x1 ; i-- -- - bne loop2_dual ; - ; - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore + + ldr r6, [r0, #(4*2)] ; i5 | i4 + ldr r12,[r0, #(2*2)] ; i3 | i2 + ldr r14,[r0, #(6*2)] ; i7 | i6 + ldr r0, [r0, #(0*2)] ; i1 | i0 + + smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 + smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16 + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 + smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16 + + pkhbt r11, r6, r0, lsl #16 ; i0 | i4 + pkhtb r7, r7, r9, asr #16 ; 1c | 5c + pkhtb r0, r0, r6, asr #16 ; i1 | i5 + pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 + + uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2 + pkhbt r9, r14, r12, lsl #16 ; i2 | i6 + uadd16 r10, r11, r9 ; a + 
usub16 r9, r11, r9 ; b + pkhtb r6, r12, r14, asr #16 ; i3 | i7 + + subs r5, r5, #1<<31 ; i-- + + smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 + smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 + smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 + smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 + + pkhtb r7, r7, r12, asr #16 ; 3c | 7c + pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 + + uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 + usub16 r12, r8, r6 ; c (o1 | o5) + uadd16 r6, r11, r0 ; d (o3 | o7) + uadd16 r7, r10, r6 ; a+d + + mov r8, #4 ; set up 4's + orr r8, r8, #0x40000 ; 4|4 + + usub16 r6, r10, r6 ; a-d + uadd16 r6, r6, r8 ; a-d+4, 3|7 + uadd16 r7, r7, r8 ; a+d+4, 0|4 + uadd16 r10, r9, r12 ; b+c + usub16 r0, r9, r12 ; b-c + uadd16 r10, r10, r8 ; b+c+4, 1|5 + uadd16 r8, r0, r8 ; b-c+4, 2|6 + + ldr lr, [sp, #40] ; dst stride + + ldrb r0, [r1] ; pred p0 + ldrb r11, [r1, #1] ; pred p1 + ldrb r12, [r1, #2] ; pred p2 + + add r0, r0, r7, asr #19 ; p0 + o0 + add r11, r11, r10, asr #19 ; p1 + o1 + add r12, r12, r8, asr #19 ; p2 + o2 + + usat r0, #8, r0 ; d0 = clip8(p0 + o0) + usat r11, #8, r11 ; d1 = clip8(p1 + o1) + usat r12, #8, r12 ; d2 = clip8(p2 + o2) + + add r0, r0, r11, lsl #8 ; |--|--|d1|d0| + + ldrb r11, [r1, #3] ; pred p3 + + add r0, r0, r12, lsl #16 ; |--|d2|d1|d0| + + add r11, r11, r6, asr #19 ; p3 + o3 + + sxth r7, r7 ; + sxth r10, r10 ; + + usat r11, #8, r11 ; d3 = clip8(p3 + o3) + + sxth r8, r8 ; + sxth r6, r6 ; + + add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0| + + ldrb r12, [r1, r2]! 
; pred p4 + str r0, [r3], lr + ldrb r11, [r1, #1] ; pred p5 + + add r12, r12, r7, asr #3 ; p4 + o4 + add r11, r11, r10, asr #3 ; p5 + o5 + + usat r12, #8, r12 ; d4 = clip8(p4 + o4) + usat r11, #8, r11 ; d5 = clip8(p5 + o5) + + ldrb r7, [r1, #2] ; pred p6 + ldrb r10, [r1, #3] ; pred p7 + + add r12, r12, r11, lsl #8 ; |--|--|d5|d4| + + add r7, r7, r8, asr #3 ; p6 + o6 + add r10, r10, r6, asr #3 ; p7 + o7 + + ldr r0, [sp] ; load input pointer + + usat r7, #8, r7 ; d6 = clip8(p6 + o6) + usat r10, #8, r10 ; d7 = clip8(p7 + o7) + + add r12, r12, r7, lsl #16 ; |--|d6|d5|d4| + add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4| + + str r12, [r3], lr + add r0, r0, #16 + add r1, r1, r2 ; pred + pitch + + bcs loop2_dual + + add sp, sp, #4 ; release the stack slot holding the saved input pointer + ldmia sp!, {r4 - r11, pc} + ENDP END diff --git a/vp8/common/arm/armv6/recon_v6.asm b/vp8/common/arm/armv6/recon_v6.asm deleted file mode 100644 index 99c7bcf2d..000000000 --- a/vp8/common/arm/armv6/recon_v6.asm +++ /dev/null @@ -1,281 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon_b_armv6| - EXPORT |vp8_recon2b_armv6| - EXPORT |vp8_recon4b_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code -prd RN r0 -dif RN r1 -dst RN r2 -stride RN r3 - -;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride) -; R0 char* pred_ptr -; R1 short * dif_ptr -; R2 char * dst_ptr -; R3 int stride - -; Description: -; Loop through the block adding the Pred and Diff together. Clamp and then -; store back into the Dst. - -; Restrictions : -; all buffers are expected to be 4 byte aligned coming in and -; going out. 
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; -; -; -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_recon_b_armv6| PROC - stmdb sp!, {r4 - r9, lr} - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - add dif, dif, #32 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 -;; ldr r6, [dif, #8] ; 1 | 0 -;; ldr r7, [dif, #12] ; 3 | 2 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - add dif, dif, #32 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 -;; ldr r6, [dif, #16] ; 1 | 0 -;; ldr r7, [dif, #20] ; 3 | 2 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - add dif, dif, #32 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 -;; ldr r6, [dif, #24] ; 1 | 0 -;; ldr r7, [dif, #28] ; 3 | 2 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ldmia sp!, {r4 - r9, 
pc} - - ENDP ; |recon_b| - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; -; -; -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; R0 char *pred_ptr -; R1 short *dif_ptr -; R2 char *dst_ptr -; R3 int stride -|vp8_recon4b_armv6| PROC - stmdb sp!, {r4 - r9, lr} - - mov lr, #4 - -recon4b_loop - ;0, 1, 2, 3 - ldr r4, [prd], #4 ; 3 | 2 | 1 | 0 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst] - - ;4, 5, 6, 7 - ldr r4, [prd], #4 -;; ldr r6, [dif, #32] -;; ldr r7, [dif, #36] - ldr r6, [dif, #8] - ldr r7, [dif, #12] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #4] - - ;8, 9, 10, 11 - ldr r4, [prd], #4 -;; ldr r6, [dif, #64] -;; ldr r7, [dif, #68] - ldr r6, [dif, #16] - ldr r7, [dif, #20] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #8] - - ;12, 13, 14, 15 - ldr r4, [prd], #4 -;; ldr r6, [dif, #96] -;; ldr r7, [dif, #100] - ldr r6, [dif, #24] - ldr r7, [dif, #28] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #12] - - add dst, dst, stride -;; add dif, dif, #8 - add dif, dif, #32 - - subs lr, lr, #1 - bne recon4b_loop - - ldmia sp!, {r4 - r9, pc} - - ENDP ; |Recon4B| - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; -; -; 
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; R0 char *pred_ptr -; R1 short *dif_ptr -; R2 char *dst_ptr -; R3 int stride -|vp8_recon2b_armv6| PROC - stmdb sp!, {r4 - r9, lr} - - mov lr, #4 - -recon2b_loop - ;0, 1, 2, 3 - ldr r4, [prd], #4 - ldr r6, [dif, #0] - ldr r7, [dif, #4] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst] - - ;4, 5, 6, 7 - ldr r4, [prd], #4 -;; ldr r6, [dif, #32] -;; ldr r7, [dif, #36] - ldr r6, [dif, #8] - ldr r7, [dif, #12] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #4] - - add dst, dst, stride -;; add dif, dif, #8 - add dif, dif, #16 - - subs lr, lr, #1 - bne recon2b_loop - - ldmia sp!, {r4 - r9, pc} - - ENDP ; |Recon2B| - - END diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h index 8b8d17917..c710c2eb0 100644 --- a/vp8/common/arm/idct_arm.h +++ b/vp8/common/arm/idct_arm.h @@ -13,16 +13,12 @@ #define IDCT_ARM_H #if HAVE_ARMV6 -extern prototype_idct(vp8_short_idct4x4llm_1_v6); extern prototype_idct(vp8_short_idct4x4llm_v6_dual); extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6); extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6); extern prototype_second_order(vp8_short_inv_walsh4x4_v6); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_idct_idct1 -#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6 - #undef vp8_idct_idct16 #define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual @@ -38,16 +34,12 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6); #endif #if HAVE_ARMV7 -extern prototype_idct(vp8_short_idct4x4llm_1_neon); extern prototype_idct(vp8_short_idct4x4llm_neon); extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon); extern 
prototype_second_order(vp8_short_inv_walsh4x4_1_neon); extern prototype_second_order(vp8_short_inv_walsh4x4_neon); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_idct_idct1 -#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon - #undef vp8_idct_idct16 #define vp8_idct_idct16 vp8_short_idct4x4llm_neon diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm index 49ba05fb0..65a4680c1 100644 --- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm +++ b/vp8/common/arm/neon/dc_only_idct_add_neon.asm @@ -14,22 +14,26 @@ PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, -; unsigned char *dst_ptr, int pitch, int stride) + +;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, +; int pred_stride, unsigned char *dst_ptr, +; int dst_stride) + ; r0 input_dc ; r1 pred_ptr -; r2 dst_ptr -; r3 pitch -; sp stride +; r2 pred_stride +; r3 dst_ptr +; sp dst_stride + |vp8_dc_only_idct_add_neon| PROC add r0, r0, #4 asr r0, r0, #3 ldr r12, [sp] vdup.16 q0, r0 - vld1.32 {d2[0]}, [r1], r3 - vld1.32 {d2[1]}, [r1], r3 - vld1.32 {d4[0]}, [r1], r3 + vld1.32 {d2[0]}, [r1], r2 + vld1.32 {d2[1]}, [r1], r2 + vld1.32 {d4[0]}, [r1], r2 vld1.32 {d4[1]}, [r1] vaddw.u8 q1, q0, d2 @@ -38,12 +42,13 @@ vqmovun.s16 d2, q1 vqmovun.s16 d4, q2 - vst1.32 {d2[0]}, [r2], r12 - vst1.32 {d2[1]}, [r2], r12 - vst1.32 {d4[0]}, [r2], r12 - vst1.32 {d4[1]}, [r2] - - bx lr + vst1.32 {d2[0]}, [r3], r12 + vst1.32 {d2[1]}, [r3], r12 + vst1.32 {d4[0]}, [r3], r12 + vst1.32 {d4[1]}, [r3] + + bx lr ENDP + END diff --git a/vp8/common/arm/neon/recon16x16mb_neon.asm b/vp8/common/arm/neon/recon16x16mb_neon.asm deleted file mode 100644 index 3f1a30f48..000000000 --- a/vp8/common/arm/neon/recon16x16mb_neon.asm +++ /dev/null @@ -1,131 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon16x16mb_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int ystride, -; stack unsigned char *udst_ptr, -; stack unsigned char *vdst_ptr - -|vp8_recon16x16mb_neon| PROC - mov r12, #4 ;loop counter for Y loop - -recon16x16mb_loop_y - vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr - vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr - vld1.u8 {q14, q15}, [r0]! - vld1.16 {q10, q11}, [r1]! - - vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d25 - vmovl.u8 q2, d26 - vmovl.u8 q3, d27 - vmovl.u8 q4, d28 - vmovl.u8 q5, d29 - vmovl.u8 q6, d30 - vld1.16 {q12, q13}, [r1]! - vmovl.u8 q7, d31 - vld1.16 {q14, q15}, [r1]! - - pld [r0] - pld [r1] - pld [r1, #64] - - vadd.s16 q0, q0, q8 ;add Diff data and Pred data together - vadd.s16 q1, q1, q9 - vadd.s16 q2, q2, q10 - vadd.s16 q3, q3, q11 - vadd.s16 q4, q4, q12 - vadd.s16 q5, q5, q13 - vadd.s16 q6, q6, q14 - vadd.s16 q7, q7, q15 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - vqmovun.s16 d4, q4 - vqmovun.s16 d5, q5 - vst1.u8 {q0}, [r2], r3 ;store result - vqmovun.s16 d6, q6 - vst1.u8 {q1}, [r2], r3 - vqmovun.s16 d7, q7 - vst1.u8 {q2}, [r2], r3 - subs r12, r12, #1 - - moveq r12, #2 ;loop counter for UV loop - - vst1.u8 {q3}, [r2], r3 - bne recon16x16mb_loop_y - - mov r3, r3, lsr #1 ;uv_stride = ystride>>1 - ldr r2, [sp] ;load upred_ptr - -recon16x16mb_loop_uv - vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr - vld1.16 {q8, q9}, [r1]! 
;load data from diff_ptr - vld1.u8 {q14, q15}, [r0]! - vld1.16 {q10, q11}, [r1]! - - vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d25 - vmovl.u8 q2, d26 - vmovl.u8 q3, d27 - vmovl.u8 q4, d28 - vmovl.u8 q5, d29 - vmovl.u8 q6, d30 - vld1.16 {q12, q13}, [r1]! - vmovl.u8 q7, d31 - vld1.16 {q14, q15}, [r1]! - - vadd.s16 q0, q0, q8 ;add Diff data and Pred data together - vadd.s16 q1, q1, q9 - vadd.s16 q2, q2, q10 - vadd.s16 q3, q3, q11 - vadd.s16 q4, q4, q12 - vadd.s16 q5, q5, q13 - vadd.s16 q6, q6, q14 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vadd.s16 q7, q7, q15 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - vst1.u8 {d0}, [r2], r3 ;store result - vqmovun.s16 d4, q4 - vst1.u8 {d1}, [r2], r3 - vqmovun.s16 d5, q5 - vst1.u8 {d2}, [r2], r3 - vqmovun.s16 d6, q6 - vst1.u8 {d3}, [r2], r3 - vqmovun.s16 d7, q7 - vst1.u8 {d4}, [r2], r3 - subs r12, r12, #1 - - vst1.u8 {d5}, [r2], r3 - vst1.u8 {d6}, [r2], r3 - vst1.u8 {d7}, [r2], r3 - - ldrne r2, [sp, #4] ;load vpred_ptr - bne recon16x16mb_loop_uv - - bx lr - - ENDP - END diff --git a/vp8/common/arm/neon/recon2b_neon.asm b/vp8/common/arm/neon/recon2b_neon.asm deleted file mode 100644 index 99b251c91..000000000 --- a/vp8/common/arm/neon/recon2b_neon.asm +++ /dev/null @@ -1,54 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon2b_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int stride - -|vp8_recon2b_neon| PROC - vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr - vld1.16 {q4, q5}, [r1]! 
;load data from diff_ptr - - vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits - vld1.16 {q6, q7}, [r1]! - vmovl.u8 q1, d17 - vmovl.u8 q2, d18 - vmovl.u8 q3, d19 - - vadd.s16 q0, q0, q4 ;add Diff data and Pred data together - vadd.s16 q1, q1, q5 - vadd.s16 q2, q2, q6 - vadd.s16 q3, q3, q7 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - add r0, r2, r3 - - vst1.u8 {d0}, [r2] ;store result - vst1.u8 {d1}, [r0], r3 - add r2, r0, r3 - vst1.u8 {d2}, [r0] - vst1.u8 {d3}, [r2], r3 - - bx lr - - ENDP - END diff --git a/vp8/common/arm/neon/recon4b_neon.asm b/vp8/common/arm/neon/recon4b_neon.asm deleted file mode 100644 index 991727746..000000000 --- a/vp8/common/arm/neon/recon4b_neon.asm +++ /dev/null @@ -1,69 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon4b_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int stride - -|vp8_recon4b_neon| PROC - vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr - vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr - vld1.u8 {q14, q15}, [r0] - vld1.16 {q10, q11}, [r1]! - - vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d25 - vmovl.u8 q2, d26 - vmovl.u8 q3, d27 - vmovl.u8 q4, d28 - vmovl.u8 q5, d29 - vmovl.u8 q6, d30 - vld1.16 {q12, q13}, [r1]! 
- vmovl.u8 q7, d31 - vld1.16 {q14, q15}, [r1] - - vadd.s16 q0, q0, q8 ;add Diff data and Pred data together - vadd.s16 q1, q1, q9 - vadd.s16 q2, q2, q10 - vadd.s16 q3, q3, q11 - vadd.s16 q4, q4, q12 - vadd.s16 q5, q5, q13 - vadd.s16 q6, q6, q14 - vadd.s16 q7, q7, q15 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - vqmovun.s16 d4, q4 - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - add r0, r2, r3 - - vst1.u8 {q0}, [r2] ;store result - vst1.u8 {q1}, [r0], r3 - add r2, r0, r3 - vst1.u8 {q2}, [r0] - vst1.u8 {q3}, [r2], r3 - - bx lr - - ENDP - END diff --git a/vp8/common/arm/neon/recon_neon.c b/vp8/common/arm/neon/recon_neon.c deleted file mode 100644 index d2aafd51f..000000000 --- a/vp8/common/arm/neon/recon_neon.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vpx_config.h" -#include "vp8/common/recon.h" -#include "vp8/common/blockd.h" - -extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr); - -void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) -{ - unsigned char *pred_ptr = &x->predictor[0]; - short *diff_ptr = &x->diff[0]; - unsigned char *dst_ptr = x->dst.y_buffer; - unsigned char *udst_ptr = x->dst.u_buffer; - unsigned char *vdst_ptr = x->dst.v_buffer; - int ystride = x->dst.y_stride; - /*int uv_stride = x->dst.uv_stride;*/ - - vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr); -} diff --git a/vp8/common/arm/neon/reconb_neon.asm b/vp8/common/arm/neon/reconb_neon.asm deleted file mode 100644 index 288c0ef01..000000000 --- a/vp8/common/arm/neon/reconb_neon.asm +++ /dev/null @@ -1,61 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon_b_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int stride - -|vp8_recon_b_neon| PROC - mov r12, #16 - - vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr - vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr - vld1.u8 {d29}, [r0], r12 - vld1.16 {q11, q12}, [r1]! - vld1.u8 {d30}, [r0], r12 - vld1.16 {q12, q13}, [r1]! 
- vld1.u8 {d31}, [r0], r12 - vld1.16 {q13}, [r1] - - vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6 - vmovl.u8 q2, d30 - vmovl.u8 q3, d31 - - vadd.s16 d0, d0, d20 ;add Diff data and Pred data together - vadd.s16 d2, d2, d22 - vadd.s16 d4, d4, d24 - vadd.s16 d6, d6, d26 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - add r1, r2, r3 - - vst1.32 {d0[0]}, [r2] ;store result - vst1.32 {d1[0]}, [r1], r3 - add r2, r1, r3 - vst1.32 {d2[0]}, [r1] - vst1.32 {d3[0]}, [r2], r3 - - bx lr - - ENDP - END diff --git a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm deleted file mode 100644 index d7bdbae75..000000000 --- a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm +++ /dev/null @@ -1,67 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_short_idct4x4llm_1_neon| - EXPORT |vp8_dc_only_idct_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); -; r0 short *input; -; r1 short *output; -; r2 int pitch; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -|vp8_short_idct4x4llm_1_neon| PROC - vld1.16 {d0[]}, [r0] ;load input[0] - - add r3, r1, r2 - add r12, r3, r2 - - vrshr.s16 d0, d0, #3 - - add r0, r12, r2 - - vst1.16 {d0}, [r1] - vst1.16 {d0}, [r3] - vst1.16 {d0}, [r12] - vst1.16 {d0}, [r0] - - bx lr - ENDP - -;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); -; r0 short input_dc; -; r1 short *output; -; r2 int pitch; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -|vp8_dc_only_idct_neon| PROC - vdup.16 d0, r0 - - add r3, r1, r2 - add r12, r3, r2 - - vrshr.s16 d0, d0, #3 - - add r0, r12, r2 - - vst1.16 {d0}, [r1] - vst1.16 {d0}, [r3] - vst1.16 {d0}, [r12] - vst1.16 {d0}, [r0] - - bx lr - - ENDP - END diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm index b74c31521..67d2ab015 100644 --- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm +++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm @@ -17,18 +17,24 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 ;************************************************************* -;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) +;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, +; unsigned char *dst, int stride) ;r0 short * input -;r1 short * output +;r1 short * pred ;r2 int pitch +;r3 unsigned char dst +;sp int stride ;************************************************************* -;static const int cospi8sqrt2minus1=20091; -;static const int sinpi8sqrt2 =35468; -;static const int rounding = 0; -;Optimization note: The resulted data from dequantization are signed 13-bit data that is -;in the range of [-4096, 4095]. 
This allows to use "vqdmulh"(neon) instruction since -;it won't go out of range (13+16+1=30bits<32bits). This instruction gives the high half -;result of the multiplication that is needed in IDCT. + +; static const int cospi8sqrt2minus1=20091; +; static const int sinpi8sqrt2 =35468; +; static const int rounding = 0; + +; Optimization note: The resulted data from dequantization are signed +; 13-bit data that is in the range of [-4096, 4095]. This allows to +; use "vqdmulh"(neon) instruction since it won't go out of range +; (13+16+1=30bits<32bits). This instruction gives the high half +; result of the multiplication that is needed in IDCT. |vp8_short_idct4x4llm_neon| PROC adr r12, idct_coeff @@ -36,6 +42,7 @@ vld1.16 {d0}, [r12] vswp d3, d4 ;q2(vp[4] vp[12]) + ldr r0, [sp] ; stride vqdmulh.s16 q3, q2, d0[2] vqdmulh.s16 q4, q2, d0[0] @@ -94,21 +101,31 @@ vrshr.s16 d4, d4, #3 vrshr.s16 d5, d5, #3 - add r3, r1, r2 - add r12, r3, r2 - add r0, r12, r2 - vtrn.32 d2, d4 vtrn.32 d3, d5 vtrn.16 d2, d3 vtrn.16 d4, d5 - vst1.16 {d2}, [r1] - vst1.16 {d3}, [r3] - vst1.16 {d4}, [r12] - vst1.16 {d5}, [r0] + ; load prediction data + vld1.32 d6[0], [r1], r2 + vld1.32 d6[1], [r1], r2 + vld1.32 d7[0], [r1], r2 + vld1.32 d7[1], [r1], r2 + + ; add prediction and residual + vaddw.u8 q1, q1, d6 + vaddw.u8 q2, q2, d7 + + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + + ; store to destination + vst1.32 d1[0], [r3], r0 + vst1.32 d1[1], [r3], r0 + vst1.32 d2[0], [r3], r0 + vst1.32 d2[1], [r3], r0 - bx lr + bx lr ENDP diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h index 377cb2a07..dec7fc425 100644 --- a/vp8/common/arm/recon_arm.h +++ b/vp8/common/arm/recon_arm.h @@ -13,24 +13,12 @@ #define RECON_ARM_H #if HAVE_ARMV6 -extern prototype_recon_block(vp8_recon_b_armv6); -extern prototype_recon_block(vp8_recon2b_armv6); -extern prototype_recon_block(vp8_recon4b_armv6); extern prototype_copy_block(vp8_copy_mem8x8_v6); extern prototype_copy_block(vp8_copy_mem8x4_v6); extern 
prototype_copy_block(vp8_copy_mem16x16_v6); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_recon_recon -#define vp8_recon_recon vp8_recon_b_armv6 - -#undef vp8_recon_recon2 -#define vp8_recon_recon2 vp8_recon2b_armv6 - -#undef vp8_recon_recon4 -#define vp8_recon_recon4 vp8_recon4b_armv6 - #undef vp8_recon_copy8x8 #define vp8_recon_copy8x8 vp8_copy_mem8x8_v6 @@ -43,29 +31,15 @@ extern prototype_copy_block(vp8_copy_mem16x16_v6); #endif #if HAVE_ARMV7 -extern prototype_recon_block(vp8_recon_b_neon); -extern prototype_recon_block(vp8_recon2b_neon); -extern prototype_recon_block(vp8_recon4b_neon); extern prototype_copy_block(vp8_copy_mem8x8_neon); extern prototype_copy_block(vp8_copy_mem8x4_neon); extern prototype_copy_block(vp8_copy_mem16x16_neon); -extern prototype_recon_macroblock(vp8_recon_mb_neon); - extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon); extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_recon_recon -#define vp8_recon_recon vp8_recon_b_neon - -#undef vp8_recon_recon2 -#define vp8_recon_recon2 vp8_recon2b_neon - -#undef vp8_recon_recon4 -#define vp8_recon_recon4 vp8_recon4b_neon - #undef vp8_recon_copy8x8 #define vp8_recon_copy8x8 vp8_copy_mem8x8_neon @@ -75,9 +49,6 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon); #undef vp8_recon_copy16x16 #define vp8_recon_copy16x16 vp8_copy_mem16x16_neon -#undef vp8_recon_recon_mb -#define vp8_recon_recon_mb vp8_recon_mb_neon - #undef vp8_recon_build_intra_predictors_mby #define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index d1dd60286..5c4fbb193 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -70,7 +70,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) #if CONFIG_RUNTIME_CPU_DETECT VP8_COMMON_RTCD *rtcd = &ctx->rtcd; - 
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c; rtcd->idct.idct16 = vp8_short_idct4x4llm_c; rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c; rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c; @@ -79,11 +78,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) rtcd->recon.copy16x16 = vp8_copy_mem16x16_c; rtcd->recon.copy8x8 = vp8_copy_mem8x8_c; rtcd->recon.copy8x4 = vp8_copy_mem8x4_c; - rtcd->recon.recon = vp8_recon_b_c; - rtcd->recon.recon2 = vp8_recon2b_c; - rtcd->recon.recon4 = vp8_recon4b_c; - rtcd->recon.recon_mb = vp8_recon_mb_c; - rtcd->recon.recon_mby = vp8_recon_mby_c; + rtcd->recon.build_intra_predictors_mby = vp8_build_intra_predictors_mby; rtcd->recon.build_intra_predictors_mby_s = diff --git a/vp8/common/idct.h b/vp8/common/idct.h index f5fd94dfd..411a1b472 100644 --- a/vp8/common/idct.h +++ b/vp8/common/idct.h @@ -16,12 +16,14 @@ void sym(short *input, short *output) #define prototype_idct(sym) \ - void sym(short *input, short *output, int pitch) + void sym(short *input, unsigned char *pred, int pitch, unsigned char *dst, \ + int dst_stride) #define prototype_idct_scalar_add(sym) \ void sym(short input, \ - unsigned char *pred, unsigned char *output, \ - int pitch, int stride) + unsigned char *pred, int pred_stride, \ + unsigned char *dst, \ + int dst_stride) #if ARCH_X86 || ARCH_X86_64 #include "x86/idct_x86.h" @@ -31,11 +33,6 @@ #include "arm/idct_arm.h" #endif -#ifndef vp8_idct_idct1 -#define vp8_idct_idct1 vp8_short_idct4x4llm_1_c -#endif -extern prototype_idct(vp8_idct_idct1); - #ifndef vp8_idct_idct16 #define vp8_idct_idct16 vp8_short_idct4x4llm_c #endif @@ -63,7 +60,6 @@ typedef prototype_second_order((*vp8_second_order_fn_t)); typedef struct { - vp8_idct_fn_t idct1; vp8_idct_fn_t idct16; vp8_idct_scalar_add_fn_t idct1_scalar_add; diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c index 196062df6..49496abef 100644 --- a/vp8/common/idctllm.c +++ b/vp8/common/idctllm.c @@ -24,28 +24,31 @@ 
**************************************************************************/ static const int cospi8sqrt2minus1 = 20091; static const int sinpi8sqrt2 = 35468; -static const int rounding = 0; -void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { int i; + int r, c; int a1, b1, c1, d1; - + short output[16]; short *ip = input; short *op = output; int temp1, temp2; - int shortpitch = pitch >> 1; + int shortpitch = 4; for (i = 0; i < 4; i++) { a1 = ip[0] + ip[8]; b1 = ip[0] - ip[8]; - temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16; - temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16); + temp1 = (ip[4] * sinpi8sqrt2) >> 16; + temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16); c1 = temp1 - temp2; - temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16); - temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16; + temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[12] * sinpi8sqrt2) >> 16; d1 = temp1 + temp2; op[shortpitch*0] = a1 + d1; @@ -66,12 +69,12 @@ void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) a1 = ip[0] + ip[2]; b1 = ip[0] - ip[2]; - temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16; - temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16); + temp1 = (ip[1] * sinpi8sqrt2) >> 16; + temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16); c1 = temp1 - temp2; - temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16); - temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16; + temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16); + temp2 = (ip[3] * sinpi8sqrt2) >> 16; d1 = temp1 + temp2; @@ -84,27 +87,31 @@ void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) ip += shortpitch; op += shortpitch; } -} -void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch) -{ - int i; - int a1; - short *op = output; - int shortpitch = pitch >> 1; - a1 = 
((input[0] + 4) >> 3); - - for (i = 0; i < 4; i++) + ip = output; + for (r = 0; r < 4; r++) { - op[0] = a1; - op[1] = a1; - op[2] = a1; - op[3] = a1; - op += shortpitch; + for (c = 0; c < 4; c++) + { + int a = ip[c] + pred_ptr[c] ; + + if (a < 0) + a = 0; + + if (a > 255) + a = 255; + + dst_ptr[c] = (unsigned char) a ; + } + ip += 4; + dst_ptr += dst_stride; + pred_ptr += pred_stride; } } -void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride) +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { int a1 = ((input_dc + 4) >> 3); int r, c; @@ -124,8 +131,8 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned ch dst_ptr[c] = (unsigned char) a ; } - dst_ptr += stride; - pred_ptr += pitch; + dst_ptr += dst_stride; + pred_ptr += pred_stride; } } diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c index 81a3f2d89..7712b59b7 100644 --- a/vp8/common/invtrans.c +++ b/vp8/common/invtrans.c @@ -12,6 +12,21 @@ #include "invtrans.h" +void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, + int pitch) +{ + if (b->eob > 1) + { + IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch, + *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch, + *(b->base_dst) + b->dst, b->dst_stride); + } + +} static void recon_dcblock(MACROBLOCKD *x) { @@ -25,15 +40,6 @@ static void recon_dcblock(MACROBLOCKD *x) } -void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch) -{ - if (b->eob > 1) - IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch); - else - IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch); -} - - void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) { int i; @@ -45,7 +51,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD 
* for (i = 0; i < 16; i++) { - vp8_inverse_transform_b(rtcd, &x->block[i], 32); + vp8_inverse_transform_b(rtcd, &x->block[i], 16); } } @@ -55,34 +61,10 @@ void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD for (i = 16; i < 24; i++) { - vp8_inverse_transform_b(rtcd, &x->block[i], 16); + vp8_inverse_transform_b(rtcd, &x->block[i], 8); } } -void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) -{ - int i; - if (x->mode_info_context->mbmi.mode != B_PRED && - x->mode_info_context->mbmi.mode != SPLITMV) - { - /* do 2nd order transform on the dc block */ - - IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff); - recon_dcblock(x); - } - - for (i = 0; i < 16; i++) - { - vp8_inverse_transform_b(rtcd, &x->block[i], 32); - } - - - for (i = 16; i < 24; i++) - { - vp8_inverse_transform_b(rtcd, &x->block[i], 16); - } - -} diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 015b4c4d4..e911ea0f4 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -19,6 +19,7 @@ extern "C" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" #include "vpx_scale/yv12config.h" #include "type_aliases.h" #include "ppflags.h" @@ -145,9 +146,9 @@ extern "C" int over_shoot_pct; // buffering parameters - int starting_buffer_level; // in seconds - int optimal_buffer_level; - int maximum_buffer_size; + int64_t starting_buffer_level; // in seconds + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; // controlling quality int fixed_q; @@ -198,6 +199,14 @@ extern "C" struct vpx_codec_pkt_list *output_pkt_list; vp8e_tuning tuning; + + // Temporal scaling parameters + unsigned int number_of_layers; + unsigned int target_bitrate[MAX_PERIODICITY]; + unsigned int rate_decimator[MAX_PERIODICITY]; + unsigned int periodicity; + unsigned int layer_id[MAX_PERIODICITY]; + } VP8_CONFIG; diff --git a/vp8/common/recon.h b/vp8/common/recon.h index 7cfc779cd..62bd71aac 100644 --- 
a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -18,7 +18,7 @@ void sym(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch) #define prototype_recon_block(sym) \ - void sym(unsigned char *pred, short *diff, unsigned char *dst, int pitch) + void sym(unsigned char *pred, short *diff, int diff_stride, unsigned char *dst, int pitch) #define prototype_recon_macroblock(sym) \ void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x) @@ -27,7 +27,7 @@ void sym(MACROBLOCKD *x) #define prototype_intra4x4_predict(sym) \ - void sym(BLOCKD *x, int b_mode, unsigned char *predictor) + void sym(BLOCKD *x, int b_mode, unsigned char *predictor, int stride) struct vp8_recon_rtcd_vtable; @@ -54,31 +54,6 @@ extern prototype_copy_block(vp8_recon_copy8x8); #endif extern prototype_copy_block(vp8_recon_copy8x4); -#ifndef vp8_recon_recon -#define vp8_recon_recon vp8_recon_b_c -#endif -extern prototype_recon_block(vp8_recon_recon); - -#ifndef vp8_recon_recon2 -#define vp8_recon_recon2 vp8_recon2b_c -#endif -extern prototype_recon_block(vp8_recon_recon2); - -#ifndef vp8_recon_recon4 -#define vp8_recon_recon4 vp8_recon4b_c -#endif -extern prototype_recon_block(vp8_recon_recon4); - -#ifndef vp8_recon_recon_mb -#define vp8_recon_recon_mb vp8_recon_mb_c -#endif -extern prototype_recon_macroblock(vp8_recon_recon_mb); - -#ifndef vp8_recon_recon_mby -#define vp8_recon_recon_mby vp8_recon_mby_c -#endif -extern prototype_recon_macroblock(vp8_recon_recon_mby); - #ifndef vp8_recon_build_intra_predictors_mby #define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby #endif @@ -111,8 +86,6 @@ extern prototype_intra4x4_predict\ typedef prototype_copy_block((*vp8_copy_block_fn_t)); -typedef prototype_recon_block((*vp8_recon_fn_t)); -typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t)); typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t)); typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t)); typedef struct vp8_recon_rtcd_vtable @@ 
-120,11 +93,7 @@ typedef struct vp8_recon_rtcd_vtable vp8_copy_block_fn_t copy16x16; vp8_copy_block_fn_t copy8x8; vp8_copy_block_fn_t copy8x4; - vp8_recon_fn_t recon; - vp8_recon_fn_t recon2; - vp8_recon_fn_t recon4; - vp8_recon_mb_fn_t recon_mb; - vp8_recon_mb_fn_t recon_mby; + vp8_build_intra_pred_fn_t build_intra_predictors_mby_s; vp8_build_intra_pred_fn_t build_intra_predictors_mby; vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s; @@ -138,5 +107,4 @@ typedef struct vp8_recon_rtcd_vtable #define RECON_INVOKE(ctx,fn) vp8_recon_##fn #endif -void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x); #endif diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c index e4e8a80a4..24c09a353 100644 --- a/vp8/common/reconinter.c +++ b/vp8/common/reconinter.c @@ -123,7 +123,6 @@ void vp8_copy_mem8x4_c( } - void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf) { int r; @@ -159,41 +158,73 @@ void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf) } } -static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch) +static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride) { unsigned char *ptr_base; unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; ptr_base = *(d->base_pre); ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) { - x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch); + x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); } else { - RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, pred_ptr, pitch); + RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst, dst_stride); } } -static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch) +static void 
build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride) { unsigned char *ptr_base; unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; ptr_base = *(d->base_pre); ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) { - x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch); + x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, dst, dst_stride); + } +} + +static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, vp8_subpix_fn_t sppf) +{ + int r; + unsigned char *ptr_base; + unsigned char *ptr; + + ptr_base = *(d->base_pre); + + if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) + { + ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride); } else { - RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, pred_ptr, pitch); + ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + ptr = ptr_base; + + for (r = 0; r < 4; r++) + { +#if !(CONFIG_FAST_UNALIGNED) + dst[0] = ptr[0]; + dst[1] = ptr[1]; + dst[2] = ptr[2]; + dst[3] = ptr[3]; +#else + *(uint32_t *)dst = *(uint32_t *)ptr ; +#endif + dst += dst_stride; + ptr += d->pre_stride; + } } } @@ -292,7 +323,7 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x) BLOCKD *d1 = &x->block[i+1]; if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, 8); + build_inter_predictors2b(x, d0, d0->predictor, 8); else { vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict); @@ -435,6 +466,9 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x) if 
(x->mode_info_context->mbmi.partitioning < 3) { + BLOCKD *b; + int dst_stride = x->block[ 0].dst_stride; + x->block[ 0].bmi = x->mode_info_context->bmi[ 0]; x->block[ 2].bmi = x->mode_info_context->bmi[ 2]; x->block[ 8].bmi = x->mode_info_context->bmi[ 8]; @@ -447,10 +481,14 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x) clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x); } - build_inter_predictors4b(x, &x->block[ 0], 16); - build_inter_predictors4b(x, &x->block[ 2], 16); - build_inter_predictors4b(x, &x->block[ 8], 16); - build_inter_predictors4b(x, &x->block[10], 16); + b = &x->block[ 0]; + build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride); + b = &x->block[ 2]; + build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride); + b = &x->block[ 8]; + build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride); + b = &x->block[10]; + build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride); } else { @@ -458,6 +496,7 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x) { BLOCKD *d0 = &x->block[i]; BLOCKD *d1 = &x->block[i+1]; + int dst_stride = x->block[ 0].dst_stride; x->block[i+0].bmi = x->mode_info_context->bmi[i+0]; x->block[i+1].bmi = x->mode_info_context->bmi[i+1]; @@ -468,11 +507,11 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x) } if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, 16); + build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride); else { - vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict); - vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict); + build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict); } } @@ -483,15 +522,16 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x) { BLOCKD *d0 = &x->block[i]; BLOCKD *d1 = &x->block[i+1]; + int dst_stride = x->block[ 16].dst_stride; /* Note: 
uv mvs already clamped in build_4x4uvmvs() */ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, 8); + build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride); else { - vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict); - vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict); + build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict); + build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict); } } } @@ -542,17 +582,83 @@ void build_4x4uvmvs(MACROBLOCKD *x) } } -void vp8_build_inter_predictors_mb(MACROBLOCKD *x) +void vp8_build_inter_predictors_mb(MACROBLOCKD *xd) { - if (x->mode_info_context->mbmi.mode != SPLITMV) + if (xd->mode_info_context->mbmi.mode != SPLITMV) { - vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256], - &x->predictor[320], 16, 8); + vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); } else { - build_4x4uvmvs(x); - build_inter4x4_predictors_mb(x); + build_4x4uvmvs(xd); + build_inter4x4_predictors_mb(xd); } } +/* encoder only*/ +static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x) +{ + int i; + if (x->mode_info_context->mbmi.partitioning < 3) + { + x->block[ 0].bmi = x->mode_info_context->bmi[ 0]; + x->block[ 2].bmi = x->mode_info_context->bmi[ 2]; + x->block[ 8].bmi = x->mode_info_context->bmi[ 8]; + x->block[10].bmi = x->mode_info_context->bmi[10]; + + build_inter_predictors4b(x, &x->block[ 0], x->block[ 0].predictor, 16); + build_inter_predictors4b(x, &x->block[ 2], x->block[ 2].predictor, 16); + build_inter_predictors4b(x, &x->block[ 8], x->block[ 8].predictor, 16); + build_inter_predictors4b(x, &x->block[10], x->block[10].predictor, 16); + } + else + { + for (i = 0; i < 16; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + x->block[i+0].bmi = x->mode_info_context->bmi[i+0]; + x->block[i+1].bmi = 
x->mode_info_context->bmi[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, d0->predictor, 16); + else + { + build_inter_predictors_b(d0, d0->predictor, 16, x->subpixel_predict); + build_inter_predictors_b(d1, d1->predictor, 16, x->subpixel_predict); + } + + } + + } + + for (i = 16; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, d0->predictor, 8); + else + { + build_inter_predictors_b(d0, d0->predictor, 8, x->subpixel_predict); + build_inter_predictors_b(d1, d1->predictor, 8, x->subpixel_predict); + } + } +} +void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd) +{ + if (xd->mode_info_context->mbmi.mode != SPLITMV) + { + vp8_build_inter16x16_predictors_mb(xd, xd->predictor, &xd->predictor[256], + &xd->predictor[320], 16, 8); + } + else + { + build_4x4uvmvs(xd); + build_inter4x4_predictors_mb_e(xd); + } +} diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h index 456812ecd..86f9d5ae3 100644 --- a/vp8/common/reconinter.h +++ b/vp8/common/reconinter.h @@ -26,5 +26,6 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t s extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); +extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd); #endif diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c index 16dadc47d..c0863eeb1 100644 --- a/vp8/common/reconintra.c +++ b/vp8/common/reconintra.c @@ -17,16 +17,6 @@ /* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and * vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x). 
*/ -void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) -{ - int i; - - for (i = 16; i < 24; i += 2) - { - BLOCKD *b = &x->block[i]; - RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); - } -} void vp8_build_intra_predictors_mby(MACROBLOCKD *x) { diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c index 0e1ebb584..12430da92 100644 --- a/vp8/common/reconintra4x4.c +++ b/vp8/common/reconintra4x4.c @@ -16,7 +16,7 @@ void vp8_intra4x4_predict(BLOCKD *x, int b_mode, - unsigned char *predictor) + unsigned char *predictor, int stride) { int i, r, c; @@ -50,7 +50,7 @@ void vp8_intra4x4_predict(BLOCKD *x, predictor[c] = expected_dc; } - predictor += 16; + predictor += stride; } } break; @@ -72,7 +72,7 @@ void vp8_intra4x4_predict(BLOCKD *x, predictor[c] = pred; } - predictor += 16; + predictor += stride; } } break; @@ -94,7 +94,7 @@ void vp8_intra4x4_predict(BLOCKD *x, predictor[c] = ap[c]; } - predictor += 16; + predictor += stride; } } @@ -117,29 +117,29 @@ void vp8_intra4x4_predict(BLOCKD *x, predictor[c] = lp[r]; } - predictor += 16; + predictor += stride; } } break; case B_LD_PRED: { unsigned char *ptr = Above; - predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; - predictor[0 * 16 + 1] = - predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; - predictor[0 * 16 + 2] = - predictor[1 * 16 + 1] = - predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; - predictor[0 * 16 + 3] = - predictor[1 * 16 + 2] = - predictor[2 * 16 + 1] = - predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; - predictor[1 * 16 + 3] = - predictor[2 * 16 + 2] = - predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; - predictor[2 * 16 + 3] = - predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; - predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; + predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; + predictor[0 * 
stride + 1] = + predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; + predictor[0 * stride + 2] = + predictor[1 * stride + 1] = + predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; + predictor[0 * stride + 3] = + predictor[1 * stride + 2] = + predictor[2 * stride + 1] = + predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; + predictor[1 * stride + 3] = + predictor[2 * stride + 2] = + predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; + predictor[2 * stride + 3] = + predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; + predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; } break; @@ -158,22 +158,22 @@ void vp8_intra4x4_predict(BLOCKD *x, pp[7] = Above[2]; pp[8] = Above[3]; - predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[3 * 16 + 1] = - predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[3 * 16 + 2] = - predictor[2 * 16 + 1] = - predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[3 * 16 + 3] = - predictor[2 * 16 + 2] = - predictor[1 * 16 + 1] = - predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * 16 + 3] = - predictor[1 * 16 + 2] = - predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[1 * 16 + 3] = - predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[3 * stride + 1] = + predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[3 * stride + 2] = + predictor[2 * stride + 1] = + predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * stride + 3] = + predictor[2 * stride + 2] = + predictor[1 * stride + 1] = + predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * stride + 3] = + predictor[1 * stride + 2] = + predictor[0 * 
stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[1 * stride + 3] = + predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; } break; @@ -193,22 +193,22 @@ void vp8_intra4x4_predict(BLOCKD *x, pp[8] = Above[3]; - predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[3 * 16 + 1] = - predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * 16 + 1] = - predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1; - predictor[3 * 16 + 2] = - predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[2 * 16 + 2] = - predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1; - predictor[3 * 16 + 3] = - predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - predictor[2 * 16 + 3] = - predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1; - predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; - predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1; + predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * stride + 1] = + predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * stride + 1] = + predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1; + predictor[3 * stride + 2] = + predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[2 * stride + 2] = + predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1; + predictor[3 * stride + 3] = + predictor[1 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[2 * stride + 3] = + predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1; + predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1; } break; @@ -217,22 +217,22 @@ void vp8_intra4x4_predict(BLOCKD *x, unsigned char *pp = Above; - predictor[0 * 16 + 0] = (pp[0] + pp[1] 
+ 1) >> 1; - predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[2 * 16 + 0] = - predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1; - predictor[1 * 16 + 1] = - predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 1] = - predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1; - predictor[3 * 16 + 1] = - predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[0 * 16 + 3] = - predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1; - predictor[1 * 16 + 3] = - predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * stride + 0] = + predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1; + predictor[1 * stride + 1] = + predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * stride + 1] = + predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1; + predictor[3 * stride + 1] = + predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[0 * stride + 3] = + predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * stride + 3] = + predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; } break; @@ -250,22 +250,22 @@ void vp8_intra4x4_predict(BLOCKD *x, pp[8] = Above[3]; - predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[2 * 16 + 0] = - predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1; - predictor[2 * 16 + 1] = - predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 2] = - predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; - predictor[2 * 16 + 3] = - 
predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[1 * 16 + 2] = - predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1; - predictor[1 * 16 + 3] = - predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * stride + 0] = + predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1; + predictor[2 * stride + 1] = + predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * stride + 2] = + predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[2 * stride + 3] = + predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[1 * stride + 2] = + predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * stride + 3] = + predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; } break; @@ -273,28 +273,33 @@ void vp8_intra4x4_predict(BLOCKD *x, case B_HU_PRED: { unsigned char *pp = Left; - predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[0 * 16 + 2] = - predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1; - predictor[0 * 16 + 3] = - predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[1 * 16 + 2] = - predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; - predictor[1 * 16 + 3] = - predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 2] = - predictor[2 * 16 + 3] = - predictor[3 * 16 + 0] = - predictor[3 * 16 + 1] = - predictor[3 * 16 + 2] = - predictor[3 * 16 + 3] = pp[3]; + predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[0 * stride + 1] = (pp[0] + pp[1] * 
2 + pp[2] + 2) >> 2; + predictor[0 * stride + 2] = + predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1; + predictor[0 * stride + 3] = + predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[1 * stride + 2] = + predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[1 * stride + 3] = + predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; + predictor[2 * stride + 2] = + predictor[2 * stride + 3] = + predictor[3 * stride + 0] = + predictor[3 * stride + 1] = + predictor[3 * stride + 2] = + predictor[3 * stride + 3] = pp[3]; } break; } } + + + + + /* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and * to the right prediction have filled in pixels to use. */ diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h index f6e568cdc..f9e3a794d 100644 --- a/vp8/common/x86/idct_x86.h +++ b/vp8/common/x86/idct_x86.h @@ -20,7 +20,6 @@ */ #if HAVE_MMX -extern prototype_idct(vp8_short_idct4x4llm_1_mmx); extern prototype_idct(vp8_short_idct4x4llm_mmx); extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx); @@ -28,9 +27,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_mmx); extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_idct_idct1 -#define vp8_idct_idct1 vp8_short_idct4x4llm_1_mmx - #undef vp8_idct_idct16 #define vp8_idct_idct16 vp8_short_idct4x4llm_mmx diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm index 465626b8f..0c9c205c2 100644 --- a/vp8/common/x86/idctllm_mmx.asm +++ b/vp8/common/x86/idctllm_mmx.asm @@ -32,249 +32,252 @@ ; **************************************************************************/ -;void short_idct4x4llm_mmx(short *input, short *output, int pitch) +;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, +;int pitch, unsigned char *dest,int stride) global sym(vp8_short_idct4x4llm_mmx) sym(vp8_short_idct4x4llm_mmx): push rbp mov rbp, rsp - 
SHADOW_ARGS_TO_STACK 3 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx + push rsi + push rdi ; end prolog - mov rax, arg(0) ;input - mov rdx, arg(1) ;output - - movq mm0, [rax ] - movq mm1, [rax+ 8] - - movq mm2, [rax+16] - movq mm3, [rax+24] - - movsxd rax, dword ptr arg(2) ;pitch - - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; - - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 - - pmulhw mm5, [GLOBAL(x_s1sqr2)] ; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + mov rax, arg(0) ;input + mov rsi, arg(1) ;pred - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ; + movq mm0, [rax ] + movq mm1, [rax+ 8] + movq mm2, [rax+16] + movq mm3, [rax+24] - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 +%if 0 + pxor mm7, mm7 + movq [rax], mm7 + movq [rax+8], mm7 + movq [rax+16],mm7 + movq [rax+24],mm7 +%endif + movsxd rax, dword ptr arg(2) ;pitch + mov rdx, arg(3) ;dest + movsxd rdi, dword ptr arg(4) ;stride - movq mm5, mm1 - movq mm4, mm3 - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 - paddw mm3, mm5 ; d1 - movq mm6, mm2 ; a1 + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 - psubw mm6, mm3 ;3 + movq mm5, mm1 + movq mm4, mm3 - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 + paddw mm3, mm5 ; d1 + movq mm6, mm2 ; a1 - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 + movq mm4, mm0 ; b1 + paddw mm2, mm3 ;0 - punpckldq mm0, mm3 ; 30 20 10 
00 - punpckhdq mm1, mm3 ; 31 21 11 01 + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 + psubw mm6, mm3 ;3 - movq mm3, mm5 ; 33 23 13 03 + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 - psubw mm0, mm2 ; b1= 0-2 - paddw mm2, mm2 ; + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 - movq mm5, mm1 - paddw mm2, mm0 ; a1 =0+2 + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 - pmulhw mm5, [GLOBAL(x_s1sqr2)] ; - paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 - movq mm7, mm3 ; - pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ; + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 - paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw mm7, mm5 ; c1 + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 - movq mm5, mm1 - movq mm4, mm3 + movq mm3, mm5 ; 33 23 13 03 - pmulhw mm5, [GLOBAL(x_c1sqr2less1)] - paddw mm5, mm1 + psubw mm0, mm2 ; b1= 0-2 + paddw mm2, mm2 ; - pmulhw mm3, [GLOBAL(x_s1sqr2)] - paddw mm3, mm4 + movq mm5, mm1 + paddw mm2, mm0 ; a1 =0+2 - paddw mm3, mm5 ; d1 - paddw mm0, [GLOBAL(fours)] + pmulhw mm5, [GLOBAL(x_s1sqr2)]; + paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) - paddw mm2, [GLOBAL(fours)] - movq mm6, mm2 ; a1 + movq mm7, mm3 ; + pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; - movq mm4, mm0 ; b1 - paddw mm2, mm3 ;0 + paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw mm7, mm5 ; c1 - paddw mm4, mm7 ;1 - psubw mm0, mm7 ;2 + movq mm5, mm1 + movq mm4, mm3 - psubw mm6, mm3 ;3 - psraw mm2, 3 + pmulhw mm5, [GLOBAL(x_c1sqr2less1)] + paddw mm5, mm1 - psraw mm0, 3 - psraw mm4, 3 + pmulhw mm3, [GLOBAL(x_s1sqr2)] + paddw mm3, mm4 - psraw mm6, 3 + paddw mm3, mm5 ; d1 + paddw mm0, [GLOBAL(fours)] - movq mm1, mm2 ; 03 02 01 00 - movq mm3, mm4 ; 23 22 21 20 + paddw mm2, [GLOBAL(fours)] + movq mm6, mm2 ; a1 - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm2, mm0 ; 13 03 12 02 + movq mm4, mm0 ; 
b1 + paddw mm2, mm3 ;0 - punpcklwd mm3, mm6 ; 31 21 30 20 - punpckhwd mm4, mm6 ; 33 23 32 22 + paddw mm4, mm7 ;1 + psubw mm0, mm7 ;2 - movq mm0, mm1 ; 11 01 10 00 - movq mm5, mm2 ; 13 03 12 02 + psubw mm6, mm3 ;3 + psraw mm2, 3 - punpckldq mm0, mm3 ; 30 20 10 00 - punpckhdq mm1, mm3 ; 31 21 11 01 + psraw mm0, 3 + psraw mm4, 3 - punpckldq mm2, mm4 ; 32 22 12 02 - punpckhdq mm5, mm4 ; 33 23 13 03 + psraw mm6, 3 - movq [rdx], mm0 + movq mm1, mm2 ; 03 02 01 00 + movq mm3, mm4 ; 23 22 21 20 - movq [rdx+rax], mm1 - movq [rdx+rax*2], mm2 + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm2, mm0 ; 13 03 12 02 - add rdx, rax - movq [rdx+rax*2], mm5 + punpcklwd mm3, mm6 ; 31 21 30 20 + punpckhwd mm4, mm6 ; 33 23 32 22 - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch) -global sym(vp8_short_idct4x4llm_1_mmx) -sym(vp8_short_idct4x4llm_1_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - ; end prolog + movq mm0, mm1 ; 11 01 10 00 + movq mm5, mm2 ; 13 03 12 02 - mov rax, arg(0) ;input - movd mm0, [rax] + punpckldq mm0, mm3 ; 30 20 10 00 + punpckhdq mm1, mm3 ; 31 21 11 01 - paddw mm0, [GLOBAL(fours)] - mov rdx, arg(1) ;output + punpckldq mm2, mm4 ; 32 22 12 02 + punpckhdq mm5, mm4 ; 33 23 13 03 - psraw mm0, 3 - movsxd rax, dword ptr arg(2) ;pitch + pxor mm7, mm7 - punpcklwd mm0, mm0 - punpckldq mm0, mm0 + movd mm4, [rsi] + punpcklbw mm4, mm7 + paddsw mm0, mm4 + packuswb mm0, mm7 + movd [rdx], mm0 - movq [rdx], mm0 - movq [rdx+rax], mm0 + movd mm4, [rsi+rax] + punpcklbw mm4, mm7 + paddsw mm1, mm4 + packuswb mm1, mm7 + movd [rdx+rdi], mm1 - movq [rdx+rax*2], mm0 - add rdx, rax + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm2, mm4 + packuswb mm2, mm7 + movd [rdx+rdi*2], mm2 - movq [rdx+rax*2], mm0 + add rdx, rdi + add rsi, rax + movd mm4, [rsi+2*rax] + punpcklbw mm4, mm7 + paddsw mm5, mm4 + packuswb mm5, mm7 + movd [rdx+rdi*2], mm5 ; begin epilog + pop rdi + pop rsi 
RESTORE_GOT UNSHADOW_ARGS pop rbp ret -;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride) +;void vp8_dc_only_idct_add_mmx( +;short input_dc, +;unsigned char *pred_ptr, +;int pred_stride, +;unsigned char *dst_ptr, +;int stride) global sym(vp8_dc_only_idct_add_mmx) sym(vp8_dc_only_idct_add_mmx): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 GET_GOT rbx - push rsi - push rdi ; end prolog - mov rsi, arg(1) ;s -- prediction - mov rdi, arg(2) ;d -- destination - movsxd rax, dword ptr arg(4) ;stride - movsxd rdx, dword ptr arg(3) ;pitch - pxor mm0, mm0 - movd mm5, arg(0) ;input_dc + mov rax, arg(1) ;pred_ptr + movsxd rdx, dword ptr arg(2) ;pred_stride + + pxor mm0, mm0 paddw mm5, [GLOBAL(fours)] + lea rcx, [rdx + rdx*2] psraw mm5, 3 punpcklwd mm5, mm5 + punpckldq mm5, mm5 - movd mm1, [rsi] + movd mm1, [rax] + movd mm2, [rax+rdx] + movd mm3, [rax+2*rdx] + movd mm4, [rax+rcx] + + mov rax, arg(3) ;d -- destination + movsxd rdx, dword ptr arg(4) ;dst_stride + punpcklbw mm1, mm0 paddsw mm1, mm5 packuswb mm1, mm0 ; pack and unpack to saturate - movd [rdi], mm1 + lea rcx, [rdx + rdx*2] - movd mm2, [rsi+rdx] punpcklbw mm2, mm0 paddsw mm2, mm5 packuswb mm2, mm0 ; pack and unpack to saturate - movd [rdi+rax], mm2 - movd mm3, [rsi+2*rdx] punpcklbw mm3, mm0 paddsw mm3, mm5 packuswb mm3, mm0 ; pack and unpack to saturate - movd [rdi+2*rax], mm3 - add rdi, rax - add rsi, rdx - movd mm4, [rsi+2*rdx] punpcklbw mm4, mm0 paddsw mm4, mm5 packuswb mm4, mm0 ; pack and unpack to saturate - movd [rdi+2*rax], mm4 + + movd [rax], mm1 + movd [rax+rdx], mm2 + movd [rax+2*rdx], mm3 + movd [rax+rcx], mm4 ; begin epilog - pop rdi - pop rsi RESTORE_GOT UNSHADOW_ARGS pop rbp diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm index 83d3765ff..abeb0b682 100644 --- a/vp8/common/x86/idctllm_sse2.asm +++ b/vp8/common/x86/idctllm_sse2.asm @@ -15,17 +15,15 @@ ; ( ; short *qcoeff - 0 ; short *dequant - 1 -; unsigned 
char *pre - 2 -; unsigned char *dst - 3 -; int dst_stride - 4 -; int blk_stride - 5 +; unsigned char *dst - 2 +; int dst_stride - 3 ; ) global sym(vp8_idct_dequant_0_2x_sse2) sym(vp8_idct_dequant_0_2x_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 4 GET_GOT rbx ; end prolog @@ -47,19 +45,20 @@ sym(vp8_idct_dequant_0_2x_sse2): movd [rax], xmm5 movd [rax+32], xmm5 ;pshufb + mov rax, arg(2) ; dst + movsxd rdx, dword ptr arg(3) ; dst_stride + pshuflw xmm4, xmm4, 00000000b pshufhw xmm4, xmm4, 00000000b - mov rax, arg(2) ; pre + lea rcx, [rdx + rdx*2] paddw xmm4, [GLOBAL(fours)] - movsxd rcx, dword ptr arg(5) ; blk_stride psraw xmm4, 3 movq xmm0, [rax] - movq xmm1, [rax+rcx] - movq xmm2, [rax+2*rcx] - lea rcx, [3*rcx] + movq xmm1, [rax+rdx] + movq xmm2, [rax+2*rdx] movq xmm3, [rax+rcx] punpcklbw xmm0, xmm5 @@ -67,8 +66,6 @@ sym(vp8_idct_dequant_0_2x_sse2): punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 - mov rax, arg(3) ; dst - movsxd rdx, dword ptr arg(4) ; dst_stride ; Add to predict buffer paddw xmm0, xmm4 @@ -97,11 +94,18 @@ sym(vp8_idct_dequant_0_2x_sse2): pop rbp ret +;void vp8_idct_dequant_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; ) global sym(vp8_idct_dequant_full_2x_sse2) sym(vp8_idct_dequant_full_2x_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 + SHADOW_ARGS_TO_STACK 4 SAVE_XMM 7 GET_GOT rbx push rsi @@ -111,14 +115,13 @@ sym(vp8_idct_dequant_full_2x_sse2): ; special case when 2 blocks have 0 or 1 coeffs ; dc is set as first coeff, so no need to load qcoeff mov rax, arg(0) ; qcoeff - mov rsi, arg(2) ; pre - mov rdi, arg(3) ; dst - movsxd rcx, dword ptr arg(5) ; blk_stride + mov rdx, arg(1) ; dequant + mov rdi, arg(2) ; dst + ; Zero out xmm7, for use unpacking pxor xmm7, xmm7 - mov rdx, arg(1) ; dequant ; note the transpose of xmm1 and xmm2, necessary for shuffle ; to spit out sensicle data @@ -138,6 +141,7 @@ sym(vp8_idct_dequant_full_2x_sse2): pmullw xmm2, 
[rdx+16] pmullw xmm1, [rdx] pmullw xmm3, [rdx+16] + movsxd rdx, dword ptr arg(3) ; dst_stride ; repack so block 0 row x and block 1 row x are together movdqa xmm4, xmm0 @@ -162,6 +166,7 @@ sym(vp8_idct_dequant_full_2x_sse2): paddw xmm2, xmm0 ; a1 = 0+2 pmulhw xmm5, [GLOBAL(x_s1sqr2)] + lea rcx, [rdx + rdx*2] ;dst_stride * 3 paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) movdqa xmm7, xmm3 @@ -304,8 +309,8 @@ sym(vp8_idct_dequant_full_2x_sse2): pxor xmm7, xmm7 ; Load up predict blocks - movq xmm4, [rsi] - movq xmm5, [rsi+rcx] + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] punpcklbw xmm4, xmm7 punpcklbw xmm5, xmm7 @@ -313,9 +318,8 @@ sym(vp8_idct_dequant_full_2x_sse2): paddw xmm0, xmm4 paddw xmm1, xmm5 - movq xmm4, [rsi+2*rcx] - lea rcx, [3*rcx] - movq xmm5, [rsi+rcx] + movq xmm4, [rdi+2*rdx] + movq xmm5, [rdi+rcx] punpcklbw xmm4, xmm7 punpcklbw xmm5, xmm7 @@ -331,18 +335,11 @@ sym(vp8_idct_dequant_full_2x_sse2): packuswb xmm2, xmm7 packuswb xmm3, xmm7 - ; Load destination stride before writing out, - ; doesn't need to persist - movsxd rdx, dword ptr arg(4) ; dst_stride - ; store blocks back out movq [rdi], xmm0 movq [rdi + rdx], xmm1 - - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm2 - movq [rdi + rdx], xmm3 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 ; begin epilog pop rdi @@ -357,27 +354,25 @@ sym(vp8_idct_dequant_full_2x_sse2): ; ( ; short *qcoeff - 0 ; short *dequant - 1 -; unsigned char *pre - 2 -; unsigned char *dst - 3 -; int dst_stride - 4 -; short *dc - 5 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 ; ) global sym(vp8_idct_dequant_dc_0_2x_sse2) sym(vp8_idct_dequant_dc_0_2x_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx - push rsi push rdi ; end prolog ; special case when 2 blocks have 0 or 1 coeffs ; dc is set as first coeff, so no need to load qcoeff mov rax, arg(0) ; qcoeff - mov rsi, arg(2) ; pre - mov rdi, arg(3) ; dst - mov rdx, arg(5) ; dc + + mov rdi, arg(2) ; dst + mov rdx, arg(4) ; dc ; 
Zero out xmm5, for use unpacking pxor xmm5, xmm5 @@ -385,11 +380,13 @@ sym(vp8_idct_dequant_dc_0_2x_sse2): ; load up 2 dc words here == 2*16 = doubleword movd xmm4, [rdx] + movsxd rdx, dword ptr arg(3) ; dst_stride + lea rcx, [rdx + rdx*2] ; Load up predict blocks - movq xmm0, [rsi] - movq xmm1, [rsi+16] - movq xmm2, [rsi+32] - movq xmm3, [rsi+48] + movq xmm0, [rdi] + movq xmm1, [rdi+rdx*1] + movq xmm2, [rdi+rdx*2] + movq xmm3, [rdi+rcx] ; Duplicate and expand dc across punpcklwd xmm4, xmm4 @@ -417,48 +414,46 @@ sym(vp8_idct_dequant_dc_0_2x_sse2): packuswb xmm2, xmm5 packuswb xmm3, xmm5 - ; Load destination stride before writing out, - ; doesn't need to persist - movsxd rdx, dword ptr arg(4) ; dst_stride - ; store blocks back out movq [rdi], xmm0 movq [rdi + rdx], xmm1 - - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm2 - movq [rdi + rdx], xmm3 + movq [rdi + rdx*2], xmm2 + movq [rdi + rcx], xmm3 ; begin epilog pop rdi - pop rsi RESTORE_GOT UNSHADOW_ARGS pop rbp ret - +;void vp8_idct_dequant_dc_full_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *dst - 2 +; int dst_stride - 3 +; short *dc - 4 +; ) global sym(vp8_idct_dequant_dc_full_2x_sse2) sym(vp8_idct_dequant_dc_full_2x_sse2): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 + SHADOW_ARGS_TO_STACK 5 SAVE_XMM 7 GET_GOT rbx - push rsi push rdi ; end prolog ; special case when 2 blocks have 0 or 1 coeffs ; dc is set as first coeff, so no need to load qcoeff mov rax, arg(0) ; qcoeff - mov rsi, arg(2) ; pre - mov rdi, arg(3) ; dst + mov rdx, arg(1) ; dequant + + mov rdi, arg(2) ; dst ; Zero out xmm7, for use unpacking pxor xmm7, xmm7 - mov rdx, arg(1) ; dequant ; note the transpose of xmm1 and xmm2, necessary for shuffle ; to spit out sensicle data @@ -480,7 +475,7 @@ sym(vp8_idct_dequant_dc_full_2x_sse2): pmullw xmm3, [rdx+16] ; DC component - mov rdx, arg(5) + mov rdx, arg(4) ; repack so block 0 row x and block 1 row x are together movdqa xmm4, xmm0 @@ -651,8 +646,10 @@ 
sym(vp8_idct_dequant_dc_full_2x_sse2): pxor xmm7, xmm7 ; Load up predict blocks - movq xmm4, [rsi] - movq xmm5, [rsi+16] + movsxd rdx, dword ptr arg(3) ; dst_stride + movq xmm4, [rdi] + movq xmm5, [rdi+rdx] + lea rcx, [rdx + rdx*2] punpcklbw xmm4, xmm7 punpcklbw xmm5, xmm7 @@ -660,8 +657,8 @@ sym(vp8_idct_dequant_dc_full_2x_sse2): paddw xmm0, xmm4 paddw xmm1, xmm5 - movq xmm4, [rsi+32] - movq xmm5, [rsi+48] + movq xmm4, [rdi+rdx*2] + movq xmm5, [rdi+rcx] punpcklbw xmm4, xmm7 punpcklbw xmm5, xmm7 @@ -679,7 +676,7 @@ sym(vp8_idct_dequant_dc_full_2x_sse2): ; Load destination stride before writing out, ; doesn't need to persist - movsxd rdx, dword ptr arg(4) ; dst_stride + movsxd rdx, dword ptr arg(3) ; dst_stride ; store blocks back out movq [rdi], xmm0 @@ -693,7 +690,6 @@ sym(vp8_idct_dequant_dc_full_2x_sse2): ; begin epilog pop rdi - pop rsi RESTORE_GOT RESTORE_XMM UNSHADOW_ARGS diff --git a/vp8/common/x86/recon_mmx.asm b/vp8/common/x86/recon_mmx.asm index e7211fccb..19c0faf3f 100644 --- a/vp8/common/x86/recon_mmx.asm +++ b/vp8/common/x86/recon_mmx.asm @@ -10,53 +10,6 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride) -global sym(vp8_recon_b_mmx) -sym(vp8_recon_b_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;s - mov rdi, arg(2) ;d - mov rdx, arg(1) ;q - movsxd rax, dword ptr arg(3) ;stride - pxor mm0, mm0 - - movd mm1, [rsi] - punpcklbw mm1, mm0 - paddsw mm1, [rdx] - packuswb mm1, mm0 ; pack and unpack to saturate - movd [rdi], mm1 - - movd mm2, [rsi+16] - punpcklbw mm2, mm0 - paddsw mm2, [rdx+32] - packuswb mm2, mm0 ; pack and unpack to saturate - movd [rdi+rax], mm2 - - movd mm3, [rsi+32] - punpcklbw mm3, mm0 - paddsw mm3, [rdx+64] - packuswb mm3, mm0 ; pack and unpack to saturate - movd [rdi+2*rax], mm3 - - add rdi, rax - movd mm4, [rsi+48] - punpcklbw mm4, mm0 - paddsw mm4, [rdx+96] - packuswb mm4, mm0 ; pack and unpack 
to saturate - movd [rdi+2*rax], mm4 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret ;void copy_mem8x8_mmx( diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index f54cc4e7e..a82c1b4fd 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -10,121 +10,6 @@ %include "vpx_ports/x86_abi_support.asm" -;void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride) -global sym(vp8_recon2b_sse2) -sym(vp8_recon2b_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;s - mov rdi, arg(2) ;d - mov rdx, arg(1) ;q - movsxd rax, dword ptr arg(3) ;stride - pxor xmm0, xmm0 - - movq xmm1, MMWORD PTR [rsi] - punpcklbw xmm1, xmm0 - paddsw xmm1, XMMWORD PTR [rdx] - packuswb xmm1, xmm0 ; pack and unpack to saturate - movq MMWORD PTR [rdi], xmm1 - - - movq xmm2, MMWORD PTR [rsi+8] - punpcklbw xmm2, xmm0 - paddsw xmm2, XMMWORD PTR [rdx+16] - packuswb xmm2, xmm0 ; pack and unpack to saturate - movq MMWORD PTR [rdi+rax], xmm2 - - - movq xmm3, MMWORD PTR [rsi+16] - punpcklbw xmm3, xmm0 - paddsw xmm3, XMMWORD PTR [rdx+32] - packuswb xmm3, xmm0 ; pack and unpack to saturate - movq MMWORD PTR [rdi+rax*2], xmm3 - - add rdi, rax - movq xmm4, MMWORD PTR [rsi+24] - punpcklbw xmm4, xmm0 - paddsw xmm4, XMMWORD PTR [rdx+48] - packuswb xmm4, xmm0 ; pack and unpack to saturate - movq MMWORD PTR [rdi+rax*2], xmm4 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride) -global sym(vp8_recon4b_sse2) -sym(vp8_recon4b_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;s - mov rdi, arg(2) ;d - mov rdx, arg(1) ;q - movsxd rax, dword ptr arg(3) ;stride - pxor xmm0, xmm0 - - movdqa xmm1, XMMWORD PTR [rsi] - movdqa xmm5, xmm1 - punpcklbw xmm1, xmm0 - punpckhbw xmm5, xmm0 - paddsw xmm1, 
XMMWORD PTR [rdx] - paddsw xmm5, XMMWORD PTR [rdx+16] - packuswb xmm1, xmm5 ; pack and unpack to saturate - movdqa XMMWORD PTR [rdi], xmm1 - - - movdqa xmm2, XMMWORD PTR [rsi+16] - movdqa xmm6, xmm2 - punpcklbw xmm2, xmm0 - punpckhbw xmm6, xmm0 - paddsw xmm2, XMMWORD PTR [rdx+32] - paddsw xmm6, XMMWORD PTR [rdx+48] - packuswb xmm2, xmm6 ; pack and unpack to saturate - movdqa XMMWORD PTR [rdi+rax], xmm2 - - - movdqa xmm3, XMMWORD PTR [rsi+32] - movdqa xmm7, xmm3 - punpcklbw xmm3, xmm0 - punpckhbw xmm7, xmm0 - paddsw xmm3, XMMWORD PTR [rdx+64] - paddsw xmm7, XMMWORD PTR [rdx+80] - packuswb xmm3, xmm7 ; pack and unpack to saturate - movdqa XMMWORD PTR [rdi+rax*2], xmm3 - - add rdi, rax - movdqa xmm4, XMMWORD PTR [rsi+48] - movdqa xmm5, xmm4 - punpcklbw xmm4, xmm0 - punpckhbw xmm5, xmm0 - paddsw xmm4, XMMWORD PTR [rdx+96] - paddsw xmm5, XMMWORD PTR [rdx+112] - packuswb xmm4, xmm5 ; pack and unpack to saturate - movdqa XMMWORD PTR [rdi+rax*2], xmm4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - ;void copy_mem16x16_sse2( ; unsigned char *src, diff --git a/vp8/common/x86/recon_x86.h b/vp8/common/x86/recon_x86.h index fe0f8f0bc..fbb3dcb63 100644 --- a/vp8/common/x86/recon_x86.h +++ b/vp8/common/x86/recon_x86.h @@ -20,16 +20,12 @@ */ #if HAVE_MMX -extern prototype_recon_block(vp8_recon_b_mmx); extern prototype_copy_block(vp8_copy_mem8x8_mmx); extern prototype_copy_block(vp8_copy_mem8x4_mmx); extern prototype_copy_block(vp8_copy_mem16x16_mmx); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_recon_recon -#define vp8_recon_recon vp8_recon_b_mmx - #undef vp8_recon_copy8x8 #define vp8_recon_copy8x8 vp8_copy_mem8x8_mmx @@ -43,19 +39,11 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx); #endif #if HAVE_SSE2 -extern prototype_recon_block(vp8_recon2b_sse2); -extern prototype_recon_block(vp8_recon4b_sse2); extern prototype_copy_block(vp8_copy_mem16x16_sse2); extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2); extern 
prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2); #if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_recon_recon2 -#define vp8_recon_recon2 vp8_recon2b_sse2 - -#undef vp8_recon_recon4 -#define vp8_recon_recon4 vp8_recon4b_sse2 - #undef vp8_recon_copy16x16 #define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2 diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index 33a984b79..c4e616a67 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -37,7 +37,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) if (flags & HAS_MMX) { - rtcd->idct.idct1 = vp8_short_idct4x4llm_1_mmx; rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx; rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx; @@ -45,7 +44,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) - rtcd->recon.recon = vp8_recon_b_mmx; rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx; rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx; rtcd->recon.copy16x16 = vp8_copy_mem16x16_mmx; @@ -81,8 +79,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) if (flags & HAS_SSE2) { - rtcd->recon.recon2 = vp8_recon2b_sse2; - rtcd->recon.recon4 = vp8_recon4b_sse2; rtcd->recon.copy16x16 = vp8_copy_mem16x16_sse2; rtcd->recon.build_intra_predictors_mbuv = vp8_build_intra_predictors_mbuv_sse2; diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm index 6bebda24f..19f94e089 100644 --- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm +++ b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm @@ -12,21 +12,19 @@ AREA |.text|, CODE, READONLY -;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred, -; unsigned char *dest, int pitch, int stride, int Dc) +;void vp8_dequant_dc_idct_v6(short *input, short *dq, +; unsigned char *dest, int stride, int Dc) ; r0 = input ; r1 = dq -; r2 = pred -; r3 = dest -; sp + 36 = pitch ; +4 = 40 -; sp + 40 = stride ; +4 = 44 -; sp + 44 = 
Dc ; +4 = 48 +; r2 = dst +; r3 = stride +; sp + 36 = Dc |vp8_dequant_dc_idct_add_v6| PROC stmdb sp!, {r4-r11, lr} - ldr r6, [sp, #44] + ldr r6, [sp, #36] ldr r4, [r0] ;input ldr r5, [r1], #4 ;dq @@ -149,7 +147,7 @@ vp8_dequant_dc_idct_loop2_v6 usub16 r1, r12, r8 uadd16 r8, r11, r6 ldr r9, c0x00040004 - ldr r12, [sp, #40] + ldr r12, [sp] ; get stride from stack uadd16 r6, r10, r8 usub16 r7, r10, r8 uadd16 r7, r7, r9 @@ -158,7 +156,7 @@ vp8_dequant_dc_idct_loop2_v6 usub16 r1, r14, r1 uadd16 r10, r10, r9 uadd16 r1, r1, r9 - ldr r11, [r2], r12 + ldr r11, [r2] ; load input from dst mov r8, r7, asr #3 pkhtb r9, r8, r10, asr #19 mov r8, r1, asr #3 @@ -170,9 +168,7 @@ vp8_dequant_dc_idct_loop2_v6 usat16 r9, #8, r9 usat16 r8, #8, r8 orr r9, r8, r9, lsl #8 - ldr r11, [r2], r12 - ldr lr, [sp] - ldr r12, [sp, #44] + ldr r11, [r2, r12] ; load input from dst mov r7, r7, lsl #16 mov r1, r1, lsl #16 mov r10, r10, lsl #16 @@ -188,9 +184,8 @@ vp8_dequant_dc_idct_loop2_v6 usat16 r7, #8, r7 usat16 r1, #8, r1 orr r1, r1, r7, lsl #8 - str r9, [lr], r12 - str r1, [lr], r12 - str lr, [sp] + str r9, [r2], r12 ; store output to dst + str r1, [r2], r12 ; store output to dst bne vp8_dequant_dc_idct_loop2_v6 ; vpx_memset diff --git a/vp8/decoder/arm/armv6/dequant_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_idct_v6.asm index 47b671ca6..2510ad838 100644 --- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm +++ b/vp8/decoder/arm/armv6/dequant_idct_v6.asm @@ -10,15 +10,12 @@ EXPORT |vp8_dequant_idct_add_v6| AREA |.text|, CODE, READONLY -;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred, -; unsigned char *dest, int pitch, int stride) -; r0 = input +;void vp8_dequant_idct_v6(short *input, short *dq, +; unsigned char *dest, int stride) +; r0 = q ; r1 = dq -; r2 = pred -; r3 = dest -; sp + 36 = pitch ; +4 = 40 -; sp + 40 = stride ; +4 = 44 - +; r2 = dst +; r3 = stride |vp8_dequant_idct_add_v6| PROC stmdb sp!, {r4-r11, lr} @@ -127,7 +124,7 @@ vp8_dequant_idct_loop2_v6 usub16 r1, r12, r8 
uadd16 r8, r11, r6 ldr r9, c0x00040004 - ldr r12, [sp, #40] + ldr r12, [sp] ; get stride from stack uadd16 r6, r10, r8 usub16 r7, r10, r8 uadd16 r7, r7, r9 @@ -136,7 +133,7 @@ vp8_dequant_idct_loop2_v6 usub16 r1, r14, r1 uadd16 r10, r10, r9 uadd16 r1, r1, r9 - ldr r11, [r2], r12 + ldr r11, [r2] ; load input from dst mov r8, r7, asr #3 pkhtb r9, r8, r10, asr #19 mov r8, r1, asr #3 @@ -148,9 +145,7 @@ vp8_dequant_idct_loop2_v6 usat16 r9, #8, r9 usat16 r8, #8, r8 orr r9, r8, r9, lsl #8 - ldr r11, [r2], r12 - ldr lr, [sp] - ldr r12, [sp, #44] + ldr r11, [r2, r12] ; load input from dst mov r7, r7, lsl #16 mov r1, r1, lsl #16 mov r10, r10, lsl #16 @@ -166,9 +161,8 @@ vp8_dequant_idct_loop2_v6 usat16 r7, #8, r7 usat16 r1, #8, r1 orr r1, r1, r7, lsl #8 - str r9, [lr], r12 - str r1, [lr], r12 - str lr, [sp] + str r9, [r2], r12 ; store output to dst + str r1, [r2], r12 ; store output to dst bne vp8_dequant_idct_loop2_v6 ; vpx_memset diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/decoder/arm/armv6/idct_blk_v6.c index 5c7592f35..686bb737f 100644 --- a/vp8/decoder/arm/armv6/idct_blk_v6.c +++ b/vp8/decoder/arm/armv6/idct_blk_v6.c @@ -12,115 +12,121 @@ #include "vp8/common/idct.h" #include "vp8/decoder/dequantize.h" -void vp8_dequant_dc_idct_add_y_block_v6 - (short *q, short *dq, unsigned char *pre, - unsigned char *dst, int stride, char *eobs, short *dc) + +void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq, + unsigned char *dst, int stride, + char *eobs, short *dc) { int i; for (i = 0; i < 4; i++) { if (eobs[0] > 1) - vp8_dequant_dc_idct_add_v6 (q, dq, pre, dst, 16, stride, dc[0]); - else - vp8_dc_only_idct_add_v6 (dc[0], pre, dst, 16, stride); + vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]); + else if (eobs[0] == 1) + vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride); if (eobs[1] > 1) - vp8_dequant_dc_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride, dc[1]); - else - vp8_dc_only_idct_add_v6 (dc[1], pre+4, dst+4, 16, stride); + { + 
vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]); + } + else if (eobs[1] == 1) + vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride); if (eobs[2] > 1) - vp8_dequant_dc_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride, dc[2]); - else - vp8_dc_only_idct_add_v6 (dc[2], pre+8, dst+8, 16, stride); + { + vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]); + } + else if (eobs[2] == 1) + vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride); if (eobs[3] > 1) - vp8_dequant_dc_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride, dc[3]); - else - vp8_dc_only_idct_add_v6 (dc[3], pre+12, dst+12, 16, stride); + { + vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]); + } + else if (eobs[3] == 1) + vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride); q += 64; dc += 4; - pre += 64; dst += 4*stride; eobs += 4; } } -void vp8_dequant_idct_add_y_block_v6 - (short *q, short *dq, unsigned char *pre, - unsigned char *dst, int stride, char *eobs) +void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs) { int i; for (i = 0; i < 4; i++) { if (eobs[0] > 1) - vp8_dequant_idct_add_v6 (q, dq, pre, dst, 16, stride); - else + vp8_dequant_idct_add_v6 (q, dq, dst, stride); + else if (eobs[0] == 1) { - vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dst, 16, stride); + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride); ((int *)q)[0] = 0; } if (eobs[1] > 1) - vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride); - else + vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride); + else if (eobs[1] == 1) { - vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dst+4, 16, stride); + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride); ((int *)(q+16))[0] = 0; } if (eobs[2] > 1) - vp8_dequant_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride); - else + vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride); + else if (eobs[2] == 1) { - vp8_dc_only_idct_add_v6 (q[32]*dq[0], pre+8, dst+8, 16, 
stride); + vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride); ((int *)(q+32))[0] = 0; } if (eobs[3] > 1) - vp8_dequant_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride); - else + vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride); + else if (eobs[3] == 1) { - vp8_dc_only_idct_add_v6 (q[48]*dq[0], pre+12, dst+12, 16, stride); + vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride); ((int *)(q+48))[0] = 0; } q += 64; - pre += 64; dst += 4*stride; eobs += 4; } } -void vp8_dequant_idct_add_uv_block_v6 - (short *q, short *dq, unsigned char *pre, - unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs) { int i; for (i = 0; i < 2; i++) { if (eobs[0] > 1) - vp8_dequant_idct_add_v6 (q, dq, pre, dstu, 8, stride); - else + vp8_dequant_idct_add_v6 (q, dq, dstu, stride); + else if (eobs[0] == 1) { - vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstu, 8, stride); + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride); ((int *)q)[0] = 0; } if (eobs[1] > 1) - vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstu+4, 8, stride); - else + vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride); + else if (eobs[1] == 1) { - vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstu+4, 8, stride); + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride, + dstu+4, stride); ((int *)(q+16))[0] = 0; } q += 32; - pre += 32; dstu += 4*stride; eobs += 2; } @@ -128,23 +134,23 @@ void vp8_dequant_idct_add_uv_block_v6 for (i = 0; i < 2; i++) { if (eobs[0] > 1) - vp8_dequant_idct_add_v6 (q, dq, pre, dstv, 8, stride); - else + vp8_dequant_idct_add_v6 (q, dq, dstv, stride); + else if (eobs[0] == 1) { - vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstv, 8, stride); + vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride); ((int *)q)[0] = 0; } if (eobs[1] > 1) - vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstv+4, 8, stride); - else + 
vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride); + else if (eobs[1] == 1) { - vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstv+4, 8, stride); + vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride, + dstv+4, stride); ((int *)(q+16))[0] = 0; } q += 32; - pre += 32; dstv += 4*stride; eobs += 2; } diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h index b7d800d26..c020c8530 100644 --- a/vp8/decoder/arm/dequantize_arm.h +++ b/vp8/decoder/arm/dequantize_arm.h @@ -49,6 +49,7 @@ extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neo extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon); extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); + #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_dequant_block #define vp8_dequant_block vp8_dequantize_b_neon @@ -68,6 +69,7 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon); #undef vp8_dequant_idct_add_uv_block #define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon #endif + #endif #endif diff --git a/vp8/decoder/arm/neon/dequant_idct_neon.asm b/vp8/decoder/arm/neon/dequant_idct_neon.asm index 4bf661857..602cce676 100644 --- a/vp8/decoder/arm/neon/dequant_idct_neon.asm +++ b/vp8/decoder/arm/neon/dequant_idct_neon.asm @@ -15,25 +15,24 @@ PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred, -; unsigned char *dest, int pitch, int stride) +;void vp8_dequant_idct_add_neon(short *input, short *dq, +; unsigned char *dest, int stride) ; r0 short *input, ; r1 short *dq, -; r2 unsigned char *pred -; r3 unsigned char *dest -; sp int pitch -; sp+4 int stride +; r2 unsigned char *dest +; r3 int stride |vp8_dequant_idct_add_neon| PROC vld1.16 {q3, q4}, [r0] vld1.16 {q5, q6}, [r1] - ldr r1, [sp] ; pitch - vld1.32 {d14[0]}, [r2], r1 - vld1.32 {d14[1]}, [r2], r1 - vld1.32 {d15[0]}, [r2], r1 - vld1.32 {d15[1]}, [r2] - ldr r1, [sp, 
#4] ; stride + add r1, r2, r3 ; r1 = dest + stride + lsl r3, #1 ; 2x stride + + vld1.32 {d14[0]}, [r2], r3 + vld1.32 {d14[1]}, [r1], r3 + vld1.32 {d15[0]}, [r2] + vld1.32 {d15[1]}, [r1] adr r12, cospi8sqrt2minus1 ; pointer to the first constant @@ -110,13 +109,16 @@ vaddw.u8 q1, q1, d14 vaddw.u8 q2, q2, d15 + sub r2, r2, r3 + sub r1, r1, r3 + vqmovun.s16 d0, q1 vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r3], r1 - vst1.32 {d0[1]}, [r3], r1 - vst1.32 {d1[0]}, [r3], r1 - vst1.32 {d1[1]}, [r3] + vst1.32 {d0[0]}, [r2], r3 + vst1.32 {d0[1]}, [r1], r3 + vst1.32 {d1[0]}, [r2] + vst1.32 {d1[1]}, [r1] bx lr diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c index f31654060..086293114 100644 --- a/vp8/decoder/arm/neon/idct_blk_neon.c +++ b/vp8/decoder/arm/neon/idct_blk_neon.c @@ -15,101 +15,118 @@ /* place these declarations here because we don't want to maintain them * outside of this scope */ -void idct_dequant_dc_full_2x_neon - (short *input, short *dq, unsigned char *pre, unsigned char *dst, - int stride, short *dc); -void idct_dequant_dc_0_2x_neon - (short *dc, unsigned char *pre, unsigned char *dst, int stride); -void idct_dequant_full_2x_neon - (short *q, short *dq, unsigned char *pre, unsigned char *dst, - int pitch, int stride); -void idct_dequant_0_2x_neon - (short *q, short dq, unsigned char *pre, int pitch, - unsigned char *dst, int stride); - -void vp8_dequant_dc_idct_add_y_block_neon - (short *q, short *dq, unsigned char *pre, - unsigned char *dst, int stride, char *eobs, short *dc) +void idct_dequant_dc_full_2x_neon(short *input, short *dq, + unsigned char *dst, + int stride, short *dc); +void idct_dequant_dc_0_2x_neon(short *input, short *dq, + unsigned char *dst, + int stride, short *dc); +void idct_dequant_full_2x_neon(short *q, short *dq, + unsigned char *dst, int stride); +void idct_dequant_0_2x_neon(short *q, short dq, + unsigned char *dst, int stride); + +void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq, + 
unsigned char *dst, + int stride, char *eobs, short *dc) { int i; for (i = 0; i < 4; i++) { - if (((short *)eobs)[0] & 0xfefe) - idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc); - else - idct_dequant_dc_0_2x_neon(dc, pre, dst, stride); - - if (((short *)eobs)[1] & 0xfefe) - idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2); - else - idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride); - + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc); + else + idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc); + } + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2); + else + idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2); + } q += 64; dc += 4; - pre += 64; dst += 4*stride; eobs += 4; } } -void vp8_dequant_idct_add_y_block_neon - (short *q, short *dq, unsigned char *pre, - unsigned char *dst, int stride, char *eobs) +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, + unsigned char *dst, + int stride, char *eobs) { int i; for (i = 0; i < 4; i++) { - if (((short *)eobs)[0] & 0xfefe) - idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride); - else - idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride); - - if (((short *)eobs)[1] & 0xfefe) - idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride); - else - idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride); - + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dst, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dst, stride); + } + + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q+32, dq, dst+8, stride); + else + idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride); + } q += 64; - pre += 64; dst += 4*stride; eobs += 4; } } -void vp8_dequant_idct_add_uv_block_neon - (short *q, short *dq, unsigned char *pre, - 
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, + unsigned char *dstu, + unsigned char *dstv, + int stride, char *eobs) { - if (((short *)eobs)[0] & 0xfefe) - idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride); - else - idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride); + if (((short *)(eobs))[0]) + { + if (((short *)eobs)[0] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } q += 32; - pre += 32; dstu += 4*stride; - if (((short *)eobs)[1] & 0xfefe) - idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride); - else - idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride); + if (((short *)(eobs))[1]) + { + if (((short *)eobs)[1] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstu, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstu, stride); + } q += 32; - pre += 32; - if (((short *)eobs)[2] & 0xfefe) - idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride); - else - idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride); + if (((short *)(eobs))[2]) + { + if (((short *)eobs)[2] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } q += 32; - pre += 32; dstv += 4*stride; - if (((short *)eobs)[3] & 0xfefe) - idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride); - else - idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride); + if (((short *)(eobs))[3]) + { + if (((short *)eobs)[3] & 0xfefe) + idct_dequant_full_2x_neon (q, dq, dstv, stride); + else + idct_dequant_0_2x_neon (q, dq[0], dstv, stride); + } } diff --git a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm index 456f8e1d4..6c29c5586 100644 --- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm +++ b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm @@ -14,38 +14,38 @@ PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void 
idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre, -; int pitch, unsigned char *dst, int stride); +;void idct_dequant_0_2x_neon(short *q, short dq, +; unsigned char *dst, int stride); ; r0 *q ; r1 dq -; r2 *pre -; r3 pitch -; sp *dst -; sp+4 stride +; r2 *dst +; r3 stride |idct_dequant_0_2x_neon| PROC + push {r4, r5} + add r12, r2, #4 vld1.32 {d2[0]}, [r2], r3 - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d4[0]}, [r2], r3 - vld1.32 {d4[1]}, [r2] vld1.32 {d8[0]}, [r12], r3 + vld1.32 {d2[1]}, [r2], r3 vld1.32 {d8[1]}, [r12], r3 + vld1.32 {d4[0]}, [r2], r3 vld1.32 {d10[0]}, [r12], r3 - vld1.32 {d10[1]}, [r12] + vld1.32 {d4[1]}, [r2], r3 + vld1.32 {d10[1]}, [r12], r3 ldrh r12, [r0] ; lo q - ldrh r2, [r0, #32] ; hi q - mov r3, #0 - strh r3, [r0] - strh r3, [r0, #32] + ldrh r4, [r0, #32] ; hi q + mov r5, #0 + strh r5, [r0] + strh r5, [r0, #32] sxth r12, r12 ; lo mul r0, r12, r1 add r0, r0, #4 asr r0, r0, #3 vdup.16 q0, r0 - sxth r2, r2 ; hi - mul r0, r2, r1 + sxth r4, r4 ; hi + mul r0, r4, r1 add r0, r0, #4 asr r0, r0, #3 vdup.16 q3, r0 @@ -55,25 +55,25 @@ vaddw.u8 q4, q3, d8 ; hi vaddw.u8 q5, q3, d10 - ldr r2, [sp] ; dst - ldr r3, [sp, #4] ; stride + sub r2, r2, r3, lsl #2 ; dst - 4*stride + add r0, r2, #4 vqmovun.s16 d2, q1 ; lo vqmovun.s16 d4, q2 vqmovun.s16 d8, q4 ; hi vqmovun.s16 d10, q5 - add r0, r2, #4 vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d4[1]}, [r2] vst1.32 {d8[0]}, [r0], r3 ; hi + vst1.32 {d2[1]}, [r2], r3 vst1.32 {d8[1]}, [r0], r3 + vst1.32 {d4[0]}, [r2], r3 vst1.32 {d10[0]}, [r0], r3 + vst1.32 {d4[1]}, [r2] vst1.32 {d10[1]}, [r0] - bx lr + pop {r4, r5} + bx lr - ENDP ; |idct_dequant_0_2x_neon| + ENDP ; |idct_dequant_0_2x_neon| END diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm index 0dc036acb..bf8d7ddcd 100644 --- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm +++ b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm @@ 
-14,25 +14,29 @@ PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre, + +;void idct_dequant_dc_0_2x_neon(short *q, short *dq, ; unsigned char *dst, int stride); -; r0 *dc -; r1 *pre -; r2 *dst -; r3 stride +; r0 *q, +; r1 *dq, +; r2 *dst +; r3 stride +; sp *dc |idct_dequant_dc_0_2x_neon| PROC - ldr r0, [r0] ; *dc - mov r12, #16 - vld1.32 {d2[0]}, [r1], r12 ; lo - vld1.32 {d2[1]}, [r1], r12 - vld1.32 {d4[0]}, [r1], r12 - vld1.32 {d4[1]}, [r1] - sub r1, r1, #44 - vld1.32 {d8[0]}, [r1], r12 ; hi - vld1.32 {d8[1]}, [r1], r12 - vld1.32 {d10[0]}, [r1], r12 - vld1.32 {d10[1]}, [r1] + ; no q- or dq-coeffs, so r0 and r1 are free to use + ldr r1, [sp] ; *dc + add r12, r2, #4 + ldr r0, [r1] + + vld1.32 {d2[0]}, [r2], r3 ; lo + vld1.32 {d8[0]}, [r12], r3 ; hi + vld1.32 {d2[1]}, [r2], r3 + vld1.32 {d8[1]}, [r12], r3 + vld1.32 {d4[0]}, [r2], r3 + vld1.32 {d10[0]}, [r12], r3 + vld1.32 {d4[1]}, [r2], r3 + vld1.32 {d10[1]}, [r12] sxth r1, r0 ; lo *dc add r1, r1, #4 @@ -53,14 +57,16 @@ vqmovun.s16 d8, q4 ; hi vqmovun.s16 d10, q5 + sub r2, r2, r3, lsl #2 ; dst - 4*stride add r0, r2, #4 + vst1.32 {d2[0]}, [r2], r3 ; lo - vst1.32 {d2[1]}, [r2], r3 - vst1.32 {d4[0]}, [r2], r3 - vst1.32 {d4[1]}, [r2] vst1.32 {d8[0]}, [r0], r3 ; hi + vst1.32 {d2[1]}, [r2], r3 vst1.32 {d8[1]}, [r0], r3 + vst1.32 {d4[0]}, [r2], r3 vst1.32 {d10[0]}, [r0], r3 + vst1.32 {d4[1]}, [r2] vst1.32 {d10[1]}, [r0] bx lr diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm index 61fa66075..eea41f68c 100644 --- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm +++ b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm @@ -15,33 +15,34 @@ PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre, +;void idct_dequant_dc_full_2x_neon(short *q, short *dq, ; unsigned char *dst, int stride, short *dc); ; r0 *q, ; r1 *dq, -; 
r2 *pre -; r3 *dst -; sp stride -; sp+4 *dc +; r2 *dst +; r3 stride +; sp *dc |idct_dequant_dc_full_2x_neon| PROC + push {r4} + vld1.16 {q0, q1}, [r1] ; dq (same l/r) vld1.16 {q2, q3}, [r0] ; l q - mov r1, #16 ; pitch add r0, r0, #32 vld1.16 {q4, q5}, [r0] ; r q add r12, r2, #4 + ; interleave the predictors - vld1.32 {d28[0]}, [r2], r1 ; l pre - vld1.32 {d28[1]}, [r12], r1 ; r pre - vld1.32 {d29[0]}, [r2], r1 - vld1.32 {d29[1]}, [r12], r1 - vld1.32 {d30[0]}, [r2], r1 - vld1.32 {d30[1]}, [r12], r1 - vld1.32 {d31[0]}, [r2] - ldr r1, [sp, #4] + vld1.32 {d28[0]}, [r2], r3 ; l pre + vld1.32 {d28[1]}, [r12], r3 ; r pre + vld1.32 {d29[0]}, [r2], r3 + vld1.32 {d29[1]}, [r12], r3 + vld1.32 {d30[0]}, [r2], r3 + vld1.32 {d30[1]}, [r12], r3 + vld1.32 {d31[0]}, [r2], r3 + ldr r1, [sp, #4] ; *dc vld1.32 {d31[1]}, [r12] - adr r2, cospi8sqrt2minus1 ; pointer to the first constant + adr r4, cospi8sqrt2minus1 ; pointer to the first constant ldrh r12, [r1], #2 ; lo *dc ldrh r1, [r1] ; hi *dc @@ -56,7 +57,7 @@ vmov.16 d4[0], r12 vmov.16 d8[0], r1 - vld1.16 {d0}, [r2] + vld1.16 {d0}, [r4] ; q2: l0r0 q3: l8r8 ; q4: l4r4 q5: l12r12 @@ -176,26 +177,28 @@ sub r0, r0, #32 vst1.16 {q14, q15}, [r0] ; write over low input + sub r2, r2, r3, lsl #2 ; dst - 4*stride + add r1, r2, #4 ; hi + ;saturate and narrow vqmovun.s16 d0, q4 ; lo vqmovun.s16 d1, q5 vqmovun.s16 d2, q6 ; hi vqmovun.s16 d3, q7 - ldr r1, [sp] ; stride - add r2, r3, #4 ; hi - vst1.32 {d0[0]}, [r3], r1 ; lo - vst1.32 {d0[1]}, [r2], r1 ; hi - vst1.32 {d1[0]}, [r3], r1 - vst1.32 {d1[1]}, [r2], r1 - vst1.32 {d2[0]}, [r3], r1 - vst1.32 {d2[1]}, [r2], r1 - vst1.32 {d3[0]}, [r3] - vst1.32 {d3[1]}, [r2] + vst1.32 {d0[0]}, [r2], r3 ; lo + vst1.32 {d0[1]}, [r1], r3 ; hi + vst1.32 {d1[0]}, [r2], r3 + vst1.32 {d1[1]}, [r1], r3 + vst1.32 {d2[0]}, [r2], r3 + vst1.32 {d2[1]}, [r1], r3 + vst1.32 {d3[0]}, [r2] + vst1.32 {d3[1]}, [r1] - bx lr + pop {r4} + bx lr - ENDP ; |idct_dequant_dc_full_2x_neon| + ENDP ; |idct_dequant_dc_full_2x_neon| ; 
Constant Pool cospi8sqrt2minus1 DCD 0x4e7b diff --git a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm index 772ec4685..d5dce63f6 100644 --- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm +++ b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm @@ -15,32 +15,30 @@ PRESERVE8 AREA ||.text||, CODE, READONLY, ALIGN=2 -;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre, -; unsigned char *dst, int pitch, int stride); +;void idct_dequant_full_2x_neon(short *q, short *dq, +; unsigned char *dst, int stride); ; r0 *q, ; r1 *dq, -; r2 *pre -; r3 *dst -; sp pitch -; sp+4 stride +; r2 *dst +; r3 stride |idct_dequant_full_2x_neon| PROC vld1.16 {q0, q1}, [r1] ; dq (same l/r) vld1.16 {q2, q3}, [r0] ; l q - ldr r1, [sp] ; pitch add r0, r0, #32 vld1.16 {q4, q5}, [r0] ; r q add r12, r2, #4 + ; interleave the predictors - vld1.32 {d28[0]}, [r2], r1 ; l pre - vld1.32 {d28[1]}, [r12], r1 ; r pre - vld1.32 {d29[0]}, [r2], r1 - vld1.32 {d29[1]}, [r12], r1 - vld1.32 {d30[0]}, [r2], r1 - vld1.32 {d30[1]}, [r12], r1 - vld1.32 {d31[0]}, [r2] + vld1.32 {d28[0]}, [r2], r3 ; l pre + vld1.32 {d28[1]}, [r12], r3 ; r pre + vld1.32 {d29[0]}, [r2], r3 + vld1.32 {d29[1]}, [r12], r3 + vld1.32 {d30[0]}, [r2], r3 + vld1.32 {d30[1]}, [r12], r3 + vld1.32 {d31[0]}, [r2], r3 vld1.32 {d31[1]}, [r12] - adr r2, cospi8sqrt2minus1 ; pointer to the first constant + adr r1, cospi8sqrt2minus1 ; pointer to the first constant ; dequant: q[i] = q[i] * dq[i] vmul.i16 q2, q2, q0 @@ -48,7 +46,7 @@ vmul.i16 q4, q4, q0 vmul.i16 q5, q5, q1 - vld1.16 {d0}, [r2] + vld1.16 {d0}, [r1] ; q2: l0r0 q3: l8r8 ; q4: l4r4 q5: l12r12 @@ -168,22 +166,23 @@ sub r0, r0, #32 vst1.16 {q14, q15}, [r0] ; write over low input + sub r2, r2, r3, lsl #2 ; dst - 4*stride + add r1, r2, #4 ; hi + ;saturate and narrow vqmovun.s16 d0, q4 ; lo vqmovun.s16 d1, q5 vqmovun.s16 d2, q6 ; hi vqmovun.s16 d3, q7 - ldr r1, [sp, #4] ; stride - add r2, r3, #4 ; hi - vst1.32 
{d0[0]}, [r3], r1 ; lo - vst1.32 {d0[1]}, [r2], r1 ; hi - vst1.32 {d1[0]}, [r3], r1 - vst1.32 {d1[1]}, [r2], r1 - vst1.32 {d2[0]}, [r3], r1 - vst1.32 {d2[1]}, [r2], r1 - vst1.32 {d3[0]}, [r3] - vst1.32 {d3[1]}, [r2] + vst1.32 {d0[0]}, [r2], r3 ; lo + vst1.32 {d0[1]}, [r1], r3 ; hi + vst1.32 {d1[0]}, [r2], r3 + vst1.32 {d1[1]}, [r1], r3 + vst1.32 {d2[0]}, [r2], r3 + vst1.32 {d2[1]}, [r1], r3 + vst1.32 {d3[0]}, [r2] + vst1.32 {d3[1]}, [r1] bx lr diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index 6bbc71f79..81f28db89 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -167,12 +167,12 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, /* do prediction */ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { - RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd); + RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd); if (mode != B_PRED) { RECON_INVOKE(&pbi->common.rtcd.recon, - build_intra_predictors_mby)(xd); + build_intra_predictors_mby_s)(xd); } else { vp8_intra_prediction_down_copy(xd); } @@ -211,20 +211,24 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int b_mode = xd->mode_info_context->bmi[i].as_mode; RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict) - (b, b_mode, b->predictor); + (b, b_mode, *(b->base_dst) + b->dst, b->dst_stride); - if (xd->eobs[i] > 1) + if (xd->eobs[i] ) { - DEQUANT_INVOKE(&pbi->dequant, idct_add) - (b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride); - } - else - { - IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) - (b->qcoeff[0] * b->dequant[0], b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride); - ((int *)b->qcoeff)[0] = 0; + if (xd->eobs[i] > 1) + { + DEQUANT_INVOKE(&pbi->dequant, idct_add) + (b->qcoeff, b->dequant, + *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) + (b->qcoeff[0] * b->dequant[0], + *(b->base_dst) + 
b->dst, b->dst_stride, + *(b->base_dst) + b->dst, b->dst_stride); + ((int *)b->qcoeff)[0] = 0; + } } } @@ -233,18 +237,18 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, { DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) (xd->qcoeff, xd->block[0].dequant, - xd->predictor, xd->dst.y_buffer, + xd->dst.y_buffer, xd->dst.y_stride, xd->eobs); } else { BLOCKD *b = &xd->block[24]; - DEQUANT_INVOKE(&pbi->dequant, block)(b); - /* do 2nd order transform on the dc block */ if (xd->eobs[24] > 1) { + DEQUANT_INVOKE(&pbi->dequant, block)(b); + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); ((int *)b->qcoeff)[0] = 0; ((int *)b->qcoeff)[1] = 0; @@ -257,19 +261,20 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, } else { + b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0]; IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff); ((int *)b->qcoeff)[0] = 0; } DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) (xd->qcoeff, xd->block[0].dequant, - xd->predictor, xd->dst.y_buffer, + xd->dst.y_buffer, xd->dst.y_stride, xd->eobs, xd->block[24].diff); } DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block) (xd->qcoeff+16*16, xd->block[16].dequant, - xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.uv_stride, xd->eobs+16); } diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c index a60442fe8..0861965eb 100644 --- a/vp8/decoder/dequantize.c +++ b/vp8/decoder/dequantize.c @@ -14,10 +14,6 @@ #include "vp8/common/idct.h" #include "vpx_mem/vpx_mem.h" -extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) ; -extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); - - void vp8_dequantize_b_c(BLOCKD *d) { int i; @@ -31,12 +27,9 @@ void vp8_dequantize_b_c(BLOCKD *d) } } -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, - unsigned char *dest, int pitch, int stride) +void vp8_dequant_idct_add_c(short *input, short 
*dq, + unsigned char *dest, int stride) { - short output[16]; - short *diff_ptr = output; - int r, c; int i; for (i = 0; i < 16; i++) @@ -44,40 +37,17 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, input[i] = dq[i] * input[i]; } - /* the idct halves ( >> 1) the pitch */ - vp8_short_idct4x4llm_c(input, output, 4 << 1); + vp8_short_idct4x4llm_c(input, dest, stride, dest, stride); vpx_memset(input, 0, 32); - for (r = 0; r < 4; r++) - { - for (c = 0; c < 4; c++) - { - int a = diff_ptr[c] + pred[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dest[c] = (unsigned char) a; - } - - dest += stride; - diff_ptr += 4; - pred += pitch; - } } -void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, - unsigned char *dest, int pitch, int stride, +void vp8_dequant_dc_idct_add_c(short *input, short *dq, + unsigned char *dest, int stride, int Dc) { int i; - short output[16]; - short *diff_ptr = output; - int r, c; input[0] = (short)Dc; @@ -86,28 +56,8 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, input[i] = dq[i] * input[i]; } - /* the idct halves ( >> 1) the pitch */ - vp8_short_idct4x4llm_c(input, output, 4 << 1); + vp8_short_idct4x4llm_c(input, dest, stride, dest, stride); vpx_memset(input, 0, 32); - for (r = 0; r < 4; r++) - { - for (c = 0; c < 4; c++) - { - int a = diff_ptr[c] + pred[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dest[c] = (unsigned char) a; - } - - dest += stride; - diff_ptr += 4; - pred += pitch; - } } diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h index 2e662a593..019b7f6d1 100644 --- a/vp8/decoder/dequantize.h +++ b/vp8/decoder/dequantize.h @@ -18,28 +18,28 @@ #define prototype_dequant_idct_add(sym) \ void sym(short *input, short *dq, \ - unsigned char *pred, unsigned char *output, \ - int pitch, int stride) + unsigned char *output, \ + int stride) #define prototype_dequant_dc_idct_add(sym) \ void sym(short *input, short *dq, \ - 
unsigned char *pred, unsigned char *output, \ - int pitch, int stride, \ + unsigned char *dst, \ + int stride, \ int dc) #define prototype_dequant_dc_idct_add_y_block(sym) \ void sym(short *q, short *dq, \ - unsigned char *pre, unsigned char *dst, \ + unsigned char *dst, \ int stride, char *eobs, short *dc) #define prototype_dequant_idct_add_y_block(sym) \ void sym(short *q, short *dq, \ - unsigned char *pre, unsigned char *dst, \ + unsigned char *dst, \ int stride, char *eobs) #define prototype_dequant_idct_add_uv_block(sym) \ void sym(short *q, short *dq, \ - unsigned char *pre, unsigned char *dst_u, \ + unsigned char *dst_u, \ unsigned char *dst_v, int stride, char *eobs) #if ARCH_X86 || ARCH_X86_64 diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c index 48f97b565..86fa191d3 100644 --- a/vp8/decoder/error_concealment.c +++ b/vp8/decoder/error_concealment.c @@ -621,9 +621,8 @@ void vp8_conceal_corrupt_mb(MACROBLOCKD *xd) { /* This macroblock has corrupt residual, use the motion compensated image (predictor) for concealment */ - vp8_recon_copy16x16(xd->predictor, 16, xd->dst.y_buffer, xd->dst.y_stride); - vp8_recon_copy8x8(xd->predictor + 256, 8, - xd->dst.u_buffer, xd->dst.uv_stride); - vp8_recon_copy8x8(xd->predictor + 320, 8, - xd->dst.v_buffer, xd->dst.uv_stride); + + /* The build predictor functions now output directly into the dst buffer, + * so the copies are no longer necessary */ + } diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c index 04bce665e..1c16b92a9 100644 --- a/vp8/decoder/idct_blk.c +++ b/vp8/decoder/idct_blk.c @@ -12,16 +12,17 @@ #include "vp8/common/idct.h" #include "dequantize.h" -void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred, - unsigned char *dest, int pitch, int stride, +void vp8_dequant_dc_idct_add_c(short *input, short *dq, + unsigned char *dest, int stride, int Dc); -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred, - unsigned char *dest, int 
pitch, int stride); -void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, - unsigned char *dst_ptr, int pitch, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, + unsigned char *dest, int stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred, + int pred_stride, unsigned char *dst_ptr, + int dst_stride); void vp8_dequant_dc_idct_add_y_block_c - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dst, int stride, char *eobs, short *dc) { int i, j; @@ -31,23 +32,21 @@ void vp8_dequant_dc_idct_add_y_block_c for (j = 0; j < 4; j++) { if (*eobs++ > 1) - vp8_dequant_dc_idct_add_c (q, dq, pre, dst, 16, stride, dc[0]); + vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]); else - vp8_dc_only_idct_add_c (dc[0], pre, dst, 16, stride); + vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride); q += 16; - pre += 4; dst += 4; dc ++; } - pre += 64 - 16; dst += 4*stride - 16; } } void vp8_dequant_idct_add_y_block_c - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dst, int stride, char *eobs) { int i, j; @@ -57,25 +56,23 @@ void vp8_dequant_idct_add_y_block_c for (j = 0; j < 4; j++) { if (*eobs++ > 1) - vp8_dequant_idct_add_c (q, dq, pre, dst, 16, stride); + vp8_dequant_idct_add_c (q, dq, dst, stride); else { - vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dst, 16, stride); + vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride); ((int *)q)[0] = 0; } q += 16; - pre += 4; dst += 4; } - pre += 64 - 16; dst += 4*stride - 16; } } void vp8_dequant_idct_add_uv_block_c - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) { int i, j; @@ -85,19 +82,17 @@ void vp8_dequant_idct_add_uv_block_c for (j = 0; j < 2; j++) { if (*eobs++ > 1) - vp8_dequant_idct_add_c (q, dq, pre, dstu, 8, stride); + vp8_dequant_idct_add_c (q, dq, dstu, stride); else { - vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstu, 
8, stride); + vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride); ((int *)q)[0] = 0; } q += 16; - pre += 4; dstu += 4; } - pre += 32 - 8; dstu += 4*stride - 8; } @@ -106,19 +101,17 @@ void vp8_dequant_idct_add_uv_block_c for (j = 0; j < 2; j++) { if (*eobs++ > 1) - vp8_dequant_idct_add_c (q, dq, pre, dstv, 8, stride); + vp8_dequant_idct_add_c (q, dq, dstv, stride); else { - vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstv, 8, stride); + vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride); ((int *)q)[0] = 0; } q += 16; - pre += 4; dstv += 4; } - pre += 32 - 8; dstv += 4*stride - 8; } } diff --git a/vp8/decoder/reconintra_mt.c b/vp8/decoder/reconintra_mt.c index 9bba5b75f..bcb2636fd 100644 --- a/vp8/decoder/reconintra_mt.c +++ b/vp8/decoder/reconintra_mt.c @@ -606,6 +606,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *xd, int b_mode, unsigned char *predictor, + int stride, int mb_row, int mb_col, int num) @@ -662,7 +663,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, predictor[c] = expected_dc; } - predictor += 16; + predictor += stride; } } break; @@ -684,7 +685,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, predictor[c] = pred; } - predictor += 16; + predictor += stride; } } break; @@ -706,7 +707,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, predictor[c] = ap[c]; } - predictor += 16; + predictor += stride; } } @@ -729,29 +730,29 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, predictor[c] = lp[r]; } - predictor += 16; + predictor += stride; } } break; case B_LD_PRED: { unsigned char *ptr = Above; - predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; - predictor[0 * 16 + 1] = - predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; - predictor[0 * 16 + 2] = - predictor[1 * 16 + 1] = - predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; - predictor[0 * 16 + 3] = - predictor[1 * 16 + 2] = - predictor[2 * 16 + 1] = - predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; - 
predictor[1 * 16 + 3] = - predictor[2 * 16 + 2] = - predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; - predictor[2 * 16 + 3] = - predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; - predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; + predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; + predictor[0 * stride + 1] = + predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; + predictor[0 * stride + 2] = + predictor[1 * stride + 1] = + predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; + predictor[0 * stride + 3] = + predictor[1 * stride + 2] = + predictor[2 * stride + 1] = + predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; + predictor[1 * stride + 3] = + predictor[2 * stride + 2] = + predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; + predictor[2 * stride + 3] = + predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; + predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; } break; @@ -770,22 +771,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, pp[7] = Above[2]; pp[8] = Above[3]; - predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[3 * 16 + 1] = - predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[3 * 16 + 2] = - predictor[2 * 16 + 1] = - predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[3 * 16 + 3] = - predictor[2 * 16 + 2] = - predictor[1 * 16 + 1] = - predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * 16 + 3] = - predictor[1 * 16 + 2] = - predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[1 * 16 + 3] = - predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[3 * stride + 1] = + predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 
2; + predictor[3 * stride + 2] = + predictor[2 * stride + 1] = + predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * stride + 3] = + predictor[2 * stride + 2] = + predictor[1 * stride + 1] = + predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * stride + 3] = + predictor[1 * stride + 2] = + predictor[0 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[1 * stride + 3] = + predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; } break; @@ -805,22 +806,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, pp[8] = Above[3]; - predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[3 * 16 + 1] = - predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * 16 + 1] = - predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1; - predictor[3 * 16 + 2] = - predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[2 * 16 + 2] = - predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1; - predictor[3 * 16 + 3] = - predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - predictor[2 * 16 + 3] = - predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1; - predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; - predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1; + predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * stride + 1] = + predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * stride + 1] = + predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1; + predictor[3 * stride + 2] = + predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[2 * stride + 2] = + predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1; + predictor[3 * stride + 3] = + predictor[1 * stride + 2] = (pp[5] + 
pp[6] * 2 + pp[7] + 2) >> 2; + predictor[2 * stride + 3] = + predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1; + predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1; } break; @@ -829,22 +830,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, unsigned char *pp = Above; - predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[2 * 16 + 0] = - predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1; - predictor[1 * 16 + 1] = - predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 1] = - predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1; - predictor[3 * 16 + 1] = - predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[0 * 16 + 3] = - predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1; - predictor[1 * 16 + 3] = - predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * stride + 0] = + predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1; + predictor[1 * stride + 1] = + predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * stride + 1] = + predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1; + predictor[3 * stride + 1] = + predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[0 * stride + 3] = + predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * stride + 3] = + predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; } break; @@ -862,22 +863,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, pp[8] = Above[3]; - 
predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[2 * 16 + 0] = - predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1; - predictor[2 * 16 + 1] = - predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 2] = - predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; - predictor[2 * 16 + 3] = - predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[1 * 16 + 2] = - predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1; - predictor[1 * 16 + 3] = - predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * stride + 0] = + predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1; + predictor[2 * stride + 1] = + predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * stride + 2] = + predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[2 * stride + 3] = + predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[1 * stride + 2] = + predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * stride + 3] = + predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; } break; @@ -885,22 +886,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi, case B_HU_PRED: { unsigned char *pp = Left; - predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[0 * 16 + 2] = - predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1; - predictor[0 * 16 + 3] = - predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[1 * 16 + 2] = - 
predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; - predictor[1 * 16 + 3] = - predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 2] = - predictor[2 * 16 + 3] = - predictor[3 * 16 + 0] = - predictor[3 * 16 + 1] = - predictor[3 * 16 + 2] = - predictor[3 * 16 + 3] = pp[3]; + predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[0 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[0 * stride + 2] = + predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1; + predictor[0 * stride + 3] = + predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[1 * stride + 2] = + predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[1 * stride + 3] = + predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; + predictor[2 * stride + 2] = + predictor[2 * stride + 3] = + predictor[3 * stride + 0] = + predictor[3 * stride + 1] = + predictor[3 * stride + 2] = + predictor[3 * stride + 3] = pp[3]; } break; diff --git a/vp8/decoder/reconintra_mt.h b/vp8/decoder/reconintra_mt.h index d401295b2..4576a8064 100644 --- a/vp8/decoder/reconintra_mt.h +++ b/vp8/decoder/reconintra_mt.h @@ -19,7 +19,7 @@ extern void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, i extern void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col); extern void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col); -extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int mb_row, int mb_col, int num); +extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int stride, int mb_row, int mb_col, int num); extern void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col); #endif diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index bfe09735c..eba5830d5 100644 --- a/vp8/decoder/threading.c 
+++ b/vp8/decoder/threading.c @@ -138,11 +138,11 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m /* do prediction */ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { - vp8mt_build_intra_predictors_mbuv(pbi, xd, mb_row, mb_col); + vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col); if (xd->mode_info_context->mbmi.mode != B_PRED) { - vp8mt_build_intra_predictors_mby(pbi, xd, mb_row, mb_col); + vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col); } else { vp8mt_intra_prediction_down_copy(pbi, xd, mb_row, mb_col); } @@ -201,7 +201,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) (xd->qcoeff, xd->block[0].dequant, - xd->predictor, xd->dst.y_buffer, + xd->dst.y_buffer, xd->dst.y_stride, xd->eobs, xd->block[24].diff); } else if (xd->mode_info_context->mbmi.mode == B_PRED) @@ -211,19 +211,21 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m BLOCKD *b = &xd->block[i]; int b_mode = xd->mode_info_context->bmi[i].as_mode; - vp8mt_predict_intra4x4(pbi, xd, b_mode, b->predictor, mb_row, mb_col, i); + vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst, + b->dst_stride, mb_row, mb_col, i); if (xd->eobs[i] > 1) { DEQUANT_INVOKE(&pbi->dequant, idct_add) - (b->qcoeff, b->dequant, b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride); + (b->qcoeff, b->dequant, + *(b->base_dst) + b->dst, b->dst_stride); } else { IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) - (b->qcoeff[0] * b->dequant[0], b->predictor, - *(b->base_dst) + b->dst, 16, b->dst_stride); + (b->qcoeff[0] * b->dequant[0], + *(b->base_dst) + b->dst, b->dst_stride, + *(b->base_dst) + b->dst, b->dst_stride); ((int *)b->qcoeff)[0] = 0; } } @@ -232,13 +234,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m { DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) (xd->qcoeff, 
xd->block[0].dequant, - xd->predictor, xd->dst.y_buffer, + xd->dst.y_buffer, xd->dst.y_stride, xd->eobs); } DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block) (xd->qcoeff+16*16, xd->block[16].dequant, - xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, xd->dst.uv_stride, xd->eobs+16); } diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm index 0d6133a46..648bde4c5 100644 --- a/vp8/decoder/x86/dequantize_mmx.asm +++ b/vp8/decoder/x86/dequantize_mmx.asm @@ -50,14 +50,17 @@ sym(vp8_dequantize_b_impl_mmx): ret -;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride) +;void dequant_idct_add_mmx( +;short *input, 0 +;short *dq, 1 +;unsigned char *dest, 2 +;int stride) 3 global sym(vp8_dequant_idct_add_mmx) sym(vp8_dequant_idct_add_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 + SHADOW_ARGS_TO_STACK 4 GET_GOT rbx - push rsi push rdi ; end prolog @@ -77,8 +80,8 @@ sym(vp8_dequant_idct_add_mmx): movq mm3, [rax+24] pmullw mm3, [rdx+24] - mov rdx, arg(3) ;dest - mov rsi, arg(2) ;pred + mov rdx, arg(2) ;dest + pxor mm7, mm7 @@ -89,8 +92,7 @@ sym(vp8_dequant_idct_add_mmx): movq [rax+24],mm7 - movsxd rax, dword ptr arg(4) ;pitch - movsxd rdi, dword ptr arg(5) ;stride + movsxd rdi, dword ptr arg(3) ;stride psubw mm0, mm2 ; b1= 0-2 paddw mm2, mm2 ; @@ -211,28 +213,27 @@ sym(vp8_dequant_idct_add_mmx): pxor mm7, mm7 - movd mm4, [rsi] + movd mm4, [rdx] punpcklbw mm4, mm7 paddsw mm0, mm4 packuswb mm0, mm7 movd [rdx], mm0 - movd mm4, [rsi+rax] + movd mm4, [rdx+rdi] punpcklbw mm4, mm7 paddsw mm1, mm4 packuswb mm1, mm7 movd [rdx+rdi], mm1 - movd mm4, [rsi+2*rax] + movd mm4, [rdx+2*rdi] punpcklbw mm4, mm7 paddsw mm2, mm4 packuswb mm2, mm7 movd [rdx+rdi*2], mm2 add rdx, rdi - add rsi, rax - movd mm4, [rsi+2*rax] + movd mm4, [rdx+2*rdi] punpcklbw mm4, mm7 paddsw mm5, mm4 packuswb mm5, mm7 @@ -240,22 +241,24 @@ sym(vp8_dequant_idct_add_mmx): ; begin 
epilog pop rdi - pop rsi RESTORE_GOT UNSHADOW_ARGS pop rbp ret -;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc) +;void dequant_dc_idct_add_mmx( +;short *input, 0 +;short *dq, 1 +;unsigned char *dest, 2 +;int stride, 3 +;int Dc) 4 global sym(vp8_dequant_dc_idct_add_mmx) sym(vp8_dequant_dc_idct_add_mmx): push rbp mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 + SHADOW_ARGS_TO_STACK 5 GET_GOT rbx - push rsi - push rdi ; end prolog mov rax, arg(0) ;input @@ -273,8 +276,7 @@ sym(vp8_dequant_dc_idct_add_mmx): movq mm3, [rax+24] pmullw mm3, [rdx+24] - mov rdx, arg(3) ;dest - mov rsi, arg(2) ;pred + mov rdx, arg(2) ;pred pxor mm7, mm7 @@ -286,13 +288,12 @@ sym(vp8_dequant_dc_idct_add_mmx): ; move lower word of Dc to lower word of mm0 psrlq mm0, 16 - movzx rcx, word ptr arg(6) ;Dc + movzx rcx, word ptr arg(4) ;Dc psllq mm0, 16 movq mm7, rcx por mm0, mm7 - movsxd rax, dword ptr arg(4) ;pitch - movsxd rdi, dword ptr arg(5) ;stride + movsxd rax, dword ptr arg(3) ;stride psubw mm0, mm2 ; b1= 0-2 paddw mm2, mm2 ; @@ -413,36 +414,33 @@ sym(vp8_dequant_dc_idct_add_mmx): pxor mm7, mm7 - movd mm4, [rsi] + movd mm4, [rdx] punpcklbw mm4, mm7 paddsw mm0, mm4 packuswb mm0, mm7 movd [rdx], mm0 - movd mm4, [rsi+rax] + movd mm4, [rdx+rax] punpcklbw mm4, mm7 paddsw mm1, mm4 packuswb mm1, mm7 - movd [rdx+rdi], mm1 + movd [rdx+rax], mm1 - movd mm4, [rsi+2*rax] + movd mm4, [rdx+2*rax] punpcklbw mm4, mm7 paddsw mm2, mm4 packuswb mm2, mm7 - movd [rdx+rdi*2], mm2 + movd [rdx+rax*2], mm2 - add rdx, rdi - add rsi, rax + add rdx, rax - movd mm4, [rsi+2*rax] + movd mm4, [rdx+2*rax] punpcklbw mm4, mm7 paddsw mm5, mm4 packuswb mm5, mm7 - movd [rdx+rdi*2], mm5 + movd [rdx+rax*2], mm5 ; begin epilog - pop rdi - pop rsi RESTORE_GOT UNSHADOW_ARGS pop rbp diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/decoder/x86/idct_blk_mmx.c index 558dbaf7e..37de5b9fd 100644 --- a/vp8/decoder/x86/idct_blk_mmx.c +++ b/vp8/decoder/x86/idct_blk_mmx.c 
@@ -13,7 +13,7 @@ #include "vp8/decoder/dequantize.h" void vp8_dequant_dc_idct_add_y_block_mmx - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dst, int stride, char *eobs, short *dc) { int i; @@ -21,35 +21,34 @@ void vp8_dequant_dc_idct_add_y_block_mmx for (i = 0; i < 4; i++) { if (eobs[0] > 1) - vp8_dequant_dc_idct_add_mmx (q, dq, pre, dst, 16, stride, dc[0]); - else - vp8_dc_only_idct_add_mmx (dc[0], pre, dst, 16, stride); + vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]); + else if (eobs[0] == 1) + vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride); if (eobs[1] > 1) - vp8_dequant_dc_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride, dc[1]); - else - vp8_dc_only_idct_add_mmx (dc[1], pre+4, dst+4, 16, stride); + vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]); + else if (eobs[1] == 1) + vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride); if (eobs[2] > 1) - vp8_dequant_dc_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride, dc[2]); - else - vp8_dc_only_idct_add_mmx (dc[2], pre+8, dst+8, 16, stride); + vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]); + else if (eobs[2] == 1) + vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride); if (eobs[3] > 1) - vp8_dequant_dc_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride, dc[3]); - else - vp8_dc_only_idct_add_mmx (dc[3], pre+12, dst+12, 16, stride); + vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]); + else if (eobs[3] == 1) + vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride); q += 64; dc += 4; - pre += 64; dst += 4*stride; eobs += 4; } } void vp8_dequant_idct_add_y_block_mmx - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dst, int stride, char *eobs) { int i; @@ -57,46 +56,48 @@ void vp8_dequant_idct_add_y_block_mmx for (i = 0; i < 4; i++) { if (eobs[0] > 1) - vp8_dequant_idct_add_mmx (q, dq, pre, dst, 16, stride); - else + vp8_dequant_idct_add_mmx (q, dq, dst, 
stride); + else if (eobs[0] == 1) { - vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dst, 16, stride); + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride); ((int *)q)[0] = 0; } if (eobs[1] > 1) - vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride); - else + vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride); + else if (eobs[1] == 1) { - vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dst+4, 16, stride); + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride, + dst+4, stride); ((int *)(q+16))[0] = 0; } if (eobs[2] > 1) - vp8_dequant_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride); - else + vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride); + else if (eobs[2] == 1) { - vp8_dc_only_idct_add_mmx (q[32]*dq[0], pre+8, dst+8, 16, stride); + vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride, + dst+8, stride); ((int *)(q+32))[0] = 0; } if (eobs[3] > 1) - vp8_dequant_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride); - else + vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride); + else if (eobs[3] == 1) { - vp8_dc_only_idct_add_mmx (q[48]*dq[0], pre+12, dst+12, 16, stride); + vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride, + dst+12, stride); ((int *)(q+48))[0] = 0; } q += 64; - pre += 64; dst += 4*stride; eobs += 4; } } void vp8_dequant_idct_add_uv_block_mmx - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) { int i; @@ -104,23 +105,23 @@ void vp8_dequant_idct_add_uv_block_mmx for (i = 0; i < 2; i++) { if (eobs[0] > 1) - vp8_dequant_idct_add_mmx (q, dq, pre, dstu, 8, stride); - else + vp8_dequant_idct_add_mmx (q, dq, dstu, stride); + else if (eobs[0] == 1) { - vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstu, 8, stride); + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride); ((int *)q)[0] = 0; } if (eobs[1] > 1) - vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstu+4, 8, stride); - else + vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride); + else if 
(eobs[1] == 1) { - vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstu+4, 8, stride); + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride, + dstu+4, stride); ((int *)(q+16))[0] = 0; } q += 32; - pre += 32; dstu += 4*stride; eobs += 2; } @@ -128,23 +129,23 @@ void vp8_dequant_idct_add_uv_block_mmx for (i = 0; i < 2; i++) { if (eobs[0] > 1) - vp8_dequant_idct_add_mmx (q, dq, pre, dstv, 8, stride); - else + vp8_dequant_idct_add_mmx (q, dq, dstv, stride); + else if (eobs[0] == 1) { - vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstv, 8, stride); + vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride); ((int *)q)[0] = 0; } if (eobs[1] > 1) - vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstv+4, 8, stride); - else + vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride); + else if (eobs[1] == 1) { - vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstv+4, 8, stride); + vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride, + dstv+4, stride); ((int *)(q+16))[0] = 0; } q += 32; - pre += 32; dstv += 4*stride; eobs += 2; } diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c index a6a720639..0495b0610 100644 --- a/vp8/decoder/x86/idct_blk_sse2.c +++ b/vp8/decoder/x86/idct_blk_sse2.c @@ -13,102 +13,115 @@ #include "vp8/decoder/dequantize.h" void vp8_idct_dequant_dc_0_2x_sse2 - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dst, int dst_stride, short *dc); void vp8_idct_dequant_dc_full_2x_sse2 - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dst, int dst_stride, short *dc); void vp8_idct_dequant_0_2x_sse2 - (short *q, short *dq ,unsigned char *pre, - unsigned char *dst, int dst_stride, int blk_stride); + (short *q, short *dq , + unsigned char *dst, int dst_stride); void vp8_idct_dequant_full_2x_sse2 - (short *q, short *dq ,unsigned char *pre, - unsigned char *dst, int dst_stride, int blk_stride); + (short *q, short *dq , + unsigned char *dst, int dst_stride); void 
vp8_dequant_dc_idct_add_y_block_sse2 - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dst, int stride, char *eobs, short *dc) { int i; for (i = 0; i < 4; i++) { - if (((short *)(eobs))[0] & 0xfefe) - vp8_idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc); - else - vp8_idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc); - - if (((short *)(eobs))[1] & 0xfefe) - vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2); - else - vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2); - + if (((short *)(eobs))[0]) + { + if (((short *)(eobs))[0] & 0xfefe) + vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc); + else + vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc); + } + + if (((short *)(eobs))[1]) + { + if (((short *)(eobs))[1] & 0xfefe) + vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2); + else + vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2); + } q += 64; dc += 4; - pre += 64; dst += stride*4; eobs += 4; } } void vp8_dequant_idct_add_y_block_sse2 - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dst, int stride, char *eobs) { int i; for (i = 0; i < 4; i++) { - if (((short *)(eobs))[0] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16); - - if (((short *)(eobs))[1] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16); - else - vp8_idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16); - + if (((short *)(eobs))[0]) + { + if (((short *)(eobs))[0] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride); + } + if (((short *)(eobs))[1]) + { + if (((short *)(eobs))[1] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride); + else + vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride); + } q += 64; - pre += 64; dst += stride*4; 
eobs += 4; } } void vp8_dequant_idct_add_uv_block_sse2 - (short *q, short *dq, unsigned char *pre, + (short *q, short *dq, unsigned char *dstu, unsigned char *dstv, int stride, char *eobs) { - if (((short *)(eobs))[0] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8); - + if (((short *)(eobs))[0]) + { + if (((short *)(eobs))[0] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); + } q += 32; - pre += 32; dstu += stride*4; - if (((short *)(eobs))[1] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8); - + if (((short *)(eobs))[1]) + { + if (((short *)(eobs))[1] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride); + } q += 32; - pre += 32; - - if (((short *)(eobs))[2] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8); + if (((short *)(eobs))[2]) + { + if (((short *)(eobs))[2] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); + } q += 32; - pre += 32; dstv += stride*4; - if (((short *)(eobs))[3] & 0xfefe) - vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8); - else - vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8); + if (((short *)(eobs))[3]) + { + if (((short *)(eobs))[3] & 0xfefe) + vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride); + else + vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride); + } } diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm index c00375e88..c061b2fab 100644 --- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm +++ 
b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm @@ -65,7 +65,7 @@ numparts_loop ldr r10, [sp, #40] ; ptr ldr r5, [sp, #36] ; move mb_rows to the counting section - sub r5, r5, r11 ; move start point with each partition + subs r5, r5, r11 ; move start point with each partition ; mb_rows starts at i str r5, [sp, #12] @@ -80,6 +80,8 @@ numparts_loop str r2, [r0, #vp8_writer_pos] str r10, [r0, #vp8_writer_buffer] + ble end_partition ; if (mb_rows <= 0) end partition + mb_row_loop ldr r1, [r7, #tokenlist_start] @@ -344,6 +346,7 @@ check_p_lt_stop str r6, [sp, #12] bgt mb_row_loop +end_partition mov r12, #32 stop_encode_loop diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 510e4cc98..7f2b46daa 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -456,7 +456,7 @@ void encode_mb_row(VP8_COMP *cpi, vp8_activity_masking(cpi, x); // Is segmentation enabled - // MB level adjutment to quantizer + // MB level adjustment to quantizer if (xd->segmentation_enabled) { // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking) @@ -505,7 +505,8 @@ void encode_mb_row(VP8_COMP *cpi, // Special case code for cyclic refresh // If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode // during vp8cx_encode_inter_macroblock()) back into the global sgmentation map - if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled) + if ((cpi->current_layer == 0) && + (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)) { cpi->segmentation_map[map_index+mb_col] = xd->mode_info_context->mbmi.segment_id; @@ -648,6 +649,30 @@ void init_encode_frame_mb_context(VP8_COMP *cpi) + vp8_cost_one(255) + vp8_cost_one(128); } + else if ((cpi->oxcf.number_of_layers > 1) && + (cpi->ref_frame_flags == VP8_GOLD_FLAG)) + { + xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_zero(1); + xd->ref_frame_cost[GOLDEN_FRAME] = 
vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(1) + + vp8_cost_zero(255); + xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(1) + + vp8_cost_one(255); + } + else if ((cpi->oxcf.number_of_layers > 1) && + (cpi->ref_frame_flags == VP8_ALT_FLAG)) + { + xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_zero(1); + xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(1) + + vp8_cost_zero(1); + xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded) + + vp8_cost_one(1) + + vp8_cost_one(1); + } else { xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded) @@ -937,7 +962,8 @@ void vp8_encode_frame(VP8_COMP *cpi) // Adjust the projected reference frame useage probability numbers to reflect // what we have just seen. This may be usefull when we make multiple itterations // of the recode loop rather than continuing to use values from the previous frame. - if ((cm->frame_type != KEY_FRAME) && !cm->refresh_alt_ref_frame && !cm->refresh_golden_frame) + if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) || + (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame))) { const int *const rfct = cpi->count_mb_ref_frame_usage; const int rf_intra = rfct[INTRA_FRAME]; @@ -1220,7 +1246,7 @@ int vp8cx_encode_inter_macroblock if (xd->segmentation_enabled) { // If cyclic update enabled - if (cpi->cyclic_refresh_mode_enabled) + if (cpi->current_layer == 0 && cpi->cyclic_refresh_mode_enabled) { // Clear segment_id back to 0 if not coded (last frame 0,0) if ((xd->mode_info_context->mbmi.segment_id == 1) && diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index 4a77c1ff3..74e40323d 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -64,7 +64,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, BLOCK *be = &x->block[ib]; RECON_INVOKE(&rtcd->common->recon, intra4x4_predict) - (b, 
b->bmi.as_mode, b->predictor); + (b, b->bmi.as_mode, b->predictor, 16); ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16); @@ -72,9 +72,8 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, x->quantize_b(be, b); - vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32); + vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16); - RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); } void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb) @@ -106,9 +105,6 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); - RECON_INVOKE(&rtcd->common->recon, recon_mby) - (IF_RTCD(&rtcd->common->recon), &x->e_mbd); - } void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) @@ -126,5 +122,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd); - vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd); } diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index ff9e3e6ee..b3c7df502 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -577,9 +577,70 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) } } +static void recon_dcblock(MACROBLOCKD *x) +{ + BLOCKD *b = &x->block[24]; + int i; + + for (i = 0; i < 16; i++) + { + x->block[i].dqcoeff[0] = b->diff[i]; + } + +} + + +static void inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, + MACROBLOCKD *x) +{ + int i; + + if (x->mode_info_context->mbmi.mode != B_PRED && + x->mode_info_context->mbmi.mode != SPLITMV) + { + /* do 2nd order transform on the dc block */ + + IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff); + recon_dcblock(x); + } + + for (i = 0; i < 16; i++) + { + BLOCKD *b = &x->block[i]; + + if (b->eob > 1) + { + IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, 
b->predictor, 16, + *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, 16, + *(b->base_dst) + b->dst, b->dst_stride); + } + } + + + for (i = 16; i < 24; i++) + { + BLOCKD *b = &x->block[i]; + + if (b->eob > 1) + { + IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, 8, + *(b->base_dst) + b->dst, b->dst_stride); + } + else + { + IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, 8, + *(b->base_dst) + b->dst, b->dst_stride); + } + } + +} void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { - vp8_build_inter_predictors_mb(&x->e_mbd); + vp8_build_inter_predictors_mb_e(&x->e_mbd); vp8_subtract_mb(rtcd, x); @@ -590,10 +651,8 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) if (x->optimize) optimize_mb(x, rtcd); - vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd); + inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd); - RECON_INVOKE(&rtcd->common->recon, recon_mb) - (IF_RTCD(&rtcd->common->recon), &x->e_mbd); } @@ -612,6 +671,4 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); - RECON_INVOKE(&rtcd->common->recon, recon_mby) - (IF_RTCD(&rtcd->common->recon), &x->e_mbd); } diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index cac92057c..43c971480 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -237,6 +237,79 @@ void vp8_initialize() extern FILE *vpxlogc; #endif +static void save_layer_context(VP8_COMP *cpi) +{ + LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer]; + + // Save layer dependent coding state + lc->target_bandwidth = cpi->target_bandwidth; + //lc->target_bandwidth = cpi->oxcf.target_bandwidth; + lc->starting_buffer_level = cpi->oxcf.starting_buffer_level; + lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level; + lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size; + 
lc->buffer_level = cpi->buffer_level; + lc->bits_off_target = cpi->bits_off_target; + lc->total_actual_bits = cpi->total_actual_bits; + lc->worst_quality = cpi->worst_quality; + lc->active_worst_quality = cpi->active_worst_quality; + lc->best_quality = cpi->best_quality; + lc->active_best_quality = cpi->active_best_quality; + lc->ni_av_qi = cpi->ni_av_qi; + lc->ni_tot_qi = cpi->ni_tot_qi; + lc->ni_frames = cpi->ni_frames; + lc->avg_frame_qindex = cpi->avg_frame_qindex; + lc->rate_correction_factor = cpi->rate_correction_factor; + lc->key_frame_rate_correction_factor = cpi->key_frame_rate_correction_factor; + lc->gf_rate_correction_factor = cpi->gf_rate_correction_factor; + lc->zbin_over_quant = cpi->zbin_over_quant; + lc->inter_frame_target = cpi->inter_frame_target; + lc->total_byte_count = cpi->total_byte_count; + lc->filter_level = cpi->common.filter_level; + + lc->last_frame_percent_intra = cpi->last_frame_percent_intra; + + memcpy (lc->count_mb_ref_frame_usage, + cpi->count_mb_ref_frame_usage, + sizeof(cpi->count_mb_ref_frame_usage)); +} + +static void restore_layer_context(VP8_COMP *cpi, const int layer) +{ + LAYER_CONTEXT *lc = &cpi->layer_context[layer]; + + // Restore layer dependent coding state + cpi->current_layer = layer; + cpi->target_bandwidth = lc->target_bandwidth; + cpi->oxcf.target_bandwidth = lc->target_bandwidth; + cpi->oxcf.starting_buffer_level = lc->starting_buffer_level; + cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level; + cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size; + cpi->buffer_level = lc->buffer_level; + cpi->bits_off_target = lc->bits_off_target; + cpi->total_actual_bits = lc->total_actual_bits; + //cpi->worst_quality = lc->worst_quality; + cpi->active_worst_quality = lc->active_worst_quality; + //cpi->best_quality = lc->best_quality; + cpi->active_best_quality = lc->active_best_quality; + cpi->ni_av_qi = lc->ni_av_qi; + cpi->ni_tot_qi = lc->ni_tot_qi; + cpi->ni_frames = lc->ni_frames; + cpi->avg_frame_qindex = 
lc->avg_frame_qindex; + cpi->rate_correction_factor = lc->rate_correction_factor; + cpi->key_frame_rate_correction_factor = lc->key_frame_rate_correction_factor; + cpi->gf_rate_correction_factor = lc->gf_rate_correction_factor; + cpi->zbin_over_quant = lc->zbin_over_quant; + cpi->inter_frame_target = lc->inter_frame_target; + cpi->total_byte_count = lc->total_byte_count; + cpi->common.filter_level = lc->filter_level; + + cpi->last_frame_percent_intra = lc->last_frame_percent_intra; + + memcpy (cpi->count_mb_ref_frame_usage, + lc->count_mb_ref_frame_usage, + sizeof(cpi->count_mb_ref_frame_usage)); +} + static void setup_features(VP8_COMP *cpi) { // Set up default state for MB feature flags @@ -510,7 +583,7 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA); // Delete sementation map - vpx_free(seg_map); + vpx_free(seg_map); seg_map = 0; @@ -1397,11 +1470,13 @@ void vp8_new_frame_rate(VP8_COMP *cpi, double framerate) if(framerate < .1) framerate = 30; - cpi->oxcf.frame_rate = framerate; - cpi->output_frame_rate = cpi->oxcf.frame_rate; - cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); - cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate); - cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100); + cpi->oxcf.frame_rate = framerate; + cpi->output_frame_rate = cpi->oxcf.frame_rate; + cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / + cpi->output_frame_rate); + cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth; + cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * + cpi->oxcf.two_pass_vbrmin_section / 100); // Set Maximum gf/arf interval cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2); @@ -1472,6 +1547,65 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->total_actual_bits = 0; 
cpi->total_target_vs_actual = 0; + // Temporal scalabilty + if (cpi->oxcf.number_of_layers > 1) + { + int i; + int prev_layer_frame_rate=0; + + for (i=0; i<cpi->oxcf.number_of_layers; i++) + { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + + // Layer configuration + lc->frame_rate = + cpi->output_frame_rate / cpi->oxcf.rate_decimator[i]; + lc->target_bandwidth = cpi->oxcf.target_bitrate[i] * 1000; + + lc->starting_buffer_level = + rescale(oxcf->starting_buffer_level, + lc->target_bandwidth, 1000); + + if (oxcf->optimal_buffer_level == 0) + lc->optimal_buffer_level = lc->target_bandwidth / 8; + else + lc->optimal_buffer_level = + rescale(oxcf->optimal_buffer_level, + lc->target_bandwidth, 1000); + + if (oxcf->maximum_buffer_size == 0) + lc->maximum_buffer_size = lc->target_bandwidth / 8; + else + lc->maximum_buffer_size = + rescale(oxcf->maximum_buffer_size, + lc->target_bandwidth, 1000); + + // Work out the average size of a frame within this layer + if (i > 0) + lc->avg_frame_size_for_layer = (cpi->oxcf.target_bitrate[i] - + cpi->oxcf.target_bitrate[i-1]) * 1000 / + (lc->frame_rate - prev_layer_frame_rate); + + lc->active_worst_quality = cpi->oxcf.worst_allowed_q; + lc->active_best_quality = cpi->oxcf.best_allowed_q; + lc->avg_frame_qindex = cpi->oxcf.worst_allowed_q; + + lc->buffer_level = lc->starting_buffer_level; + lc->bits_off_target = lc->starting_buffer_level; + + lc->total_actual_bits = 0; + lc->ni_av_qi = 0; + lc->ni_tot_qi = 0; + lc->ni_frames = 0; + lc->rate_correction_factor = 1.0; + lc->key_frame_rate_correction_factor = 1.0; + lc->gf_rate_correction_factor = 1.0; + lc->inter_frame_target = 0.0; + + prev_layer_frame_rate = lc->frame_rate; + } + } + #if VP8_TEMPORAL_ALT_REF { int i; @@ -1693,11 +1827,11 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->target_bandwidth = cpi->oxcf.target_bandwidth; - cm->Width = cpi->oxcf.Width ; - cm->Height = cpi->oxcf.Height ; + cm->Width = cpi->oxcf.Width; + cm->Height = cpi->oxcf.Height; 
cm->horiz_scale = cpi->horiz_scale; - cm->vert_scale = cpi->vert_scale ; + cm->vert_scale = cpi->vert_scale; // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) if (cpi->oxcf.Sharpness > 7) @@ -1828,7 +1962,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->prob_gf_coded = 128; cpi->prob_intra_coded = 63; - // Prime the recent reference frame useage counters. + // Prime the recent reference frame usage counters. // Hereafter they will be maintained as a sort of moving average cpi->recent_ref_frame_usage[INTRA_FRAME] = 1; cpi->recent_ref_frame_usage[LAST_FRAME] = 1; @@ -2143,35 +2277,106 @@ void vp8_remove_compressor(VP8_PTR *ptr) FILE *f = fopen("opsnr.stt", "a"); double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; - double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; - double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded; + double total_encode_time = (cpi->time_receive_data + + cpi->time_compress_data) / 1000.000; + double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; if (cpi->b_calculate_psnr) { - YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; - double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height; - double total_psnr = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error); - double total_psnr2 = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error2); - double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); - - fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t Time(us)\n"); - fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n", - dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim, - total_encode_time); + YV12_BUFFER_CONFIG *lst_yv12 = + &cpi->common.yv12_fb[cpi->common.lst_fb_idx]; + + if (cpi->oxcf.number_of_layers > 1) + { + int i; + + fprintf(f, 
"Layer\tBitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t" + "GLPsnrP\tVPXSSIM\t\n"); + for (i=0; i<cpi->oxcf.number_of_layers; i++) + { + double dr = (double)cpi->bytes_in_layer[i] * + 8.0 / 1000.0 / time_encoded; + double samples = 3.0 / 2 * cpi->frames_in_layer[i] * + lst_yv12->y_width * lst_yv12->y_height; + double total_psnr = vp8_mse2psnr(samples, 255.0, + cpi->total_error2[i]); + double total_psnr2 = vp8_mse2psnr(samples, 255.0, + cpi->total_error2_p[i]); + double total_ssim = 100 * pow(cpi->sum_ssim[i] / + cpi->sum_weights[i], 8.0); + + fprintf(f, "%5d\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\n", + i, dr, + cpi->sum_psnr[i] / cpi->frames_in_layer[i], + total_psnr, + cpi->sum_psnr_p[i] / cpi->frames_in_layer[i], + total_psnr2, total_ssim); + } + } + else + { + double samples = 3.0 / 2 * cpi->count * + lst_yv12->y_width * lst_yv12->y_height; + double total_psnr = vp8_mse2psnr(samples, 255.0, + cpi->total_sq_error); + double total_psnr2 = vp8_mse2psnr(samples, 255.0, + cpi->total_sq_error2); + double total_ssim = 100 * pow(cpi->summed_quality / + cpi->summed_weights, 8.0); + + fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t" + "GLPsnrP\tVPXSSIM\t Time(us)\n"); + fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%8.0f\n", + dr, cpi->total / cpi->count, total_psnr, + cpi->totalp / cpi->count, total_psnr2, + total_ssim, total_encode_time); + } } if (cpi->b_calculate_ssimg) { - fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(us)\n"); - fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr, - cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count, - cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time); + if (cpi->oxcf.number_of_layers > 1) + { + int i; + + fprintf(f, "Layer\tBitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t" + "Time(us)\n"); + for (i=0; i<cpi->oxcf.number_of_layers; i++) + { + double dr = (double)cpi->bytes_in_layer[i] * + 8.0 / 1000.0 / time_encoded; + fprintf(f, "%5d\t%7.3f\t%6.4f\t" + 
"%6.4f\t%6.4f\t%6.4f\t%8.0f\n", + i, dr, + cpi->total_ssimg_y_in_layer[i] / + cpi->frames_in_layer[i], + cpi->total_ssimg_u_in_layer[i] / + cpi->frames_in_layer[i], + cpi->total_ssimg_v_in_layer[i] / + cpi->frames_in_layer[i], + cpi->total_ssimg_all_in_layer[i] / + cpi->frames_in_layer[i], + total_encode_time); + } + } + else + { + fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t" + "Time(us)\n"); + fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr, + cpi->total_ssimg_y / cpi->count, + cpi->total_ssimg_u / cpi->count, + cpi->total_ssimg_v / cpi->count, + cpi->total_ssimg_all / cpi->count, total_encode_time); + } } fclose(f); #if 0 f = fopen("qskip.stt", "a"); - fprintf(f, "minq:%d -maxq:%d skipture:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount); + fprintf(f, "minq:%d -maxq:%d skiptrue:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount); fclose(f); #endif @@ -2841,10 +3046,41 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) } else if (!(rf_intra + rf_inter)) { - // This is a trap in case this function is called with cpi->recent_ref_frame_usage[] blank. - cpi->prob_intra_coded = 63; - cpi->prob_last_coded = 128; - cpi->prob_gf_coded = 128; + if (cpi->oxcf.number_of_layers > 1) + { + if (cpi->ref_frame_flags == VP8_LAST_FLAG) + { + cpi->prob_intra_coded = 63; + cpi->prob_last_coded = 255; + cpi->prob_gf_coded = 128; + } + else if (cpi->ref_frame_flags == VP8_GOLD_FLAG) + { + cpi->prob_intra_coded = 63; + cpi->prob_last_coded = 1; + cpi->prob_gf_coded = 255; + } + else if (cpi->ref_frame_flags == VP8_ALT_FLAG) + { + cpi->prob_intra_coded = 63; + cpi->prob_last_coded = 1; + cpi->prob_gf_coded = 1; + } + else + { + cpi->prob_intra_coded = 63; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + } + } + else + { + // This is a trap in case this function is called with + // cpi->recent_ref_frame_usage[] blank. 
+ cpi->prob_intra_coded = 63; + cpi->prob_last_coded = 128; + cpi->prob_gf_coded = 128; + } } else { @@ -2866,32 +3102,33 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi) } // update reference frame costs since we can do better than what we got last frame. - - if (cpi->common.refresh_alt_ref_frame) - { - cpi->prob_intra_coded += 40; - cpi->prob_last_coded = 200; - cpi->prob_gf_coded = 1; - } - else if (cpi->common.frames_since_golden == 0) + if (cpi->oxcf.number_of_layers == 1) { - cpi->prob_last_coded = 214; - cpi->prob_gf_coded = 1; - } - else if (cpi->common.frames_since_golden == 1) - { - cpi->prob_last_coded = 192; - cpi->prob_gf_coded = 220; - } - else if (cpi->source_alt_ref_active) - { - //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden; - cpi->prob_gf_coded -= 20; + if (cpi->common.refresh_alt_ref_frame) + { + cpi->prob_intra_coded += 40; + cpi->prob_last_coded = 200; + cpi->prob_gf_coded = 1; + } + else if (cpi->common.frames_since_golden == 0) + { + cpi->prob_last_coded = 214; + cpi->prob_gf_coded = 1; + } + else if (cpi->common.frames_since_golden == 1) + { + cpi->prob_last_coded = 192; + cpi->prob_gf_coded = 220; + } + else if (cpi->source_alt_ref_active) + { + //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden; + cpi->prob_gf_coded -= 20; - if (cpi->prob_gf_coded < 10) - cpi->prob_gf_coded = 10; + if (cpi->prob_gf_coded < 10) + cpi->prob_gf_coded = 10; + } } - #endif } @@ -3283,7 +3520,6 @@ static void encode_frame_to_data_rate // Enable or disable mode based tweaking of the zbin // For 2 Pass Only used where GF/ARF prediction quality // is above a threshold - cpi->zbin_mode_boost = 0; cpi->zbin_mode_boost_enabled = TRUE; if (cpi->pass == 2) { @@ -3432,6 +3668,19 @@ static void encode_frame_to_data_rate cpi->buffer_level = cpi->bits_off_target; + if (cpi->oxcf.number_of_layers > 1) + { + int i; + + // Propagate bits saved by dropping the frame to higher layers + for 
(i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++) + { + cpi->layer_context[i].bits_off_target + += cpi->av_per_frame_bandwidth; + cpi->layer_context[i].buffer_level = cpi->bits_off_target; + } + } + return; } else @@ -3478,7 +3727,7 @@ static void encode_frame_to_data_rate } // Set an active best quality and if necessary active worst quality - // There is some odd behaviour for one pass here that needs attention. + // There is some odd behavior for one pass here that needs attention. if ( (cpi->pass == 2) || (cpi->ni_frames > 150)) { vp8_clear_system_state(); @@ -3510,13 +3759,14 @@ static void encode_frame_to_data_rate cpi->active_best_quality = kf_high_motion_minq[Q]; } - else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame) + else if (cpi->oxcf.number_of_layers==1 && + (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame)) { // Use the lower of cpi->active_worst_quality and recent // average Q as basis for GF/ARF Q limit unless last frame was // a key frame. if ( (cpi->frames_since_key > 1) && - (cpi->avg_frame_qindex < cpi->active_worst_quality) ) + (cpi->avg_frame_qindex < cpi->active_worst_quality) ) { Q = cpi->avg_frame_qindex; } @@ -3617,13 +3867,17 @@ static void encode_frame_to_data_rate // Set highest allowed value for Zbin over quant if (cm->frame_type == KEY_FRAME) zbin_oq_high = 0; //ZBIN_OQ_MAX/16 - else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active)) - zbin_oq_high = 16; + else if ((cpi->oxcf.number_of_layers == 1) && ((cm->refresh_alt_ref_frame || + (cm->refresh_golden_frame && !cpi->source_alt_ref_active)))) + { + zbin_oq_high = 16; + } else zbin_oq_high = ZBIN_OQ_MAX; - // Setup background Q adjustment for error resilliant mode - if (cpi->cyclic_refresh_mode_enabled) + // Setup background Q adjustment for error resilient mode. + // For multi-layer encodes only enable this for the base layer. 
+ if (cpi->cyclic_refresh_mode_enabled && (cpi->current_layer==0)) cyclic_background_refresh(cpi, Q, 0); vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit); @@ -3756,10 +4010,8 @@ static void encode_frame_to_data_rate if (cpi->prob_skip_false > 250) cpi->prob_skip_false = 250; - if (cpi->is_src_frame_alt_ref) + if (cpi->oxcf.number_of_layers == 1 && cpi->is_src_frame_alt_ref) cpi->prob_skip_false = 1; - - } #if 0 @@ -4111,9 +4363,10 @@ static void encode_frame_to_data_rate } // Update the GF useage maps. - // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter - // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter - vp8_update_gf_useage_maps(cpi, cm, &cpi->mb); + // This is done after completing the compression of a frame when all + // modes etc. are finalized but before loop filter + if (cpi->oxcf.number_of_layers == 1) + vp8_update_gf_useage_maps(cpi, cm, &cpi->mb); if (cm->frame_type == KEY_FRAME) cm->refresh_last_frame = 1; @@ -4179,6 +4432,13 @@ static void encode_frame_to_data_rate cpi->total_byte_count += (*size); cpi->projected_frame_size = (*size) << 3; + if (cpi->oxcf.number_of_layers > 1) + { + int i; + for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++) + cpi->layer_context[i].total_byte_count += (*size); + } + if (!active_worst_qchanged) vp8_update_rate_correction_factors(cpi, 2); @@ -4194,7 +4454,8 @@ static void encode_frame_to_data_rate cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2; // Keep a record from which we can calculate the average Q excluding GF updates and key frames - if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame) + if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) || + (!cm->refresh_golden_frame && !cm->refresh_alt_ref_frame))) { cpi->ni_frames++; @@ -4245,7 +4506,7 @@ 
static void encode_frame_to_data_rate #endif - // Set the count for maximum consequative dropped frames based upon the ratio of + // Set the count for maximum consecutive dropped frames based upon the ratio of // this frame size to the target average per frame bandwidth. // (cpi->av_per_frame_bandwidth > 0) is just a sanity check to prevent / 0. if (cpi->drop_frames_allowed && (cpi->av_per_frame_bandwidth > 0)) @@ -4270,13 +4531,32 @@ static void encode_frame_to_data_rate cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32; // Actual bits spent - cpi->total_actual_bits += cpi->projected_frame_size; + cpi->total_actual_bits += cpi->projected_frame_size; // Debug stats cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size); cpi->buffer_level = cpi->bits_off_target; + // Propagate values to higher temporal layers + if (cpi->oxcf.number_of_layers > 1) + { + int i; + + for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++) + { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + int bits_off_for_this_layer = lc->target_bandwidth / lc->frame_rate + - cpi->projected_frame_size; + + lc->bits_off_target += bits_off_for_this_layer; + + lc->total_actual_bits += cpi->projected_frame_size; + lc->total_target_vs_actual += bits_off_for_this_layer; + lc->buffer_level = lc->bits_off_target; + } + } + // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames if (cm->frame_type == KEY_FRAME) { @@ -4322,7 +4602,7 @@ static void encode_frame_to_data_rate vp8_clear_system_state(); //__asm emms; - if (cpi->twopass.total_left_stats.coded_error != 0.0) + if (cpi->twopass.total_left_stats->coded_error != 0.0) fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d" "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f" "%10.3f %8d\n", @@ -4340,9 +4620,9 @@ static void encode_frame_to_data_rate cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, 
(int)cpi->twopass.bits_left, - cpi->twopass.total_left_stats.coded_error, + cpi->twopass.total_left_stats->coded_error, (double)cpi->twopass.bits_left / - cpi->twopass.total_left_stats.coded_error, + cpi->twopass.total_left_stats->coded_error, cpi->tot_recode_hits); else fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d" @@ -4362,7 +4642,7 @@ static void encode_frame_to_data_rate cm->frame_type, cpi->gfu_boost, cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left, - cpi->twopass.total_left_stats.coded_error, + cpi->twopass.total_left_stats->coded_error, cpi->tot_recode_hits); fclose(f); @@ -4675,7 +4955,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cm->refresh_golden_frame = 0; cm->refresh_last_frame = 0; cm->show_frame = 0; - cpi->source_alt_ref_pending = FALSE; // Clear Pending altf Ref flag. + cpi->source_alt_ref_pending = FALSE; // Clear Pending alt Ref flag. cpi->is_src_frame_alt_ref = 0; } } @@ -4727,6 +5007,13 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon return -1; } + // Restore layer specific context if necessary + if (cpi->oxcf.number_of_layers > 1) + { + restore_layer_context (cpi, + cpi->oxcf.layer_id[cm->current_video_frame % cpi->oxcf.periodicity]); + } + if (cpi->source->ts_start < cpi->first_time_stamp_ever) { cpi->first_time_stamp_ever = cpi->source->ts_start; @@ -4734,7 +5021,16 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon } // adjust frame rates based on timestamps given - if (!cm->refresh_alt_ref_frame) + if (cpi->oxcf.number_of_layers > 1 ) + { + vp8_new_frame_rate ( + cpi, cpi->layer_context[cpi->current_layer].frame_rate); + + cpi->last_time_stamp_seen = cpi->source->ts_start; + cpi->last_end_time_stamp_seen = cpi->source->ts_end; + + } + else if (!cm->refresh_alt_ref_frame) { int64_t this_duration; int step = 0; @@ -4786,7 +5082,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, 
unsigned lon if (cpi->compressor_speed == 2) { - check_gf_quality(cpi); + if (cpi->oxcf.number_of_layers == 1) + check_gf_quality(cpi); vpx_usec_timer_start(&tsctimer); vpx_usec_timer_start(&ticktimer); } @@ -4893,6 +5190,10 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon } + // Save layer specific state + if (cpi->oxcf.number_of_layers > 1) + save_layer_context (cpi); + vpx_usec_timer_mark(&cmptimer); cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer); @@ -4922,7 +5223,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon int y_samples = orig->y_height * orig->y_width ; int uv_samples = orig->uv_height * orig->uv_width ; int t_samples = y_samples + 2 * uv_samples; - int64_t sq_error; + int64_t sq_error, sq_error2; ye = calc_plane_error(orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height, @@ -4964,14 +5265,14 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height, IF_RTCD(&cpi->rtcd.variance)); - sq_error = ye + ue + ve; + sq_error2 = ye + ue + ve; - frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error); + frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2); cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye); cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue); cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve); - cpi->total_sq_error2 += sq_error; + cpi->total_sq_error2 += sq_error2; cpi->totalp += frame_psnr2; frame_ssim2 = vp8_calc_ssim(cpi->Source, @@ -4981,6 +5282,24 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cpi->summed_quality += frame_ssim2 * weight; cpi->summed_weights += weight; + if (cpi->oxcf.number_of_layers > 1) + { + int i; + + for (i=cpi->current_layer; + i<cpi->oxcf.number_of_layers; i++) + { + cpi->frames_in_layer[i]++; + + cpi->bytes_in_layer[i] += *size; + cpi->sum_psnr[i] += frame_psnr; + 
cpi->sum_psnr_p[i] += frame_psnr2; + cpi->total_error2[i] += sq_error; + cpi->total_error2_p[i] += sq_error2; + cpi->sum_ssim[i] += frame_ssim2 * weight; + cpi->sum_weights[i] += weight; + } + } } } @@ -4989,10 +5308,30 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon double y, u, v, frame_all; frame_all = vp8_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v, IF_RTCD(&cpi->rtcd.variance)); - cpi->total_ssimg_y += y; - cpi->total_ssimg_u += u; - cpi->total_ssimg_v += v; - cpi->total_ssimg_all += frame_all; + + if (cpi->oxcf.number_of_layers > 1) + { + int i; + + for (i=cpi->current_layer; + i<cpi->oxcf.number_of_layers; i++) + { + if (!cpi->b_calculate_psnr) + cpi->frames_in_layer[i]++; + + cpi->total_ssimg_y_in_layer[i] += y; + cpi->total_ssimg_u_in_layer[i] += u; + cpi->total_ssimg_v_in_layer[i] += v; + cpi->total_ssimg_all_in_layer[i] += frame_all; + } + } + else + { + cpi->total_ssimg_y += y; + cpi->total_ssimg_u += u; + cpi->total_ssimg_v += v; + cpi->total_ssimg_all += frame_all; + } } } diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index ee519fad0..6678c15fb 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -56,6 +56,8 @@ #define VP8_TEMPORAL_ALT_REF 1 #endif +#define MAX_PERIODICITY 16 + typedef struct { int kf_indicated; @@ -238,6 +240,52 @@ enum BLOCK_MAX_SEGMENTS }; +typedef struct +{ + // Layer configuration + double frame_rate; + int target_bandwidth; + + // Layer specific coding parameters + int starting_buffer_level; + int optimal_buffer_level; + int maximum_buffer_size; + + int avg_frame_size_for_layer; + + int buffer_level; + int bits_off_target; + + long long total_actual_bits; + int total_target_vs_actual; + + int worst_quality; + int active_worst_quality; + int best_quality; + int active_best_quality; + + int ni_av_qi; + int ni_tot_qi; + int ni_frames; + int avg_frame_qindex; + + double rate_correction_factor; + double key_frame_rate_correction_factor; + double 
gf_rate_correction_factor; + + int zbin_over_quant; + + int inter_frame_target; + INT64 total_byte_count; + + int filter_level; + + int last_frame_percent_intra; + + int count_mb_ref_frame_usage[MAX_REF_FRAMES]; + +} LAYER_CONTEXT; + typedef struct VP8_COMP { @@ -368,7 +416,7 @@ typedef struct VP8_COMP int buffered_mode; - int buffer_level; + int64_t buffer_level; int bits_off_target; int rolling_target_bits; @@ -610,6 +658,25 @@ typedef struct VP8_COMP int force_next_frame_intra; /* force next frame to intra when kf_auto says so */ int droppable; + + // Coding layer state variables + unsigned int current_layer; + LAYER_CONTEXT layer_context[MAX_LAYERS]; + + long long frames_in_layer[MAX_LAYERS]; + long long bytes_in_layer[MAX_LAYERS]; + double sum_psnr[MAX_LAYERS]; + double sum_psnr_p[MAX_LAYERS]; + double total_error2[MAX_LAYERS]; + double total_error2_p[MAX_LAYERS]; + double sum_ssim[MAX_LAYERS]; + double sum_weights[MAX_LAYERS]; + + double total_ssimg_y_in_layer[MAX_LAYERS]; + double total_ssimg_u_in_layer[MAX_LAYERS]; + double total_ssimg_v_in_layer[MAX_LAYERS]; + double total_ssimg_all_in_layer[MAX_LAYERS]; + } VP8_COMP; void control_data_rate(VP8_COMP *cpi); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 1e602138f..62e644dea 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -157,7 +157,7 @@ static int pick_intra4x4block( rate = mode_costs[mode]; RECON_INVOKE(&rtcd->common->recon, intra4x4_predict) - (b, mode, b->predictor); + (b, mode, b->predictor, 16); distortion = get_prediction_error(be, b, &rtcd->variance); this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); @@ -471,7 +471,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, else skip_mode[GOLDEN_FRAME] = 1; - if (cpi->ref_frame_flags & VP8_ALT_FLAG && cpi->source_alt_ref_active) + if ((cpi->ref_frame_flags & VP8_ALT_FLAG) && + (cpi->source_alt_ref_active || cpi->oxcf.number_of_layers > 1)) { YV12_BUFFER_CONFIG *alt_yv12 = 
&cpi->common.yv12_fb[cpi->common.alt_fb_idx]; y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset; diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 46e1d9dd9..1ac905021 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -436,7 +436,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi) } -// Do the best we can to define the parameteres for the next GF based on what information we have available. +// Do the best we can to define the parameters for the next GF based on what +// information we have available. static void calc_gf_params(VP8_COMP *cpi) { int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; @@ -607,6 +608,11 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { int min_frame_target; int Adjustment; + int old_per_frame_bandwidth = cpi->per_frame_bandwidth; + + if ( cpi->current_layer > 0) + cpi->per_frame_bandwidth = + cpi->layer_context[cpi->current_layer].avg_frame_size_for_layer; min_frame_target = 0; @@ -622,7 +628,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) // Special alt reference frame case - if (cpi->common.refresh_alt_ref_frame) + if((cpi->common.refresh_alt_ref_frame) && (cpi->oxcf.number_of_layers == 1)) { if (cpi->pass == 2) { @@ -789,7 +795,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) // Decide whether or not we need to adjust the frame data rate target. // // If we are are below the optimal buffer fullness level and adherence - // to buffering contraints is important to the end useage then adjust + // to buffering constraints is important to the end usage then adjust // the per frame target. if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level)) @@ -812,12 +818,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi) percent_low = 0; // lower the target bandwidth for this frame. 
- cpi->this_frame_target -= (cpi->this_frame_target * percent_low) - / 200; + cpi->this_frame_target -= + (cpi->this_frame_target * percent_low) / 200; // Are we using allowing control of active_worst_allowed_q // according to buffer level. - if (cpi->auto_worst_q) + if (cpi->auto_worst_q && cpi->ni_frames > 150) { int critical_buffer_level; @@ -834,7 +840,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target; } - // For local file playback short term buffering contraints + // For local file playback short term buffering constraints // are less of an issue else { @@ -905,11 +911,11 @@ static void calc_pframe_target_size(VP8_COMP *cpi) percent_high = 0; cpi->this_frame_target += (cpi->this_frame_target * - percent_high) / 200; - + percent_high) / 200; - // Are we allowing control of active_worst_allowed_q according to bufferl level. - if (cpi->auto_worst_q) + // Are we allowing control of active_worst_allowed_q according + // to buffer level. + if (cpi->auto_worst_q && cpi->ni_frames > 150) { // When using the relaxed buffer model stick to the user specified value cpi->active_worst_quality = cpi->ni_av_qi; @@ -1112,6 +1118,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) } } + + cpi->per_frame_bandwidth = old_per_frame_bandwidth; } @@ -1421,8 +1429,14 @@ void vp8_adjust_key_frame_context(VP8_COMP *cpi) * bits allocated than those following other gfs. */ overspend = (cpi->projected_frame_size - cpi->per_frame_bandwidth); - cpi->kf_overspend_bits += overspend * 7 / 8; - cpi->gf_overspend_bits += overspend * 1 / 8; + + if (cpi->oxcf.number_of_layers > 1) + cpi->kf_overspend_bits += overspend; + else + { + cpi->kf_overspend_bits += overspend * 7 / 8; + cpi->gf_overspend_bits += overspend * 1 / 8; + } /* Work out how much to try and recover per frame. 
*/ cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits @@ -1452,7 +1466,9 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, } else { - if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame) + if (cpi->oxcf.number_of_layers > 1 || + cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame) { *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8; *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 124cfe564..fdb519c19 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -631,7 +631,7 @@ static int rd_pick_intra4x4block( rate = bmode_costs[mode]; RECON_INVOKE(&cpi->rtcd.common->recon, intra4x4_predict) - (b, mode, b->predictor); + (b, mode, b->predictor, 16); ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16); x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b(be, b); @@ -660,8 +660,8 @@ static int rd_pick_intra4x4block( } b->bmi.as_mode = (B_PREDICTION_MODE)(*best_mode); - IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32); - RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, + best_predictor, 16, *(b->base_dst) + b->dst, b->dst_stride); return best_rd; } diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 5f2e6a354..85b8113d7 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -64,7 +64,6 @@ VP8_COMMON_SRCS-yes += common/mbpitch.c VP8_COMMON_SRCS-yes += common/modecont.c VP8_COMMON_SRCS-yes += common/modecontext.c VP8_COMMON_SRCS-yes += common/quant_common.c -VP8_COMMON_SRCS-yes += common/recon.c VP8_COMMON_SRCS-yes += common/reconinter.c VP8_COMMON_SRCS-yes += common/reconintra.c VP8_COMMON_SRCS-yes += common/reconintra4x4.c @@ -125,7 +124,6 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM) 
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/recon_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM) @@ -143,16 +141,10 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilter_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilter_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon2b_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon4b_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/reconb_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_1_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict4x4_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x4_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM) -VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index ca4e505dc..f8336240c 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -218,6 +218,25 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, } #endif + RANGE_CHECK(cfg, ts_number_layers, 1, 
5); + + if (cfg->ts_number_layers > 1) + { + int i; + RANGE_CHECK_HI(cfg, ts_periodicity, 16); + + for (i=1; i<cfg->ts_number_layers; i++) + if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1]) + ERROR("ts_target_bitrate entries are not strictly increasing"); + + RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1); + for (i=cfg->ts_number_layers-2; i>0; i--) + if (cfg->ts_rate_decimator[i-1] != 2*cfg->ts_rate_decimator[i]) + ERROR("ts_rate_decimator factors are not powers of 2"); + + RANGE_CHECK_HI(cfg, ts_layer_id[i], cfg->ts_number_layers-1); + } + return VPX_CODEC_OK; } @@ -253,14 +272,15 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->Width = cfg.g_w; oxcf->Height = cfg.g_h; /* guess a frame rate if out of whack, use 30 */ - oxcf->frame_rate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num); + oxcf->frame_rate = (double)(cfg.g_timebase.den) / + (double)(cfg.g_timebase.num); if (oxcf->frame_rate > 180) { oxcf->frame_rate = 30; } - oxcf->error_resilient_mode = cfg.g_error_resilient; + oxcf->error_resilient_mode = cfg.g_error_resilient; switch (cfg.g_pass) { @@ -277,13 +297,13 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, if (cfg.g_pass == VPX_RC_FIRST_PASS) { - oxcf->allow_lag = 0; - oxcf->lag_in_frames = 0; + oxcf->allow_lag = 0; + oxcf->lag_in_frames = 0; } else { - oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; - oxcf->lag_in_frames = cfg.g_lag_in_frames; + oxcf->allow_lag = (cfg.g_lag_in_frames) > 0; + oxcf->lag_in_frames = cfg.g_lag_in_frames; } oxcf->allow_df = (cfg.rc_dropframe_thresh > 0); @@ -295,59 +315,71 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, if (cfg.rc_end_usage == VPX_VBR) { - oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; + oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK; } else if (cfg.rc_end_usage == VPX_CBR) { - oxcf->end_usage = USAGE_STREAM_FROM_SERVER; + oxcf->end_usage = USAGE_STREAM_FROM_SERVER; } else if (cfg.rc_end_usage == VPX_CQ) { - oxcf->end_usage = 
USAGE_CONSTRAINED_QUALITY; + oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; } - oxcf->target_bandwidth = cfg.rc_target_bitrate; + oxcf->target_bandwidth = cfg.rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; - oxcf->best_allowed_q = cfg.rc_min_quantizer; - oxcf->worst_allowed_q = cfg.rc_max_quantizer; - oxcf->cq_level = vp8_cfg.cq_level; + oxcf->best_allowed_q = cfg.rc_min_quantizer; + oxcf->worst_allowed_q = cfg.rc_max_quantizer; + oxcf->cq_level = vp8_cfg.cq_level; oxcf->fixed_q = -1; - oxcf->under_shoot_pct = cfg.rc_undershoot_pct; - oxcf->over_shoot_pct = cfg.rc_overshoot_pct; + oxcf->under_shoot_pct = cfg.rc_undershoot_pct; + oxcf->over_shoot_pct = cfg.rc_overshoot_pct; - oxcf->maximum_buffer_size = cfg.rc_buf_sz; - oxcf->starting_buffer_level = cfg.rc_buf_initial_sz; - oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz; + oxcf->maximum_buffer_size = cfg.rc_buf_sz; + oxcf->starting_buffer_level = cfg.rc_buf_initial_sz; + oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz; - oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct; + oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct; oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct; oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct; - oxcf->auto_key = cfg.kf_mode == VPX_KF_AUTO - && cfg.kf_min_dist != cfg.kf_max_dist; - //oxcf->kf_min_dist = cfg.kf_min_dis; - oxcf->key_freq = cfg.kf_max_dist; + oxcf->auto_key = cfg.kf_mode == VPX_KF_AUTO + && cfg.kf_min_dist != cfg.kf_max_dist; + //oxcf->kf_min_dist = cfg.kf_min_dis; + oxcf->key_freq = cfg.kf_max_dist; + + oxcf->number_of_layers = cfg.ts_number_layers; + oxcf->periodicity = cfg.ts_periodicity; + + if (oxcf->number_of_layers > 1) + { + memcpy (oxcf->target_bitrate, cfg.ts_target_bitrate, + sizeof(cfg.ts_target_bitrate)); + memcpy (oxcf->rate_decimator, cfg.ts_rate_decimator, + sizeof(cfg.ts_rate_decimator)); + memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id)); + } 
//oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile; //strcpy(oxcf->first_pass_file, cfg.g_firstpass_file); - oxcf->cpu_used = vp8_cfg.cpu_used; - oxcf->encode_breakout = vp8_cfg.static_thresh; - oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; - oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; - oxcf->Sharpness = vp8_cfg.Sharpness; - oxcf->token_partitions = vp8_cfg.token_partitions; + oxcf->cpu_used = vp8_cfg.cpu_used; + oxcf->encode_breakout = vp8_cfg.static_thresh; + oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref; + oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity; + oxcf->Sharpness = vp8_cfg.Sharpness; + oxcf->token_partitions = vp8_cfg.token_partitions; - oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; - oxcf->output_pkt_list = vp8_cfg.pkt_list; + oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in; + oxcf->output_pkt_list = vp8_cfg.pkt_list; - oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames; - oxcf->arnr_strength = vp8_cfg.arnr_strength; - oxcf->arnr_type = vp8_cfg.arnr_type; + oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames; + oxcf->arnr_strength = vp8_cfg.arnr_strength; + oxcf->arnr_type = vp8_cfg.arnr_type; - oxcf->tuning = vp8_cfg.tuning; + oxcf->tuning = vp8_cfg.tuning; /* printf("Current VP8 Settings: \n"); @@ -515,7 +547,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx) cfg = &ctx->priv->alg_priv->cfg; - /* Select the extra vp6 configuration table based on the current + /* Select the extra vp8 configuration table based on the current * usage value. If the current usage value isn't found, use the * values for usage case 0. */ @@ -1143,6 +1175,12 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 1, /* g_delete_first_pass_file */ "vp8.fpf" /* first pass filename */ #endif + + 1, /* ts_number_layers */ + {0}, /* ts_target_bitrate */ + {0}, /* ts_rate_decimator */ + 0, /* ts_periodicity */ + {0}, /* ts_layer_id */ }}, { -1, {NOT_IMPLEMENTED}} }; |