Diffstat (limited to 'vp8')
-rw-r--r--  vp8/common/arm/arm_systemdependent.c | 9
-rw-r--r--  vp8/common/arm/armv6/dc_only_idct_add_v6.asm | 33
-rw-r--r--  vp8/common/arm/armv6/idct_v6.asm | 501
-rw-r--r--  vp8/common/arm/armv6/recon_v6.asm | 281
-rw-r--r--  vp8/common/arm/idct_arm.h | 8
-rw-r--r--  vp8/common/arm/neon/dc_only_idct_add_neon.asm | 33
-rw-r--r--  vp8/common/arm/neon/recon16x16mb_neon.asm | 131
-rw-r--r--  vp8/common/arm/neon/recon2b_neon.asm | 54
-rw-r--r--  vp8/common/arm/neon/recon4b_neon.asm | 69
-rw-r--r--  vp8/common/arm/neon/recon_neon.c | 29
-rw-r--r--  vp8/common/arm/neon/reconb_neon.asm | 61
-rw-r--r--  vp8/common/arm/neon/shortidct4x4llm_1_neon.asm | 67
-rw-r--r--  vp8/common/arm/neon/shortidct4x4llm_neon.asm | 53
-rw-r--r--  vp8/common/arm/recon_arm.h | 29
-rw-r--r--  vp8/common/generic/systemdependent.c | 7
-rw-r--r--  vp8/common/idct.h | 14
-rw-r--r--  vp8/common/idctllm.c | 67
-rw-r--r--  vp8/common/invtrans.c | 52
-rw-r--r--  vp8/common/onyx.h | 15
-rw-r--r--  vp8/common/recon.h | 38
-rw-r--r--  vp8/common/reconinter.c | 158
-rw-r--r--  vp8/common/reconinter.h | 1
-rw-r--r--  vp8/common/reconintra.c | 10
-rw-r--r--  vp8/common/reconintra4x4.c | 207
-rw-r--r--  vp8/common/x86/idct_x86.h | 4
-rw-r--r--  vp8/common/x86/idctllm_mmx.asm | 295
-rw-r--r--  vp8/common/x86/idctllm_sse2.asm | 130
-rw-r--r--  vp8/common/x86/recon_mmx.asm | 47
-rw-r--r--  vp8/common/x86/recon_sse2.asm | 115
-rw-r--r--  vp8/common/x86/recon_x86.h | 12
-rw-r--r--  vp8/common/x86/x86_systemdependent.c | 4
-rw-r--r--  vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm | 27
-rw-r--r--  vp8/decoder/arm/armv6/dequant_idct_v6.asm | 26
-rw-r--r--  vp8/decoder/arm/armv6/idct_blk_v6.c | 104
-rw-r--r--  vp8/decoder/arm/dequantize_arm.h | 2
-rw-r--r--  vp8/decoder/arm/neon/dequant_idct_neon.asm | 34
-rw-r--r--  vp8/decoder/arm/neon/idct_blk_neon.c | 141
-rw-r--r--  vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm | 48
-rw-r--r--  vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm | 44
-rw-r--r--  vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm | 59
-rw-r--r--  vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm | 51
-rw-r--r--  vp8/decoder/decodframe.c | 43
-rw-r--r--  vp8/decoder/dequantize.c | 62
-rw-r--r--  vp8/decoder/dequantize.h | 14
-rw-r--r--  vp8/decoder/error_concealment.c | 9
-rw-r--r--  vp8/decoder/idct_blk.c | 43
-rw-r--r--  vp8/decoder/reconintra_mt.c | 201
-rw-r--r--  vp8/decoder/reconintra_mt.h | 2
-rw-r--r--  vp8/decoder/threading.c | 22
-rw-r--r--  vp8/decoder/x86/dequantize_mmx.asm | 64
-rw-r--r--  vp8/decoder/x86/idct_blk_mmx.c | 87
-rw-r--r--  vp8/decoder/x86/idct_blk_sse2.c | 119
-rw-r--r--  vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm | 5
-rw-r--r--  vp8/encoder/encodeframe.c | 34
-rw-r--r--  vp8/encoder/encodeintra.c | 9
-rw-r--r--  vp8/encoder/encodemb.c | 69
-rw-r--r--  vp8/encoder/onyx_if.c | 511
-rw-r--r--  vp8/encoder/onyx_int.h | 69
-rw-r--r--  vp8/encoder/pickinter.c | 5
-rw-r--r--  vp8/encoder/ratectrl.c | 44
-rw-r--r--  vp8/encoder/rdopt.c | 6
-rw-r--r--  vp8/vp8_common.mk | 8
-rw-r--r--  vp8/vp8_cx_iface.c | 110
63 files changed, 2073 insertions(+), 2533 deletions(-)
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index 2b45afe4b..29288519f 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -45,7 +45,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
- rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6;
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
@@ -64,9 +63,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6;
- rtcd->recon.recon = vp8_recon_b_armv6;
- rtcd->recon.recon2 = vp8_recon2b_armv6;
- rtcd->recon.recon4 = vp8_recon4b_armv6;
}
#endif
@@ -82,7 +78,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon;
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
- rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon;
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
@@ -99,10 +94,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_neon;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon;
- rtcd->recon.recon = vp8_recon_b_neon;
- rtcd->recon.recon2 = vp8_recon2b_neon;
- rtcd->recon.recon4 = vp8_recon4b_neon;
- rtcd->recon.recon_mb = vp8_recon_mb_neon;
rtcd->recon.build_intra_predictors_mby =
vp8_build_intra_predictors_mby_neon;
rtcd->recon.build_intra_predictors_mby_s =
diff --git a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
index e0660e9fd..9aa659fa7 100644
--- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
+++ b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm
@@ -11,25 +11,27 @@
AREA |.text|, CODE, READONLY
-;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
-; unsigned char *dst_ptr, int pitch, int stride)
+;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+; int pred_stride, unsigned char *dst_ptr,
+; int dst_stride)
; r0 input_dc
; r1 pred_ptr
-; r2 dest_ptr
-; r3 pitch
-; sp stride
+; r2 pred_stride
+; r3 dst_ptr
+; sp dst_stride
|vp8_dc_only_idct_add_v6| PROC
- stmdb sp!, {r4 - r7, lr}
+ stmdb sp!, {r4 - r7}
add r0, r0, #4 ; input_dc += 4
ldr r12, c0x0000FFFF
- ldr r4, [r1], r3
- ldr r6, [r1], r3
+ ldr r4, [r1], r2
and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
- ldr lr, [sp, #20]
+ ldr r6, [r1], r2
orr r0, r0, r0, lsl #16 ; a1 | a1
+ ldr r12, [sp, #16] ; dst stride
+
uxtab16 r5, r0, r4 ; a1+2 | a1+0
uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
uxtab16 r7, r0, r6
@@ -40,10 +42,10 @@
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
- ldr r4, [r1], r3
+ ldr r4, [r1], r2
+ str r5, [r3], r12
ldr r6, [r1]
- str r5, [r2], lr
- str r7, [r2], lr
+ str r7, [r3], r12
uxtab16 r5, r0, r4
uxtab16 r4, r0, r4, ror #8
@@ -55,10 +57,11 @@
usat16 r6, #8, r6
orr r5, r5, r4, lsl #8
orr r7, r7, r6, lsl #8
- str r5, [r2], lr
- str r7, [r2]
+ str r5, [r3], r12
+ str r7, [r3]
- ldmia sp!, {r4 - r7, pc}
+ ldmia sp!, {r4 - r7}
+ bx lr
ENDP ; |vp8_dc_only_idct_add_v6|
diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm
index 27215afcd..b4d44cbeb 100644
--- a/vp8/common/arm/armv6/idct_v6.asm
+++ b/vp8/common/arm/armv6/idct_v6.asm
@@ -9,337 +9,194 @@
;
-; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
- EXPORT |vp8_short_idct4x4llm_1_v6|
- EXPORT |vp8_short_idct4x4llm_v6|
- EXPORT |vp8_short_idct4x4llm_v6_scott|
EXPORT |vp8_short_idct4x4llm_v6_dual|
AREA |.text|, CODE, READONLY
-;********************************************************************************
-;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
-;* r0 INT16 * input
-;* r1 INT16 * output
-;* r2 INT32 pitch
-;* bench: 3/5
-;********************************************************************************
-
-|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
- ;
- ldrsh r0, [r0] ; load input[0] 1, r0 un 2
- add r0, r0, #4 ; 1 +4
- stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
- mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
- pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
- mov r5, r4 ; expand expand
-
- strd r4, [r1], r2 ; *output = r0, post inc 1
- strd r4, [r1], r2 ; 1
- strd r4, [r1], r2 ; 1
- strd r4, [r1] ; 1
- ;
- ldmia sp!, {r4, r5, pc} ; replace vars, return restore
- ENDP ; |vp8_short_idct4x4llm_1_v6|
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
-;* r0 INT16 * input
-;* r1 INT16 * output
-;* r2 INT32 pitch
-;* bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
- ;
- stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
- ;
- mov r4, #0x00004E00 ; 1 cst
- orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
- mov r5, #0x00008A00 ; 1 cst
- orr r5, r5, #0x0000008C ; sinpi8sqrt2
- ;
- mov r6, #4 ; i=4 1 i
-loop1 ;
- ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
- ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
- ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
- ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
- smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
- smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
- add r9, r7, r8 ; a1 = [0] + [8] 1 a1
- sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
- add r11, r3, r11 ; temp2 1
- rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
- smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
- smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
- add r8, r7, r11 ; b1 + c1 1 b+c
- strh r8, [r1, r2] ; out[pitch] = b1+c1 1
- sub r7, r7, r11 ; b1 - c1 1 b-c
- add r10, r12, r10 ; temp1 1
- add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
- add r10, r9, r3 ; a1 + d1 1 a+d
- sub r3, r9, r3 ; a1 - d1 1 a-d
- add r8, r2, r2 ; pitch * 2 1 p*2
- strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
- add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
- strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
- subs r6, r6, #1 ; i-- 1 --
- strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
- bne loop1 ; if i>0, continue
- ;
- sub r1, r1, #8 ; set up out for next loop 1 -4
- ; for this iteration, input=prev output
- mov r6, #4 ; i=4 1 i
-; b returnfull
-loop2 ;
- ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
- ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
- ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
- ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
- smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
- smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
- add r7, r0, r3 ; a1 = [0] + [2] 1 a1
- sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
- add r10, r8, r10 ; temp2 1
- rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
- smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
- smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
- add r3, r0, r9 ; b1+c1 1 b+c
- add r3, r3, #4 ; b1+c1+4 1 +4
- add r10, r11, r10 ; temp1 1
- mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
- strh r3, [r1, #2] ; out[1] = b1+c1 1
- add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
- add r3, r7, r10 ; a1+d1 1 a+d
- add r3, r3, #4 ; a1+d1+4 1 +4
- sub r7, r7, r10 ; a1-d1 1 a-d
- add r7, r7, #4 ; a1-d1+4 1 +4
- mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
- mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
- strh r7, [r1, #6] ; out[3] = a1-d1 1
- sub r0, r0, r9 ; b1-c1 1 b-c
- add r0, r0, #4 ; b1-c1+4 1 +4
- subs r6, r6, #1 ; i-- 1 --
- mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
- strh r0, [r1, #4] ; out[2] = b1-c1 1
- strh r3, [r1], r2 ; out[0] = a1+d1 1
-; add r1, r1, r2 ; out += pitch 1 ++
- bne loop2 ; if i>0, continue
-returnfull ;
- ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
- ENDP
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
-;* r0 INT16 * input
-;* r1 INT16 * output
-;* r2 INT32 pitch
-;* bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
-; mov r0, #0 ;
-; ldr r0, [r0] ;
- stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
- ;
- mov r3, #0x00004E00 ; cos
- orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
- mov r4, #0x00008A00 ; sin
- orr r4, r4, #0x0000008C ; sinpi8sqrt2
- ;
- mov r5, #0x2 ; i i
- ;
-short_idct4x4llm_v6_scott_loop1 ;
- ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
- ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
- ;
- smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
- smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
- ;
- smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2
- smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
- ;
- add r6, r6, r7 ; partial c1 lt1-lt2
- add r12, r12, r14 ; partial d1 l2t2+l2t1
- ;
- smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
- smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
- ;
- smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
- smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
- ;
- add r7, r14, r7 ; partial c1_2 ht1+ht2
- sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
- ;
- pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
- pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
- ;
- usub16 r6, r6, r10 ; c1_2 | c1_1 c
- uadd16 r12, r12, r11 ; d1_2 | d1_1 d
- ;
- ldr r10, [r0, #0] ; i1 | i0 1,0
- ldr r11, [r0, #(8*2)] ; i9 | i10 9,10
- ;
-;;;;;; add r0, r0, #0x4 ; +4
-;;;;;; add r1, r1, #0x4 ; +4
- ;
- uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
- usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
- ;
- uadd16 r7, r8, r12 ; a1 + d1 pair a+d
- usub16 r14, r8, r12 ; a1 - d1 pair a-d
- ;
- str r7, [r1] ; op[0] = a1 + d1
- str r14, [r1, r2] ; op[pitch*3] = a1 - d1
- ;
- add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
- add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
- ;
- subs r5, r5, #0x1 ; --
- bne short_idct4x4llm_v6_scott_loop1 ;
- ;
- sub r1, r1, #16 ; reset output ptr
- mov r5, #0x4 ;
- mov r0, r1 ; input = output
- ;
-short_idct4x4llm_v6_scott_loop2 ;
- ;
- subs r5, r5, #0x1 ;
- bne short_idct4x4llm_v6_scott_loop2 ;
- ;
- ldmia sp!, {r4 - r11, pc} ;
- ENDP ;
- ;
-;********************************************************************************
-;********************************************************************************
-;********************************************************************************
-
-;********************************************************************************
-;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
-;* r0 INT16 * input
-;* r1 INT16 * output
-;* r2 INT32 pitch
-;* bench:
-;********************************************************************************
-
-|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
- ;
- stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
- mov r3, #0x00004E00 ; cos
- orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
- mov r4, #0x00008A00 ; sin
- orr r4, r4, #0x0000008C ; sinpi8sqrt2
- mov r5, #0x2 ; i=2 i
+; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
+; unsigned char *dst, int stride)
+; r0 short* input
+; r1 unsigned char* pred
+; r2 int pitch
+; r3 unsigned char* dst
+; sp int stride
+
+|vp8_short_idct4x4llm_v6_dual| PROC
+ stmdb sp!, {r4-r11, lr}
+
+ sub sp, sp, #4
+
+ mov r4, #0x00008A00 ; sin
+ orr r4, r4, #0x0000008C ; sinpi8sqrt2
+
+ mov r5, #0x00004E00 ; cos
+ orr r5, r5, #0x0000007B ; cospi8sqrt2minus1
+ orr r5, r5, #1<<31 ; loop counter on top bit
+
loop1_dual
- ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
- ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
- ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
-
- smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
- smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
- smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
- pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
- smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
+ ldr r6, [r0, #(4*2)] ; i5 | i4
+ ldr r12, [r0, #(12*2)] ; i13|i12
+ ldr r14, [r0, #(8*2)] ; i9 | i8
+
+ smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
+ smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
+ smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
+
+ smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
+ pkhtb r7, r9, r7, asr #16 ; 5c | 4c
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
- uadd16 r6, r6, r7 ; 5c+5 | 4c+4
- smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
- smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
- smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
- subs r5, r5, #0x1 ; i-- --
- pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
- ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
+ uadd16 r6, r6, r7 ; 5c+5 | 4c+4
+
+ smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
+ smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
+ smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
+
+ subs r5, r5, #1<<31 ; i--
+
+ pkhtb r9, r11, r9, asr #16 ; 13c | 12c
+ ldr r11, [r0] ; i1 | i0
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
- uadd16 r7, r12, r9 ; 13c+13 | 12c+12
- usub16 r7, r8, r7 ; c c
- uadd16 r6, r6, r10 ; d d
- uadd16 r10, r11, r14 ; a a
- usub16 r8, r11, r14 ; b b
- uadd16 r9, r10, r6 ; a+d a+d
- usub16 r10, r10, r6 ; a-d a-d
- uadd16 r6, r8, r7 ; b+c b+c
- usub16 r7, r8, r7 ; b-c b-c
- str r6, [r1, r2] ; o5 | o4
- add r6, r2, r2 ; pitch * 2 p2
- str r7, [r1, r6] ; o9 | o8
- add r6, r6, r2 ; pitch * 3 p3
- str r10, [r1, r6] ; o13 | o12
- str r9, [r1], #0x4 ; o1 | o0 ++
- bne loop1_dual ;
- mov r5, #0x2 ; i=2 i
- sub r0, r1, #8 ; reset input/output i/o
+ uadd16 r7, r12, r9 ; 13c+13 | 12c+12
+
+ usub16 r7, r8, r7 ; c
+ uadd16 r6, r6, r10 ; d
+ uadd16 r10, r11, r14 ; a
+ usub16 r8, r11, r14 ; b
+
+ uadd16 r9, r10, r6 ; a+d
+ usub16 r10, r10, r6 ; a-d
+ uadd16 r6, r8, r7 ; b+c
+ usub16 r7, r8, r7 ; b-c
+
+ ; use input buffer to store intermediate results
+ str r6, [r0, #(4*2)] ; o5 | o4
+ str r7, [r0, #(8*2)] ; o9 | o8
+ str r10,[r0, #(12*2)] ; o13|o12
+ str r9, [r0], #4 ; o1 | o0
+
+ bcs loop1_dual
+
+ sub r0, r0, #8 ; reset input/output
+ str r0, [sp]
+
loop2_dual
- ldr r6, [r0, r2] ; i5 | i4 5|4
- ldr r1, [r0] ; i1 | i0 1|0
- ldr r12, [r0, #0x4] ; i3 | i2 3|2
- add r14, r2, #0x4 ; pitch + 2 p+2
- ldr r14, [r0, r14] ; i7 | i6 7|6
- smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
- smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
- smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
- smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
- pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
- pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
- pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 (c) tc1
- pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
- uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
- pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
- uadd16 r10, r11, r9 ; a a
- usub16 r9, r11, r9 ; b b
- pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
- subs r5, r5, #0x1 ; i-- --
- smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
- smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
- smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
- smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
-
- pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
- pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
- uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
- usub16 r12, r8, r6 ; c (o1 | o5) c
- uadd16 r6, r11, r1 ; d (o3 | o7) d
- uadd16 r7, r10, r6 ; a+d a+d
- mov r8, #0x4 ; set up 4's 4
- orr r8, r8, #0x40000 ; 4|4
- usub16 r6, r10, r6 ; a-d a-d
- uadd16 r6, r6, r8 ; a-d+4 3|7
- uadd16 r7, r7, r8 ; a+d+4 0|4
- uadd16 r10, r9, r12 ; b+c b+c
- usub16 r1, r9, r12 ; b-c b-c
- uadd16 r10, r10, r8 ; b+c+4 1|5
- uadd16 r1, r1, r8 ; b-c+4 2|6
- mov r8, r10, asr #19 ; o1 >> 3
- strh r8, [r0, #2] ; o1
- mov r8, r1, asr #19 ; o2 >> 3
- strh r8, [r0, #4] ; o2
- mov r8, r6, asr #19 ; o3 >> 3
- strh r8, [r0, #6] ; o3
- mov r8, r7, asr #19 ; o0 >> 3
- strh r8, [r0], r2 ; o0 +p
- sxth r10, r10 ;
- mov r8, r10, asr #3 ; o5 >> 3
- strh r8, [r0, #2] ; o5
- sxth r1, r1 ;
- mov r8, r1, asr #3 ; o6 >> 3
- strh r8, [r0, #4] ; o6
- sxth r6, r6 ;
- mov r8, r6, asr #3 ; o7 >> 3
- strh r8, [r0, #6] ; o7
- sxth r7, r7 ;
- mov r8, r7, asr #3 ; o4 >> 3
- strh r8, [r0], r2 ; o4 +p
-;;;;; subs r5, r5, #0x1 ; i-- --
- bne loop2_dual ;
- ;
- ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
+
+ ldr r6, [r0, #(4*2)] ; i5 | i4
+ ldr r12,[r0, #(2*2)] ; i3 | i2
+ ldr r14,[r0, #(6*2)] ; i7 | i6
+ ldr r0, [r0, #(0*2)] ; i1 | i0
+
+ smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
+ smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16
+ smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
+ smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16
+
+ pkhbt r11, r6, r0, lsl #16 ; i0 | i4
+ pkhtb r7, r7, r9, asr #16 ; 1c | 5c
+ pkhtb r0, r0, r6, asr #16 ; i1 | i5
+ pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1
+
+ uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2
+ pkhbt r9, r14, r12, lsl #16 ; i2 | i6
+ uadd16 r10, r11, r9 ; a
+ usub16 r9, r11, r9 ; b
+ pkhtb r6, r12, r14, asr #16 ; i3 | i7
+
+ subs r5, r5, #1<<31 ; i--
+
+ smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16
+ smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
+ smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
+ smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
+
+ pkhtb r7, r7, r12, asr #16 ; 3c | 7c
+ pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1
+
+ uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2
+ usub16 r12, r8, r6 ; c (o1 | o5)
+ uadd16 r6, r11, r0 ; d (o3 | o7)
+ uadd16 r7, r10, r6 ; a+d
+
+ mov r8, #4 ; set up 4's
+ orr r8, r8, #0x40000 ; 4|4
+
+ usub16 r6, r10, r6 ; a-d
+ uadd16 r6, r6, r8 ; a-d+4, 3|7
+ uadd16 r7, r7, r8 ; a+d+4, 0|4
+ uadd16 r10, r9, r12 ; b+c
+ usub16 r0, r9, r12 ; b-c
+ uadd16 r10, r10, r8 ; b+c+4, 1|5
+ uadd16 r8, r0, r8 ; b-c+4, 2|6
+
+ ldr lr, [sp, #40] ; dst stride
+
+ ldrb r0, [r1] ; pred p0
+ ldrb r11, [r1, #1] ; pred p1
+ ldrb r12, [r1, #2] ; pred p2
+
+ add r0, r0, r7, asr #19 ; p0 + o0
+ add r11, r11, r10, asr #19 ; p1 + o1
+ add r12, r12, r8, asr #19 ; p2 + o2
+
+ usat r0, #8, r0 ; d0 = clip8(p0 + o0)
+ usat r11, #8, r11 ; d1 = clip8(p1 + o1)
+ usat r12, #8, r12 ; d2 = clip8(p2 + o2)
+
+ add r0, r0, r11, lsl #8 ; |--|--|d1|d0|
+
+ ldrb r11, [r1, #3] ; pred p3
+
+ add r0, r0, r12, lsl #16 ; |--|d2|d1|d0|
+
+ add r11, r11, r6, asr #19 ; p3 + o3
+
+ sxth r7, r7 ;
+ sxth r10, r10 ;
+
+ usat r11, #8, r11 ; d3 = clip8(p3 + o3)
+
+ sxth r8, r8 ;
+ sxth r6, r6 ;
+
+ add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|
+
+ ldrb r12, [r1, r2]! ; pred p4
+ str r0, [r3], lr
+ ldrb r11, [r1, #1] ; pred p5
+
+ add r12, r12, r7, asr #3 ; p4 + o4
+ add r11, r11, r10, asr #3 ; p5 + o5
+
+ usat r12, #8, r12 ; d4 = clip8(p4 + o4)
+ usat r11, #8, r11 ; d5 = clip8(p5 + o5)
+
+ ldrb r7, [r1, #2] ; pred p6
+ ldrb r10, [r1, #3] ; pred p7
+
+ add r12, r12, r11, lsl #8 ; |--|--|d5|d4|
+
+ add r7, r7, r8, asr #3 ; p6 + o6
+ add r10, r10, r6, asr #3 ; p7 + o7
+
+ ldr r0, [sp] ; load input pointer
+
+ usat r7, #8, r7 ; d6 = clip8(p6 + o6)
+ usat r10, #8, r10 ; d7 = clip8(p7 + o7)
+
+ add r12, r12, r7, lsl #16 ; |--|d6|d5|d4|
+ add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|
+
+ str r12, [r3], lr
+ add r0, r0, #16
+ add r1, r1, r2 ; pred + pitch
+
+ bcs loop2_dual
+
+ add sp, sp, #4 ; idct_output buffer
+ ldmia sp!, {r4 - r11, pc}
+
ENDP
END
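
A note on the loop control in the rewritten routine above: the iteration count
is packed into bit 31 of r5, next to the 16-bit cosine constant, so the counter
and the constant share one register (orr r5, r5, #1<<31, then subs/bcs). A
minimal C model of the trick (a sketch, not part of the patch):

    unsigned r5 = 20091u | (1u << 31);  /* cospi8sqrt2minus1 | counter bit */
    for (;;) {
        /* ... one dual-column IDCT pass ... */
        int carry = (r5 >= (1u << 31)); /* SUBS sets carry when no borrow */
        r5 -= 1u << 31;                 /* subs r5, r5, #1<<31 */
        if (!carry)
            break;                      /* BCS is taken once, so the body
                                           runs exactly twice */
    }
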
diff --git a/vp8/common/arm/armv6/recon_v6.asm b/vp8/common/arm/armv6/recon_v6.asm
deleted file mode 100644
index 99c7bcf2d..000000000
--- a/vp8/common/arm/armv6/recon_v6.asm
+++ /dev/null
@@ -1,281 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon_b_armv6|
- EXPORT |vp8_recon2b_armv6|
- EXPORT |vp8_recon4b_armv6|
-
- AREA |.text|, CODE, READONLY ; name this block of code
-prd RN r0
-dif RN r1
-dst RN r2
-stride RN r3
-
-;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
-; R0 char* pred_ptr
-; R1 short * dif_ptr
-; R2 char * dst_ptr
-; R3 int stride
-
-; Description:
-; Loop through the block adding the Pred and Diff together. Clamp and then
-; store back into the Dst.
-
-; Restrictions :
-; all buffers are expected to be 4 byte aligned coming in and
-; going out.
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-|vp8_recon_b_armv6| PROC
- stmdb sp!, {r4 - r9, lr}
-
- ;0, 1, 2, 3
- ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- add dif, dif, #32
- orr r8, r8, r9, lsl #8
-
- str r8, [dst], stride
-
- ;0, 1, 2, 3
- ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
-;; ldr r6, [dif, #8] ; 1 | 0
-;; ldr r7, [dif, #12] ; 3 | 2
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- add dif, dif, #32
- orr r8, r8, r9, lsl #8
-
- str r8, [dst], stride
-
- ;0, 1, 2, 3
- ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
-;; ldr r6, [dif, #16] ; 1 | 0
-;; ldr r7, [dif, #20] ; 3 | 2
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- add dif, dif, #32
- orr r8, r8, r9, lsl #8
-
- str r8, [dst], stride
-
- ;0, 1, 2, 3
- ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
-;; ldr r6, [dif, #24] ; 1 | 0
-;; ldr r7, [dif, #28] ; 3 | 2
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst], stride
-
- ldmia sp!, {r4 - r9, pc}
-
- ENDP ; |recon_b|
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-; R0 char *pred_ptr
-; R1 short *dif_ptr
-; R2 char *dst_ptr
-; R3 int stride
-|vp8_recon4b_armv6| PROC
- stmdb sp!, {r4 - r9, lr}
-
- mov lr, #4
-
-recon4b_loop
- ;0, 1, 2, 3
- ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
- ldr r6, [dif, #0] ; 1 | 0
- ldr r7, [dif, #4] ; 3 | 2
-
- pkhbt r8, r6, r7, lsl #16 ; 2 | 0
- pkhtb r9, r7, r6, asr #16 ; 3 | 1
-
- uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
- uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
-
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst]
-
- ;4, 5, 6, 7
- ldr r4, [prd], #4
-;; ldr r6, [dif, #32]
-;; ldr r7, [dif, #36]
- ldr r6, [dif, #8]
- ldr r7, [dif, #12]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst, #4]
-
- ;8, 9, 10, 11
- ldr r4, [prd], #4
-;; ldr r6, [dif, #64]
-;; ldr r7, [dif, #68]
- ldr r6, [dif, #16]
- ldr r7, [dif, #20]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst, #8]
-
- ;12, 13, 14, 15
- ldr r4, [prd], #4
-;; ldr r6, [dif, #96]
-;; ldr r7, [dif, #100]
- ldr r6, [dif, #24]
- ldr r7, [dif, #28]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst, #12]
-
- add dst, dst, stride
-;; add dif, dif, #8
- add dif, dif, #32
-
- subs lr, lr, #1
- bne recon4b_loop
-
- ldmia sp!, {r4 - r9, pc}
-
- ENDP ; |Recon4B|
-
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-;
-;
-;
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-; R0 char *pred_ptr
-; R1 short *dif_ptr
-; R2 char *dst_ptr
-; R3 int stride
-|vp8_recon2b_armv6| PROC
- stmdb sp!, {r4 - r9, lr}
-
- mov lr, #4
-
-recon2b_loop
- ;0, 1, 2, 3
- ldr r4, [prd], #4
- ldr r6, [dif, #0]
- ldr r7, [dif, #4]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst]
-
- ;4, 5, 6, 7
- ldr r4, [prd], #4
-;; ldr r6, [dif, #32]
-;; ldr r7, [dif, #36]
- ldr r6, [dif, #8]
- ldr r7, [dif, #12]
-
- pkhbt r8, r6, r7, lsl #16
- pkhtb r9, r7, r6, asr #16
-
- uxtab16 r8, r8, r4
- uxtab16 r9, r9, r4, ror #8
- usat16 r8, #8, r8
- usat16 r9, #8, r9
- orr r8, r8, r9, lsl #8
-
- str r8, [dst, #4]
-
- add dst, dst, stride
-;; add dif, dif, #8
- add dif, dif, #16
-
- subs lr, lr, #1
- bne recon2b_loop
-
- ldmia sp!, {r4 - r9, pc}
-
- ENDP ; |Recon2B|
-
- END
diff --git a/vp8/common/arm/idct_arm.h b/vp8/common/arm/idct_arm.h
index 8b8d17917..c710c2eb0 100644
--- a/vp8/common/arm/idct_arm.h
+++ b/vp8/common/arm/idct_arm.h
@@ -13,16 +13,12 @@
#define IDCT_ARM_H
#if HAVE_ARMV6
-extern prototype_idct(vp8_short_idct4x4llm_1_v6);
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_idct_idct1
-#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
-
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
@@ -38,16 +34,12 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
#endif
#if HAVE_ARMV7
-extern prototype_idct(vp8_short_idct4x4llm_1_neon);
extern prototype_idct(vp8_short_idct4x4llm_neon);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_idct_idct1
-#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon
-
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
diff --git a/vp8/common/arm/neon/dc_only_idct_add_neon.asm b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
index 49ba05fb0..65a4680c1 100644
--- a/vp8/common/arm/neon/dc_only_idct_add_neon.asm
+++ b/vp8/common/arm/neon/dc_only_idct_add_neon.asm
@@ -14,22 +14,26 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
-; unsigned char *dst_ptr, int pitch, int stride)
+
+;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+; int pred_stride, unsigned char *dst_ptr,
+; int dst_stride)
+
; r0 input_dc
; r1 pred_ptr
-; r2 dst_ptr
-; r3 pitch
-; sp stride
+; r2 pred_stride
+; r3 dst_ptr
+; sp dst_stride
+
|vp8_dc_only_idct_add_neon| PROC
add r0, r0, #4
asr r0, r0, #3
ldr r12, [sp]
vdup.16 q0, r0
- vld1.32 {d2[0]}, [r1], r3
- vld1.32 {d2[1]}, [r1], r3
- vld1.32 {d4[0]}, [r1], r3
+ vld1.32 {d2[0]}, [r1], r2
+ vld1.32 {d2[1]}, [r1], r2
+ vld1.32 {d4[0]}, [r1], r2
vld1.32 {d4[1]}, [r1]
vaddw.u8 q1, q0, d2
@@ -38,12 +42,13 @@
vqmovun.s16 d2, q1
vqmovun.s16 d4, q2
- vst1.32 {d2[0]}, [r2], r12
- vst1.32 {d2[1]}, [r2], r12
- vst1.32 {d4[0]}, [r2], r12
- vst1.32 {d4[1]}, [r2]
-
- bx lr
+ vst1.32 {d2[0]}, [r3], r12
+ vst1.32 {d2[1]}, [r3], r12
+ vst1.32 {d4[0]}, [r3], r12
+ vst1.32 {d4[1]}, [r3]
+
+ bx lr
ENDP
+
END
diff --git a/vp8/common/arm/neon/recon16x16mb_neon.asm b/vp8/common/arm/neon/recon16x16mb_neon.asm
deleted file mode 100644
index 3f1a30f48..000000000
--- a/vp8/common/arm/neon/recon16x16mb_neon.asm
+++ /dev/null
@@ -1,131 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon16x16mb_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *pred_ptr,
-; r1 short *diff_ptr,
-; r2 unsigned char *dst_ptr,
-; r3 int ystride,
-; stack unsigned char *udst_ptr,
-; stack unsigned char *vdst_ptr
-
-|vp8_recon16x16mb_neon| PROC
- mov r12, #4 ;loop counter for Y loop
-
-recon16x16mb_loop_y
- vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
- vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
- vld1.u8 {q14, q15}, [r0]!
- vld1.16 {q10, q11}, [r1]!
-
- vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
- vmovl.u8 q1, d25
- vmovl.u8 q2, d26
- vmovl.u8 q3, d27
- vmovl.u8 q4, d28
- vmovl.u8 q5, d29
- vmovl.u8 q6, d30
- vld1.16 {q12, q13}, [r1]!
- vmovl.u8 q7, d31
- vld1.16 {q14, q15}, [r1]!
-
- pld [r0]
- pld [r1]
- pld [r1, #64]
-
- vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
- vadd.s16 q1, q1, q9
- vadd.s16 q2, q2, q10
- vadd.s16 q3, q3, q11
- vadd.s16 q4, q4, q12
- vadd.s16 q5, q5, q13
- vadd.s16 q6, q6, q14
- vadd.s16 q7, q7, q15
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- vqmovun.s16 d4, q4
- vqmovun.s16 d5, q5
- vst1.u8 {q0}, [r2], r3 ;store result
- vqmovun.s16 d6, q6
- vst1.u8 {q1}, [r2], r3
- vqmovun.s16 d7, q7
- vst1.u8 {q2}, [r2], r3
- subs r12, r12, #1
-
- moveq r12, #2 ;loop counter for UV loop
-
- vst1.u8 {q3}, [r2], r3
- bne recon16x16mb_loop_y
-
- mov r3, r3, lsr #1 ;uv_stride = ystride>>1
- ldr r2, [sp] ;load upred_ptr
-
-recon16x16mb_loop_uv
- vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
- vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
- vld1.u8 {q14, q15}, [r0]!
- vld1.16 {q10, q11}, [r1]!
-
- vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
- vmovl.u8 q1, d25
- vmovl.u8 q2, d26
- vmovl.u8 q3, d27
- vmovl.u8 q4, d28
- vmovl.u8 q5, d29
- vmovl.u8 q6, d30
- vld1.16 {q12, q13}, [r1]!
- vmovl.u8 q7, d31
- vld1.16 {q14, q15}, [r1]!
-
- vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
- vadd.s16 q1, q1, q9
- vadd.s16 q2, q2, q10
- vadd.s16 q3, q3, q11
- vadd.s16 q4, q4, q12
- vadd.s16 q5, q5, q13
- vadd.s16 q6, q6, q14
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vadd.s16 q7, q7, q15
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- vst1.u8 {d0}, [r2], r3 ;store result
- vqmovun.s16 d4, q4
- vst1.u8 {d1}, [r2], r3
- vqmovun.s16 d5, q5
- vst1.u8 {d2}, [r2], r3
- vqmovun.s16 d6, q6
- vst1.u8 {d3}, [r2], r3
- vqmovun.s16 d7, q7
- vst1.u8 {d4}, [r2], r3
- subs r12, r12, #1
-
- vst1.u8 {d5}, [r2], r3
- vst1.u8 {d6}, [r2], r3
- vst1.u8 {d7}, [r2], r3
-
- ldrne r2, [sp, #4] ;load vpred_ptr
- bne recon16x16mb_loop_uv
-
- bx lr
-
- ENDP
- END
diff --git a/vp8/common/arm/neon/recon2b_neon.asm b/vp8/common/arm/neon/recon2b_neon.asm
deleted file mode 100644
index 99b251c91..000000000
--- a/vp8/common/arm/neon/recon2b_neon.asm
+++ /dev/null
@@ -1,54 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon2b_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *pred_ptr,
-; r1 short *diff_ptr,
-; r2 unsigned char *dst_ptr,
-; r3 int stride
-
-|vp8_recon2b_neon| PROC
- vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr
- vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr
-
- vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits
- vld1.16 {q6, q7}, [r1]!
- vmovl.u8 q1, d17
- vmovl.u8 q2, d18
- vmovl.u8 q3, d19
-
- vadd.s16 q0, q0, q4 ;add Diff data and Pred data together
- vadd.s16 q1, q1, q5
- vadd.s16 q2, q2, q6
- vadd.s16 q3, q3, q7
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- add r0, r2, r3
-
- vst1.u8 {d0}, [r2] ;store result
- vst1.u8 {d1}, [r0], r3
- add r2, r0, r3
- vst1.u8 {d2}, [r0]
- vst1.u8 {d3}, [r2], r3
-
- bx lr
-
- ENDP
- END
diff --git a/vp8/common/arm/neon/recon4b_neon.asm b/vp8/common/arm/neon/recon4b_neon.asm
deleted file mode 100644
index 991727746..000000000
--- a/vp8/common/arm/neon/recon4b_neon.asm
+++ /dev/null
@@ -1,69 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon4b_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *pred_ptr,
-; r1 short *diff_ptr,
-; r2 unsigned char *dst_ptr,
-; r3 int stride
-
-|vp8_recon4b_neon| PROC
- vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
- vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
- vld1.u8 {q14, q15}, [r0]
- vld1.16 {q10, q11}, [r1]!
-
- vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
- vmovl.u8 q1, d25
- vmovl.u8 q2, d26
- vmovl.u8 q3, d27
- vmovl.u8 q4, d28
- vmovl.u8 q5, d29
- vmovl.u8 q6, d30
- vld1.16 {q12, q13}, [r1]!
- vmovl.u8 q7, d31
- vld1.16 {q14, q15}, [r1]
-
- vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
- vadd.s16 q1, q1, q9
- vadd.s16 q2, q2, q10
- vadd.s16 q3, q3, q11
- vadd.s16 q4, q4, q12
- vadd.s16 q5, q5, q13
- vadd.s16 q6, q6, q14
- vadd.s16 q7, q7, q15
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- vqmovun.s16 d4, q4
- vqmovun.s16 d5, q5
- vqmovun.s16 d6, q6
- vqmovun.s16 d7, q7
- add r0, r2, r3
-
- vst1.u8 {q0}, [r2] ;store result
- vst1.u8 {q1}, [r0], r3
- add r2, r0, r3
- vst1.u8 {q2}, [r0]
- vst1.u8 {q3}, [r2], r3
-
- bx lr
-
- ENDP
- END
diff --git a/vp8/common/arm/neon/recon_neon.c b/vp8/common/arm/neon/recon_neon.c
deleted file mode 100644
index d2aafd51f..000000000
--- a/vp8/common/arm/neon/recon_neon.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8/common/recon.h"
-#include "vp8/common/blockd.h"
-
-extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
-
-void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- unsigned char *pred_ptr = &x->predictor[0];
- short *diff_ptr = &x->diff[0];
- unsigned char *dst_ptr = x->dst.y_buffer;
- unsigned char *udst_ptr = x->dst.u_buffer;
- unsigned char *vdst_ptr = x->dst.v_buffer;
- int ystride = x->dst.y_stride;
- /*int uv_stride = x->dst.uv_stride;*/
-
- vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr);
-}
diff --git a/vp8/common/arm/neon/reconb_neon.asm b/vp8/common/arm/neon/reconb_neon.asm
deleted file mode 100644
index 288c0ef01..000000000
--- a/vp8/common/arm/neon/reconb_neon.asm
+++ /dev/null
@@ -1,61 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_recon_b_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0 unsigned char *pred_ptr,
-; r1 short *diff_ptr,
-; r2 unsigned char *dst_ptr,
-; r3 int stride
-
-|vp8_recon_b_neon| PROC
- mov r12, #16
-
- vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr
- vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr
- vld1.u8 {d29}, [r0], r12
- vld1.16 {q11, q12}, [r1]!
- vld1.u8 {d30}, [r0], r12
- vld1.16 {q12, q13}, [r1]!
- vld1.u8 {d31}, [r0], r12
- vld1.16 {q13}, [r1]
-
- vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits
- vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6
- vmovl.u8 q2, d30
- vmovl.u8 q3, d31
-
- vadd.s16 d0, d0, d20 ;add Diff data and Pred data together
- vadd.s16 d2, d2, d22
- vadd.s16 d4, d4, d24
- vadd.s16 d6, d6, d26
-
- vqmovun.s16 d0, q0 ;CLAMP() saturation
- vqmovun.s16 d1, q1
- vqmovun.s16 d2, q2
- vqmovun.s16 d3, q3
- add r1, r2, r3
-
- vst1.32 {d0[0]}, [r2] ;store result
- vst1.32 {d1[0]}, [r1], r3
- add r2, r1, r3
- vst1.32 {d2[0]}, [r1]
- vst1.32 {d3[0]}, [r2], r3
-
- bx lr
-
- ENDP
- END
diff --git a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
deleted file mode 100644
index d7bdbae75..000000000
--- a/vp8/common/arm/neon/shortidct4x4llm_1_neon.asm
+++ /dev/null
@@ -1,67 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_short_idct4x4llm_1_neon|
- EXPORT |vp8_dc_only_idct_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-; r0 short *input;
-; r1 short *output;
-; r2 int pitch;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp8_short_idct4x4llm_1_neon| PROC
- vld1.16 {d0[]}, [r0] ;load input[0]
-
- add r3, r1, r2
- add r12, r3, r2
-
- vrshr.s16 d0, d0, #3
-
- add r0, r12, r2
-
- vst1.16 {d0}, [r1]
- vst1.16 {d0}, [r3]
- vst1.16 {d0}, [r12]
- vst1.16 {d0}, [r0]
-
- bx lr
- ENDP
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-; r0 short input_dc;
-; r1 short *output;
-; r2 int pitch;
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-|vp8_dc_only_idct_neon| PROC
- vdup.16 d0, r0
-
- add r3, r1, r2
- add r12, r3, r2
-
- vrshr.s16 d0, d0, #3
-
- add r0, r12, r2
-
- vst1.16 {d0}, [r1]
- vst1.16 {d0}, [r3]
- vst1.16 {d0}, [r12]
- vst1.16 {d0}, [r0]
-
- bx lr
-
- ENDP
- END
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
index b74c31521..67d2ab015 100644
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -17,18 +17,24 @@
AREA ||.text||, CODE, READONLY, ALIGN=2
;*************************************************************
-;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
+; unsigned char *dst, int stride)
;r0 short * input
-;r1 short * output
+;r1 unsigned char *pred
;r2 int pitch
+;r3 unsigned char *dst
+;sp int stride
;*************************************************************
-;static const int cospi8sqrt2minus1=20091;
-;static const int sinpi8sqrt2 =35468;
-;static const int rounding = 0;
-;Optimization note: The resulted data from dequantization are signed 13-bit data that is
-;in the range of [-4096, 4095]. This allows to use "vqdmulh"(neon) instruction since
-;it won't go out of range (13+16+1=30bits<32bits). This instruction gives the high half
-;result of the multiplication that is needed in IDCT.
+
+; static const int cospi8sqrt2minus1=20091;
+; static const int sinpi8sqrt2 =35468;
+; static const int rounding = 0;
+
+; Optimization note: The data resulting from dequantization are signed
+; 13-bit values in the range [-4096, 4095]. This allows the NEON
+; "vqdmulh" instruction to be used, since the product cannot go out of
+; range (13+16+1 = 30 bits < 32 bits). The instruction returns the high
+; half of the multiplication, which is exactly what the IDCT needs.
|vp8_short_idct4x4llm_neon| PROC
adr r12, idct_coeff
@@ -36,6 +42,7 @@
vld1.16 {d0}, [r12]
vswp d3, d4 ;q2(vp[4] vp[12])
+ ldr r0, [sp] ; stride
vqdmulh.s16 q3, q2, d0[2]
vqdmulh.s16 q4, q2, d0[0]
@@ -94,21 +101,31 @@
vrshr.s16 d4, d4, #3
vrshr.s16 d5, d5, #3
- add r3, r1, r2
- add r12, r3, r2
- add r0, r12, r2
-
vtrn.32 d2, d4
vtrn.32 d3, d5
vtrn.16 d2, d3
vtrn.16 d4, d5
- vst1.16 {d2}, [r1]
- vst1.16 {d3}, [r3]
- vst1.16 {d4}, [r12]
- vst1.16 {d5}, [r0]
+ ; load prediction data
+ vld1.32 d6[0], [r1], r2
+ vld1.32 d6[1], [r1], r2
+ vld1.32 d7[0], [r1], r2
+ vld1.32 d7[1], [r1], r2
+
+ ; add prediction and residual
+ vaddw.u8 q1, q1, d6
+ vaddw.u8 q2, q2, d7
+
+ vqmovun.s16 d1, q1
+ vqmovun.s16 d2, q2
+
+ ; store to destination
+ vst1.32 d1[0], [r3], r0
+ vst1.32 d1[1], [r3], r0
+ vst1.32 d2[0], [r3], r0
+ vst1.32 d2[1], [r3], r0
- bx lr
+ bx lr
ENDP
diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h
index 377cb2a07..dec7fc425 100644
--- a/vp8/common/arm/recon_arm.h
+++ b/vp8/common/arm/recon_arm.h
@@ -13,24 +13,12 @@
#define RECON_ARM_H
#if HAVE_ARMV6
-extern prototype_recon_block(vp8_recon_b_armv6);
-extern prototype_recon_block(vp8_recon2b_armv6);
-extern prototype_recon_block(vp8_recon4b_armv6);
extern prototype_copy_block(vp8_copy_mem8x8_v6);
extern prototype_copy_block(vp8_copy_mem8x4_v6);
extern prototype_copy_block(vp8_copy_mem16x16_v6);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_recon_recon
-#define vp8_recon_recon vp8_recon_b_armv6
-
-#undef vp8_recon_recon2
-#define vp8_recon_recon2 vp8_recon2b_armv6
-
-#undef vp8_recon_recon4
-#define vp8_recon_recon4 vp8_recon4b_armv6
-
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp8_copy_mem8x8_v6
@@ -43,29 +31,15 @@ extern prototype_copy_block(vp8_copy_mem16x16_v6);
#endif
#if HAVE_ARMV7
-extern prototype_recon_block(vp8_recon_b_neon);
-extern prototype_recon_block(vp8_recon2b_neon);
-extern prototype_recon_block(vp8_recon4b_neon);
extern prototype_copy_block(vp8_copy_mem8x8_neon);
extern prototype_copy_block(vp8_copy_mem8x4_neon);
extern prototype_copy_block(vp8_copy_mem16x16_neon);
-extern prototype_recon_macroblock(vp8_recon_mb_neon);
-
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_recon_recon
-#define vp8_recon_recon vp8_recon_b_neon
-
-#undef vp8_recon_recon2
-#define vp8_recon_recon2 vp8_recon2b_neon
-
-#undef vp8_recon_recon4
-#define vp8_recon_recon4 vp8_recon4b_neon
-
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp8_copy_mem8x8_neon
@@ -75,9 +49,6 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_neon
-#undef vp8_recon_recon_mb
-#define vp8_recon_recon_mb vp8_recon_mb_neon
-
#undef vp8_recon_build_intra_predictors_mby
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index d1dd60286..5c4fbb193 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -70,7 +70,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
#if CONFIG_RUNTIME_CPU_DETECT
VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
- rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c;
rtcd->idct.idct16 = vp8_short_idct4x4llm_c;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c;
@@ -79,11 +78,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_c;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_c;
- rtcd->recon.recon = vp8_recon_b_c;
- rtcd->recon.recon2 = vp8_recon2b_c;
- rtcd->recon.recon4 = vp8_recon4b_c;
- rtcd->recon.recon_mb = vp8_recon_mb_c;
- rtcd->recon.recon_mby = vp8_recon_mby_c;
+
rtcd->recon.build_intra_predictors_mby =
vp8_build_intra_predictors_mby;
rtcd->recon.build_intra_predictors_mby_s =
diff --git a/vp8/common/idct.h b/vp8/common/idct.h
index f5fd94dfd..411a1b472 100644
--- a/vp8/common/idct.h
+++ b/vp8/common/idct.h
@@ -16,12 +16,14 @@
void sym(short *input, short *output)
#define prototype_idct(sym) \
- void sym(short *input, short *output, int pitch)
+ void sym(short *input, unsigned char *pred, int pitch, unsigned char *dst, \
+ int dst_stride)
#define prototype_idct_scalar_add(sym) \
void sym(short input, \
- unsigned char *pred, unsigned char *output, \
- int pitch, int stride)
+ unsigned char *pred, int pred_stride, \
+ unsigned char *dst, \
+ int dst_stride)
#if ARCH_X86 || ARCH_X86_64
#include "x86/idct_x86.h"
@@ -31,11 +33,6 @@
#include "arm/idct_arm.h"
#endif
-#ifndef vp8_idct_idct1
-#define vp8_idct_idct1 vp8_short_idct4x4llm_1_c
-#endif
-extern prototype_idct(vp8_idct_idct1);
-
#ifndef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_c
#endif
@@ -63,7 +60,6 @@ typedef prototype_second_order((*vp8_second_order_fn_t));
typedef struct
{
- vp8_idct_fn_t idct1;
vp8_idct_fn_t idct16;
vp8_idct_scalar_add_fn_t idct1_scalar_add;
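
Expanded by hand, the reworked prototype macros above give function-pointer
types of roughly this shape (a readability sketch; the real typedefs come from
expanding prototype_idct and prototype_idct_scalar_add):

    /* Full 4x4 IDCT: adds the inverse-transformed residual to the
     * prediction and writes clamped pixels straight to dst. */
    typedef void (*vp8_idct_fn_t)(short *input, unsigned char *pred,
                                  int pitch, unsigned char *dst,
                                  int dst_stride);

    /* DC-only shortcut: one scalar residual applied to all 16 pixels. */
    typedef void (*vp8_idct_scalar_add_fn_t)(short input_dc,
                                             unsigned char *pred,
                                             int pred_stride,
                                             unsigned char *dst,
                                             int dst_stride);
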
diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c
index 196062df6..49496abef 100644
--- a/vp8/common/idctllm.c
+++ b/vp8/common/idctllm.c
@@ -24,28 +24,31 @@
**************************************************************************/
static const int cospi8sqrt2minus1 = 20091;
static const int sinpi8sqrt2 = 35468;
-static const int rounding = 0;
-void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
{
int i;
+ int r, c;
int a1, b1, c1, d1;
-
+ short output[16];
short *ip = input;
short *op = output;
int temp1, temp2;
- int shortpitch = pitch >> 1;
+ int shortpitch = 4;
for (i = 0; i < 4; i++)
{
a1 = ip[0] + ip[8];
b1 = ip[0] - ip[8];
- temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
- temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp1 = (ip[4] * sinpi8sqrt2) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
c1 = temp1 - temp2;
- temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
- temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2) >> 16;
d1 = temp1 + temp2;
op[shortpitch*0] = a1 + d1;
@@ -66,12 +69,12 @@ void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
a1 = ip[0] + ip[2];
b1 = ip[0] - ip[2];
- temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
- temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp1 = (ip[1] * sinpi8sqrt2) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
c1 = temp1 - temp2;
- temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
- temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2) >> 16;
d1 = temp1 + temp2;
@@ -84,27 +87,31 @@ void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
ip += shortpitch;
op += shortpitch;
}
-}
-void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch)
-{
- int i;
- int a1;
- short *op = output;
- int shortpitch = pitch >> 1;
- a1 = ((input[0] + 4) >> 3);
-
- for (i = 0; i < 4; i++)
+ ip = output;
+ for (r = 0; r < 4; r++)
{
- op[0] = a1;
- op[1] = a1;
- op[2] = a1;
- op[3] = a1;
- op += shortpitch;
+ for (c = 0; c < 4; c++)
+ {
+ int a = ip[c] + pred_ptr[c] ;
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dst_ptr[c] = (unsigned char) a ;
+ }
+ ip += 4;
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
}
}
-void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride)
{
int a1 = ((input_dc + 4) >> 3);
int r, c;
@@ -124,8 +131,8 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned ch
dst_ptr[c] = (unsigned char) a ;
}
- dst_ptr += stride;
- pred_ptr += pitch;
+ dst_ptr += dst_stride;
+ pred_ptr += pred_stride;
}
}
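
Two details in the idctllm.c hunks above: the "+ rounding" terms disappear
because rounding was the constant 0, so the arithmetic is unchanged, and both
the full IDCT and the DC-only path now end in the same per-pixel
reconstruction step. That step, pulled out as a sketch (clamp_add is a
hypothetical helper name, not in the patch):

    /* residual plus prediction, clamped to the valid pixel range */
    static unsigned char clamp_add(int residual, unsigned char pred)
    {
        int a = residual + pred;

        if (a < 0)
            a = 0;

        if (a > 255)
            a = 255;

        return (unsigned char) a;
    }
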
diff --git a/vp8/common/invtrans.c b/vp8/common/invtrans.c
index 81a3f2d89..7712b59b7 100644
--- a/vp8/common/invtrans.c
+++ b/vp8/common/invtrans.c
@@ -12,6 +12,21 @@
#include "invtrans.h"
+void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
+ int pitch)
+{
+ if (b->eob > 1)
+ {
+ IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+
+}
static void recon_dcblock(MACROBLOCKD *x)
{
@@ -25,15 +40,6 @@ static void recon_dcblock(MACROBLOCKD *x)
}
-void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch)
-{
- if (b->eob > 1)
- IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
- else
- IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
-}
-
-
void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
int i;
@@ -45,7 +51,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
for (i = 0; i < 16; i++)
{
- vp8_inverse_transform_b(rtcd, &x->block[i], 32);
+ vp8_inverse_transform_b(rtcd, &x->block[i], 16);
}
}
@@ -55,34 +61,10 @@ void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD
for (i = 16; i < 24; i++)
{
- vp8_inverse_transform_b(rtcd, &x->block[i], 16);
+ vp8_inverse_transform_b(rtcd, &x->block[i], 8);
}
}
-void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- int i;
- if (x->mode_info_context->mbmi.mode != B_PRED &&
- x->mode_info_context->mbmi.mode != SPLITMV)
- {
- /* do 2nd order transform on the dc block */
-
- IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
- recon_dcblock(x);
- }
-
- for (i = 0; i < 16; i++)
- {
- vp8_inverse_transform_b(rtcd, &x->block[i], 32);
- }
-
-
- for (i = 16; i < 24; i++)
- {
- vp8_inverse_transform_b(rtcd, &x->block[i], 16);
- }
-
-}
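
The eob test in the new vp8_inverse_transform_b selects between the two vtable
entries: when only the DC coefficient survives quantization (eob <= 1), every
sample of the 4x4 inverse transform equals (dc + 4) >> 3, so the scalar-add
path can skip the full IDCT. Note also that the pitch argument is now the
predictor stride rather than the old diff-buffer stride; the call pattern,
roughly:

    /* sketch of the new calls from the loops above */
    for (i = 0; i < 16; i++)             /* Y blocks, 16-wide predictor */
        vp8_inverse_transform_b(rtcd, &x->block[i], 16);

    for (i = 16; i < 24; i++)            /* U/V blocks, 8-wide predictor */
        vp8_inverse_transform_b(rtcd, &x->block[i], 8);
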
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 015b4c4d4..e911ea0f4 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -19,6 +19,7 @@ extern "C"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
#include "vpx_scale/yv12config.h"
#include "type_aliases.h"
#include "ppflags.h"
@@ -145,9 +146,9 @@ extern "C"
int over_shoot_pct;
// buffering parameters
- int starting_buffer_level; // in seconds
- int optimal_buffer_level;
- int maximum_buffer_size;
+ int64_t starting_buffer_level; // in seconds
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
// controlling quality
int fixed_q;
@@ -198,6 +199,14 @@ extern "C"
struct vpx_codec_pkt_list *output_pkt_list;
vp8e_tuning tuning;
+
+ // Temporal scaling parameters
+ unsigned int number_of_layers;
+ unsigned int target_bitrate[MAX_PERIODICITY];
+ unsigned int rate_decimator[MAX_PERIODICITY];
+ unsigned int periodicity;
+ unsigned int layer_id[MAX_PERIODICITY];
+
} VP8_CONFIG;
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index 7cfc779cd..62bd71aac 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -18,7 +18,7 @@
void sym(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch)
#define prototype_recon_block(sym) \
- void sym(unsigned char *pred, short *diff, unsigned char *dst, int pitch)
+ void sym(unsigned char *pred, short *diff, int diff_stride, unsigned char *dst, int pitch)
#define prototype_recon_macroblock(sym) \
void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x)
@@ -27,7 +27,7 @@
void sym(MACROBLOCKD *x)
#define prototype_intra4x4_predict(sym) \
- void sym(BLOCKD *x, int b_mode, unsigned char *predictor)
+ void sym(BLOCKD *x, int b_mode, unsigned char *predictor, int stride)
struct vp8_recon_rtcd_vtable;
@@ -54,31 +54,6 @@ extern prototype_copy_block(vp8_recon_copy8x8);
#endif
extern prototype_copy_block(vp8_recon_copy8x4);
-#ifndef vp8_recon_recon
-#define vp8_recon_recon vp8_recon_b_c
-#endif
-extern prototype_recon_block(vp8_recon_recon);
-
-#ifndef vp8_recon_recon2
-#define vp8_recon_recon2 vp8_recon2b_c
-#endif
-extern prototype_recon_block(vp8_recon_recon2);
-
-#ifndef vp8_recon_recon4
-#define vp8_recon_recon4 vp8_recon4b_c
-#endif
-extern prototype_recon_block(vp8_recon_recon4);
-
-#ifndef vp8_recon_recon_mb
-#define vp8_recon_recon_mb vp8_recon_mb_c
-#endif
-extern prototype_recon_macroblock(vp8_recon_recon_mb);
-
-#ifndef vp8_recon_recon_mby
-#define vp8_recon_recon_mby vp8_recon_mby_c
-#endif
-extern prototype_recon_macroblock(vp8_recon_recon_mby);
-
#ifndef vp8_recon_build_intra_predictors_mby
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby
#endif
@@ -111,8 +86,6 @@ extern prototype_intra4x4_predict\
typedef prototype_copy_block((*vp8_copy_block_fn_t));
-typedef prototype_recon_block((*vp8_recon_fn_t));
-typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
typedef struct vp8_recon_rtcd_vtable
@@ -120,11 +93,7 @@ typedef struct vp8_recon_rtcd_vtable
vp8_copy_block_fn_t copy16x16;
vp8_copy_block_fn_t copy8x8;
vp8_copy_block_fn_t copy8x4;
- vp8_recon_fn_t recon;
- vp8_recon_fn_t recon2;
- vp8_recon_fn_t recon4;
- vp8_recon_mb_fn_t recon_mb;
- vp8_recon_mb_fn_t recon_mby;
+
vp8_build_intra_pred_fn_t build_intra_predictors_mby_s;
vp8_build_intra_pred_fn_t build_intra_predictors_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s;
@@ -138,5 +107,4 @@ typedef struct vp8_recon_rtcd_vtable
#define RECON_INVOKE(ctx,fn) vp8_recon_##fn
#endif
-void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
#endif
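
[Note on the recon.h hunk above] The recon, recon2, recon4, recon_mb and recon_mby entries leave the vtable because the separate reconstruction pass is gone: prediction is written to the frame and the dequant/IDCT stage adds the residual in place. For reference, a rough C model of the removed per-block step, matching the updated prototype_recon_block signature above:

    static void recon_b_model(unsigned char *pred, short *diff,
                              int diff_stride, unsigned char *dst, int pitch)
    {
        int r, c;
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int a = diff[c] + pred[c];
                if (a < 0)   a = 0;     /* clamp to the 8-bit pixel range */
                if (a > 255) a = 255;
                dst[c] = (unsigned char)a;
            }
            pred += 16;                 /* predictor buffer is 16 wide */
            diff += diff_stride;
            dst  += pitch;
        }
    }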
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index e4e8a80a4..24c09a353 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -123,7 +123,6 @@ void vp8_copy_mem8x4_c(
}
-
void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
{
int r;
@@ -159,41 +158,73 @@ void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
}
}
-static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
+static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride)
{
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
- x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+ x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
}
else
{
- RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, pred_ptr, pitch);
+ RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst, dst_stride);
}
}
-static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
+static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride)
{
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
- x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+ x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
+ }
+ else
+ {
+ RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, dst, dst_stride);
+ }
+}
+
+static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, vp8_subpix_fn_t sppf)
+{
+ int r;
+ unsigned char *ptr_base;
+ unsigned char *ptr;
+
+ ptr_base = *(d->base_pre);
+
+ if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
+ {
+ ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+ sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
}
else
{
- RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, pred_ptr, pitch);
+ ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+ ptr = ptr_base;
+
+ for (r = 0; r < 4; r++)
+ {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = ptr[0];
+ dst[1] = ptr[1];
+ dst[2] = ptr[2];
+ dst[3] = ptr[3];
+#else
+            *(uint32_t *)dst = *(uint32_t *)ptr;
+#endif
+ dst += dst_stride;
+ ptr += d->pre_stride;
+ }
}
}
@@ -292,7 +323,7 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- build_inter_predictors2b(x, d0, 8);
+ build_inter_predictors2b(x, d0, d0->predictor, 8);
else
{
vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
@@ -435,6 +466,9 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
if (x->mode_info_context->mbmi.partitioning < 3)
{
+ BLOCKD *b;
+ int dst_stride = x->block[ 0].dst_stride;
+
x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
@@ -447,10 +481,14 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
}
- build_inter_predictors4b(x, &x->block[ 0], 16);
- build_inter_predictors4b(x, &x->block[ 2], 16);
- build_inter_predictors4b(x, &x->block[ 8], 16);
- build_inter_predictors4b(x, &x->block[10], 16);
+ b = &x->block[ 0];
+ build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
+ b = &x->block[ 2];
+ build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
+ b = &x->block[ 8];
+ build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
+ b = &x->block[10];
+ build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
}
else
{
@@ -458,6 +496,7 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->block[ 0].dst_stride;
x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
@@ -468,11 +507,11 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
}
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- build_inter_predictors2b(x, d0, 16);
+ build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride);
else
{
- vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
- vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
+ build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict);
}
}
@@ -483,15 +522,16 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
+ int dst_stride = x->block[ 16].dst_stride;
/* Note: uv mvs already clamped in build_4x4uvmvs() */
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
- build_inter_predictors2b(x, d0, 8);
+ build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride);
else
{
- vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
- vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
+ build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict);
+ build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict);
}
}
}
@@ -542,17 +582,83 @@ void build_4x4uvmvs(MACROBLOCKD *x)
}
}
-void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
+void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
{
- if (x->mode_info_context->mbmi.mode != SPLITMV)
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
{
- vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256],
- &x->predictor[320], 16, 8);
+ vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.y_stride, xd->dst.uv_stride);
}
else
{
- build_4x4uvmvs(x);
- build_inter4x4_predictors_mb(x);
+ build_4x4uvmvs(xd);
+ build_inter4x4_predictors_mb(xd);
}
}
+/* encoder only*/
+static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x)
+{
+ int i;
+ if (x->mode_info_context->mbmi.partitioning < 3)
+ {
+ x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
+ x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
+ x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
+ x->block[10].bmi = x->mode_info_context->bmi[10];
+
+ build_inter_predictors4b(x, &x->block[ 0], x->block[ 0].predictor, 16);
+ build_inter_predictors4b(x, &x->block[ 2], x->block[ 2].predictor, 16);
+ build_inter_predictors4b(x, &x->block[ 8], x->block[ 8].predictor, 16);
+ build_inter_predictors4b(x, &x->block[10], x->block[10].predictor, 16);
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
+ x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, d0->predictor, 16);
+ else
+ {
+ build_inter_predictors_b(d0, d0->predictor, 16, x->subpixel_predict);
+ build_inter_predictors_b(d1, d1->predictor, 16, x->subpixel_predict);
+ }
+
+ }
+
+ }
+
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ build_inter_predictors2b(x, d0, d0->predictor, 8);
+ else
+ {
+ build_inter_predictors_b(d0, d0->predictor, 8, x->subpixel_predict);
+ build_inter_predictors_b(d1, d1->predictor, 8, x->subpixel_predict);
+ }
+ }
+}
+void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd)
+{
+ if (xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ vp8_build_inter16x16_predictors_mb(xd, xd->predictor, &xd->predictor[256],
+ &xd->predictor[320], 16, 8);
+ }
+ else
+ {
+ build_4x4uvmvs(xd);
+ build_inter4x4_predictors_mb_e(xd);
+ }
+}
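
[Note on the reconinter.c split above] The decoder variant writes motion-compensated prediction straight into the frame (dst buffers, frame strides) so the IDCT can add the residual in place, while the encoder-only _e variant still fills the contiguous xd->predictor scratch, which the forward path needs in order to form the residual. A sketch of why the encoder keeps the scratch buffer, assuming the usual 16-wide luma layout (names are illustrative):

    /* residual = source - prediction; the encoder transforms this, so
     * the prediction must live separately from the source frame. */
    static void mb_luma_residual(const unsigned char *src, int src_stride,
                                 const MACROBLOCKD *xd, short *diff)
    {
        int r, c;
        for (r = 0; r < 16; r++)
            for (c = 0; c < 16; c++)
                diff[r * 16 + c] = src[r * src_stride + c]
                                 - xd->predictor[r * 16 + c];
    }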
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
index 456812ecd..86f9d5ae3 100644
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -26,5 +26,6 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t s
extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
+extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd);
#endif
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c
index 16dadc47d..c0863eeb1 100644
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -17,16 +17,6 @@
/* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
* vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
*/
-void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
- int i;
-
- for (i = 16; i < 24; i += 2)
- {
- BLOCKD *b = &x->block[i];
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
- }
-}
void vp8_build_intra_predictors_mby(MACROBLOCKD *x)
{
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index 0e1ebb584..12430da92 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -16,7 +16,7 @@
void vp8_intra4x4_predict(BLOCKD *x,
int b_mode,
- unsigned char *predictor)
+ unsigned char *predictor, int stride)
{
int i, r, c;
@@ -50,7 +50,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
predictor[c] = expected_dc;
}
- predictor += 16;
+ predictor += stride;
}
}
break;
@@ -72,7 +72,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
predictor[c] = pred;
}
- predictor += 16;
+ predictor += stride;
}
}
break;
@@ -94,7 +94,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
predictor[c] = ap[c];
}
- predictor += 16;
+ predictor += stride;
}
}
@@ -117,29 +117,29 @@ void vp8_intra4x4_predict(BLOCKD *x,
predictor[c] = lp[r];
}
- predictor += 16;
+ predictor += stride;
}
}
break;
case B_LD_PRED:
{
unsigned char *ptr = Above;
- predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
- predictor[0 * 16 + 1] =
- predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
- predictor[0 * 16 + 2] =
- predictor[1 * 16 + 1] =
- predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
- predictor[0 * 16 + 3] =
- predictor[1 * 16 + 2] =
- predictor[2 * 16 + 1] =
- predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
- predictor[1 * 16 + 3] =
- predictor[2 * 16 + 2] =
- predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
- predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
+ predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
+ predictor[0 * stride + 1] =
+ predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
+ predictor[0 * stride + 2] =
+ predictor[1 * stride + 1] =
+ predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
+ predictor[0 * stride + 3] =
+ predictor[1 * stride + 2] =
+ predictor[2 * stride + 1] =
+ predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
+ predictor[1 * stride + 3] =
+ predictor[2 * stride + 2] =
+ predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
+ predictor[2 * stride + 3] =
+ predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
+ predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
}
break;
@@ -158,22 +158,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
pp[7] = Above[2];
pp[8] = Above[3];
- predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[3 * 16 + 1] =
- predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[3 * 16 + 2] =
- predictor[2 * 16 + 1] =
- predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * 16 + 3] =
- predictor[2 * 16 + 2] =
- predictor[1 * 16 + 1] =
- predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[1 * 16 + 2] =
- predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[1 * 16 + 3] =
- predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+ predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[3 * stride + 1] =
+ predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[3 * stride + 2] =
+ predictor[2 * stride + 1] =
+ predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[3 * stride + 3] =
+ predictor[2 * stride + 2] =
+ predictor[1 * stride + 1] =
+ predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * stride + 3] =
+ predictor[1 * stride + 2] =
+ predictor[0 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[1 * stride + 3] =
+ predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
}
break;
@@ -193,22 +193,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
pp[8] = Above[3];
- predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * 16 + 1] =
- predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 1] =
- predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
- predictor[3 * 16 + 2] =
- predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
- predictor[3 * 16 + 3] =
- predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
- predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
+ predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[3 * stride + 1] =
+ predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * stride + 1] =
+ predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1;
+ predictor[3 * stride + 2] =
+ predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[2 * stride + 2] =
+ predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1;
+ predictor[3 * stride + 3] =
+ predictor[1 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[2 * stride + 3] =
+ predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1;
+ predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+ predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1;
}
break;
@@ -217,22 +217,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
unsigned char *pp = Above;
- predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * 16 + 0] =
- predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
- predictor[1 * 16 + 1] =
- predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 1] =
- predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
- predictor[3 * 16 + 1] =
- predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[0 * 16 + 3] =
- predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[2 * stride + 0] =
+ predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[1 * stride + 1] =
+ predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * stride + 1] =
+ predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[3 * stride + 1] =
+ predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[0 * stride + 3] =
+ predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1;
+ predictor[1 * stride + 3] =
+ predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
}
break;
@@ -250,22 +250,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
pp[8] = Above[3];
- predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * 16 + 0] =
- predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
- predictor[2 * 16 + 1] =
- predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[2 * 16 + 3] =
- predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[1 * 16 + 2] =
- predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[2 * stride + 0] =
+ predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[2 * stride + 1] =
+ predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * stride + 2] =
+ predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[2 * stride + 3] =
+ predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[1 * stride + 2] =
+ predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1;
+ predictor[1 * stride + 3] =
+ predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
}
break;
@@ -273,28 +273,33 @@ void vp8_intra4x4_predict(BLOCKD *x,
case B_HU_PRED:
{
unsigned char *pp = Left;
- predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[0 * 16 + 2] =
- predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
- predictor[0 * 16 + 3] =
- predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[1 * 16 + 2] =
- predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[2 * 16 + 3] =
- predictor[3 * 16 + 0] =
- predictor[3 * 16 + 1] =
- predictor[3 * 16 + 2] =
- predictor[3 * 16 + 3] = pp[3];
+ predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[0 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[0 * stride + 2] =
+ predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[0 * stride + 3] =
+ predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[1 * stride + 2] =
+ predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[1 * stride + 3] =
+ predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * stride + 2] =
+ predictor[2 * stride + 3] =
+ predictor[3 * stride + 0] =
+ predictor[3 * stride + 1] =
+ predictor[3 * stride + 2] =
+ predictor[3 * stride + 3] = pp[3];
}
break;
}
}
+
+
+
+
+
/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
* to the right prediction have filled in pixels to use.
*/
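
[Note on the stride parameter above] With the stride argument, the same 4x4 intra predictor serves both the encoder's 16-wide scratch buffer and direct in-frame prediction. A sketch for a subblock b (the mode value is an arbitrary example):

    int mode = B_DC_PRED;
    /* encoder: predict into the scratch buffer, rows 16 apart */
    vp8_intra4x4_predict(b, mode, b->predictor, 16);
    /* decoder: predict straight into the frame at its own stride */
    vp8_intra4x4_predict(b, mode, *(b->base_dst) + b->dst, b->dst_stride);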
diff --git a/vp8/common/x86/idct_x86.h b/vp8/common/x86/idct_x86.h
index f6e568cdc..f9e3a794d 100644
--- a/vp8/common/x86/idct_x86.h
+++ b/vp8/common/x86/idct_x86.h
@@ -20,7 +20,6 @@
*/
#if HAVE_MMX
-extern prototype_idct(vp8_short_idct4x4llm_1_mmx);
extern prototype_idct(vp8_short_idct4x4llm_mmx);
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
@@ -28,9 +27,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_idct_idct1
-#define vp8_idct_idct1 vp8_short_idct4x4llm_1_mmx
-
#undef vp8_idct_idct16
#define vp8_idct_idct16 vp8_short_idct4x4llm_mmx
diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm
index 465626b8f..0c9c205c2 100644
--- a/vp8/common/x86/idctllm_mmx.asm
+++ b/vp8/common/x86/idctllm_mmx.asm
@@ -32,249 +32,252 @@
; **************************************************************************/
-;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
+;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
+;                              int pitch, unsigned char *dest, int stride)
global sym(vp8_short_idct4x4llm_mmx)
sym(vp8_short_idct4x4llm_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
+ SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
+ push rsi
+ push rdi
; end prolog
- mov rax, arg(0) ;input
- mov rdx, arg(1) ;output
-
- movq mm0, [rax ]
- movq mm1, [rax+ 8]
-
- movq mm2, [rax+16]
- movq mm3, [rax+24]
-
- movsxd rax, dword ptr arg(2) ;pitch
-
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
-
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
-
- pmulhw mm5, [GLOBAL(x_s1sqr2)] ;
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+ mov rax, arg(0) ;input
+ mov rsi, arg(1) ;pred
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ;
+ movq mm0, [rax ]
+ movq mm1, [rax+ 8]
+ movq mm2, [rax+16]
+ movq mm3, [rax+24]
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
+%if 0
+ pxor mm7, mm7
+ movq [rax], mm7
+ movq [rax+8], mm7
+ movq [rax+16],mm7
+ movq [rax+24],mm7
+%endif
+ movsxd rax, dword ptr arg(2) ;pitch
+ mov rdx, arg(3) ;dest
+ movsxd rdi, dword ptr arg(4) ;stride
- movq mm5, mm1
- movq mm4, mm3
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
- paddw mm3, mm5 ; d1
- movq mm6, mm2 ; a1
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
- psubw mm6, mm3 ;3
+ movq mm5, mm1
+ movq mm4, mm3
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
+ paddw mm3, mm5 ; d1
+ movq mm6, mm2 ; a1
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
+ psubw mm6, mm3 ;3
- movq mm3, mm5 ; 33 23 13 03
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
- psubw mm0, mm2 ; b1= 0-2
- paddw mm2, mm2 ;
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
- movq mm5, mm1
- paddw mm2, mm0 ; a1 =0+2
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
- pmulhw mm5, [GLOBAL(x_s1sqr2)] ;
- paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
- movq mm7, mm3 ;
- pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ;
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
- paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw mm7, mm5 ; c1
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
- movq mm5, mm1
- movq mm4, mm3
+ movq mm3, mm5 ; 33 23 13 03
- pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
- paddw mm5, mm1
+ psubw mm0, mm2 ; b1= 0-2
+ paddw mm2, mm2 ;
- pmulhw mm3, [GLOBAL(x_s1sqr2)]
- paddw mm3, mm4
+ movq mm5, mm1
+ paddw mm2, mm0 ; a1 =0+2
- paddw mm3, mm5 ; d1
- paddw mm0, [GLOBAL(fours)]
+ pmulhw mm5, [GLOBAL(x_s1sqr2)];
+ paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
- paddw mm2, [GLOBAL(fours)]
- movq mm6, mm2 ; a1
+ movq mm7, mm3 ;
+ pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
- movq mm4, mm0 ; b1
- paddw mm2, mm3 ;0
+ paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
+ psubw mm7, mm5 ; c1
- paddw mm4, mm7 ;1
- psubw mm0, mm7 ;2
+ movq mm5, mm1
+ movq mm4, mm3
- psubw mm6, mm3 ;3
- psraw mm2, 3
+ pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
+ paddw mm5, mm1
- psraw mm0, 3
- psraw mm4, 3
+ pmulhw mm3, [GLOBAL(x_s1sqr2)]
+ paddw mm3, mm4
- psraw mm6, 3
+ paddw mm3, mm5 ; d1
+ paddw mm0, [GLOBAL(fours)]
- movq mm1, mm2 ; 03 02 01 00
- movq mm3, mm4 ; 23 22 21 20
+ paddw mm2, [GLOBAL(fours)]
+ movq mm6, mm2 ; a1
- punpcklwd mm1, mm0 ; 11 01 10 00
- punpckhwd mm2, mm0 ; 13 03 12 02
+ movq mm4, mm0 ; b1
+ paddw mm2, mm3 ;0
- punpcklwd mm3, mm6 ; 31 21 30 20
- punpckhwd mm4, mm6 ; 33 23 32 22
+ paddw mm4, mm7 ;1
+ psubw mm0, mm7 ;2
- movq mm0, mm1 ; 11 01 10 00
- movq mm5, mm2 ; 13 03 12 02
+ psubw mm6, mm3 ;3
+ psraw mm2, 3
- punpckldq mm0, mm3 ; 30 20 10 00
- punpckhdq mm1, mm3 ; 31 21 11 01
+ psraw mm0, 3
+ psraw mm4, 3
- punpckldq mm2, mm4 ; 32 22 12 02
- punpckhdq mm5, mm4 ; 33 23 13 03
+ psraw mm6, 3
- movq [rdx], mm0
+ movq mm1, mm2 ; 03 02 01 00
+ movq mm3, mm4 ; 23 22 21 20
- movq [rdx+rax], mm1
- movq [rdx+rax*2], mm2
+ punpcklwd mm1, mm0 ; 11 01 10 00
+ punpckhwd mm2, mm0 ; 13 03 12 02
- add rdx, rax
- movq [rdx+rax*2], mm5
+ punpcklwd mm3, mm6 ; 31 21 30 20
+ punpckhwd mm4, mm6 ; 33 23 32 22
- ; begin epilog
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
-global sym(vp8_short_idct4x4llm_1_mmx)
-sym(vp8_short_idct4x4llm_1_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- ; end prolog
+ movq mm0, mm1 ; 11 01 10 00
+ movq mm5, mm2 ; 13 03 12 02
- mov rax, arg(0) ;input
- movd mm0, [rax]
+ punpckldq mm0, mm3 ; 30 20 10 00
+ punpckhdq mm1, mm3 ; 31 21 11 01
- paddw mm0, [GLOBAL(fours)]
- mov rdx, arg(1) ;output
+ punpckldq mm2, mm4 ; 32 22 12 02
+ punpckhdq mm5, mm4 ; 33 23 13 03
- psraw mm0, 3
- movsxd rax, dword ptr arg(2) ;pitch
+ pxor mm7, mm7
- punpcklwd mm0, mm0
- punpckldq mm0, mm0
+ movd mm4, [rsi]
+ punpcklbw mm4, mm7
+ paddsw mm0, mm4
+ packuswb mm0, mm7
+ movd [rdx], mm0
- movq [rdx], mm0
- movq [rdx+rax], mm0
+ movd mm4, [rsi+rax]
+ punpcklbw mm4, mm7
+ paddsw mm1, mm4
+ packuswb mm1, mm7
+ movd [rdx+rdi], mm1
- movq [rdx+rax*2], mm0
- add rdx, rax
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm2, mm4
+ packuswb mm2, mm7
+ movd [rdx+rdi*2], mm2
- movq [rdx+rax*2], mm0
+ add rdx, rdi
+ add rsi, rax
+ movd mm4, [rsi+2*rax]
+ punpcklbw mm4, mm7
+ paddsw mm5, mm4
+ packuswb mm5, mm7
+ movd [rdx+rdi*2], mm5
; begin epilog
+ pop rdi
+ pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
-;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
+;void vp8_dc_only_idct_add_mmx(
+;short input_dc,
+;unsigned char *pred_ptr,
+;int pred_stride,
+;unsigned char *dst_ptr,
+;int stride)
global sym(vp8_dc_only_idct_add_mmx)
sym(vp8_dc_only_idct_add_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
- push rsi
- push rdi
; end prolog
- mov rsi, arg(1) ;s -- prediction
- mov rdi, arg(2) ;d -- destination
- movsxd rax, dword ptr arg(4) ;stride
- movsxd rdx, dword ptr arg(3) ;pitch
- pxor mm0, mm0
-
movd mm5, arg(0) ;input_dc
+ mov rax, arg(1) ;pred_ptr
+ movsxd rdx, dword ptr arg(2) ;pred_stride
+
+ pxor mm0, mm0
paddw mm5, [GLOBAL(fours)]
+ lea rcx, [rdx + rdx*2]
psraw mm5, 3
punpcklwd mm5, mm5
+
punpckldq mm5, mm5
- movd mm1, [rsi]
+ movd mm1, [rax]
+ movd mm2, [rax+rdx]
+ movd mm3, [rax+2*rdx]
+ movd mm4, [rax+rcx]
+
+ mov rax, arg(3) ;d -- destination
+ movsxd rdx, dword ptr arg(4) ;dst_stride
+
punpcklbw mm1, mm0
paddsw mm1, mm5
packuswb mm1, mm0 ; pack and unpack to saturate
- movd [rdi], mm1
+ lea rcx, [rdx + rdx*2]
- movd mm2, [rsi+rdx]
punpcklbw mm2, mm0
paddsw mm2, mm5
packuswb mm2, mm0 ; pack and unpack to saturate
- movd [rdi+rax], mm2
- movd mm3, [rsi+2*rdx]
punpcklbw mm3, mm0
paddsw mm3, mm5
packuswb mm3, mm0 ; pack and unpack to saturate
- movd [rdi+2*rax], mm3
- add rdi, rax
- add rsi, rdx
- movd mm4, [rsi+2*rdx]
punpcklbw mm4, mm0
paddsw mm4, mm5
packuswb mm4, mm0 ; pack and unpack to saturate
- movd [rdi+2*rax], mm4
+
+ movd [rax], mm1
+ movd [rax+rdx], mm2
+ movd [rax+2*rdx], mm3
+ movd [rax+rcx], mm4
; begin epilog
- pop rdi
- pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
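
[Note on the MMX rewrite above] The kernel now finishes by adding the 4x4 IDCT result to the prediction and saturating, instead of storing raw shorts. A C model of that tail (idct_out is a hypothetical 16-entry buffer standing in for the rows the asm keeps in mm registers):

    static void idct_add_tail_model(const short *idct_out,
                                    const unsigned char *pred, int pitch,
                                    unsigned char *dst, int stride)
    {
        int r, c;
        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int a = idct_out[r * 4 + c] + pred[c]; /* residual + pred */
                if (a < 0)   a = 0;                    /* packuswb clamps */
                if (a > 255) a = 255;
                dst[c] = (unsigned char)a;
            }
            pred += pitch;   /* arg(2): prediction stride  */
            dst  += stride;  /* arg(4): destination stride */
        }
    }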
diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index 83d3765ff..abeb0b682 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -15,17 +15,15 @@
; (
; short *qcoeff - 0
; short *dequant - 1
-; unsigned char *pre - 2
-; unsigned char *dst - 3
-; int dst_stride - 4
-; int blk_stride - 5
+; unsigned char *dst - 2
+; int dst_stride - 3
; )
global sym(vp8_idct_dequant_0_2x_sse2)
sym(vp8_idct_dequant_0_2x_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
+ SHADOW_ARGS_TO_STACK 4
GET_GOT rbx
; end prolog
@@ -47,19 +45,20 @@ sym(vp8_idct_dequant_0_2x_sse2):
movd [rax], xmm5
movd [rax+32], xmm5
;pshufb
+ mov rax, arg(2) ; dst
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+
pshuflw xmm4, xmm4, 00000000b
pshufhw xmm4, xmm4, 00000000b
- mov rax, arg(2) ; pre
+ lea rcx, [rdx + rdx*2]
paddw xmm4, [GLOBAL(fours)]
- movsxd rcx, dword ptr arg(5) ; blk_stride
psraw xmm4, 3
movq xmm0, [rax]
- movq xmm1, [rax+rcx]
- movq xmm2, [rax+2*rcx]
- lea rcx, [3*rcx]
+ movq xmm1, [rax+rdx]
+ movq xmm2, [rax+2*rdx]
movq xmm3, [rax+rcx]
punpcklbw xmm0, xmm5
@@ -67,8 +66,6 @@ sym(vp8_idct_dequant_0_2x_sse2):
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
- mov rax, arg(3) ; dst
- movsxd rdx, dword ptr arg(4) ; dst_stride
; Add to predict buffer
paddw xmm0, xmm4
@@ -97,11 +94,18 @@ sym(vp8_idct_dequant_0_2x_sse2):
pop rbp
ret
+;void vp8_idct_dequant_full_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; )
global sym(vp8_idct_dequant_full_2x_sse2)
sym(vp8_idct_dequant_full_2x_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
+ SHADOW_ARGS_TO_STACK 4
SAVE_XMM 7
GET_GOT rbx
push rsi
@@ -111,14 +115,13 @@ sym(vp8_idct_dequant_full_2x_sse2):
; special case when 2 blocks have 0 or 1 coeffs
; dc is set as first coeff, so no need to load qcoeff
mov rax, arg(0) ; qcoeff
- mov rsi, arg(2) ; pre
- mov rdi, arg(3) ; dst
- movsxd rcx, dword ptr arg(5) ; blk_stride
+ mov rdx, arg(1) ; dequant
+ mov rdi, arg(2) ; dst
+
; Zero out xmm7, for use unpacking
pxor xmm7, xmm7
- mov rdx, arg(1) ; dequant
; note the transpose of xmm1 and xmm2, necessary for shuffle
    ; to spit out sensible data
@@ -138,6 +141,7 @@ sym(vp8_idct_dequant_full_2x_sse2):
pmullw xmm2, [rdx+16]
pmullw xmm1, [rdx]
pmullw xmm3, [rdx+16]
+ movsxd rdx, dword ptr arg(3) ; dst_stride
; repack so block 0 row x and block 1 row x are together
movdqa xmm4, xmm0
@@ -162,6 +166,7 @@ sym(vp8_idct_dequant_full_2x_sse2):
paddw xmm2, xmm0 ; a1 = 0+2
pmulhw xmm5, [GLOBAL(x_s1sqr2)]
+ lea rcx, [rdx + rdx*2] ;dst_stride * 3
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
movdqa xmm7, xmm3
@@ -304,8 +309,8 @@ sym(vp8_idct_dequant_full_2x_sse2):
pxor xmm7, xmm7
; Load up predict blocks
- movq xmm4, [rsi]
- movq xmm5, [rsi+rcx]
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
@@ -313,9 +318,8 @@ sym(vp8_idct_dequant_full_2x_sse2):
paddw xmm0, xmm4
paddw xmm1, xmm5
- movq xmm4, [rsi+2*rcx]
- lea rcx, [3*rcx]
- movq xmm5, [rsi+rcx]
+ movq xmm4, [rdi+2*rdx]
+ movq xmm5, [rdi+rcx]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
@@ -331,18 +335,11 @@ sym(vp8_idct_dequant_full_2x_sse2):
packuswb xmm2, xmm7
packuswb xmm3, xmm7
- ; Load destination stride before writing out,
- ; doesn't need to persist
- movsxd rdx, dword ptr arg(4) ; dst_stride
-
; store blocks back out
movq [rdi], xmm0
movq [rdi + rdx], xmm1
-
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm2
- movq [rdi + rdx], xmm3
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
; begin epilog
pop rdi
@@ -357,27 +354,25 @@ sym(vp8_idct_dequant_full_2x_sse2):
; (
; short *qcoeff - 0
; short *dequant - 1
-; unsigned char *pre - 2
-; unsigned char *dst - 3
-; int dst_stride - 4
-; short *dc - 5
+; unsigned char *dst - 2
+; int dst_stride - 3
+; short *dc - 4
; )
global sym(vp8_idct_dequant_dc_0_2x_sse2)
sym(vp8_idct_dequant_dc_0_2x_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
+ SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
- push rsi
push rdi
; end prolog
; special case when 2 blocks have 0 or 1 coeffs
; dc is set as first coeff, so no need to load qcoeff
mov rax, arg(0) ; qcoeff
- mov rsi, arg(2) ; pre
- mov rdi, arg(3) ; dst
- mov rdx, arg(5) ; dc
+
+ mov rdi, arg(2) ; dst
+ mov rdx, arg(4) ; dc
; Zero out xmm5, for use unpacking
pxor xmm5, xmm5
@@ -385,11 +380,13 @@ sym(vp8_idct_dequant_dc_0_2x_sse2):
; load up 2 dc words here == 2*16 = doubleword
movd xmm4, [rdx]
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ lea rcx, [rdx + rdx*2]
; Load up predict blocks
- movq xmm0, [rsi]
- movq xmm1, [rsi+16]
- movq xmm2, [rsi+32]
- movq xmm3, [rsi+48]
+ movq xmm0, [rdi]
+ movq xmm1, [rdi+rdx*1]
+ movq xmm2, [rdi+rdx*2]
+ movq xmm3, [rdi+rcx]
; Duplicate and expand dc across
punpcklwd xmm4, xmm4
@@ -417,48 +414,46 @@ sym(vp8_idct_dequant_dc_0_2x_sse2):
packuswb xmm2, xmm5
packuswb xmm3, xmm5
- ; Load destination stride before writing out,
- ; doesn't need to persist
- movsxd rdx, dword ptr arg(4) ; dst_stride
-
; store blocks back out
movq [rdi], xmm0
movq [rdi + rdx], xmm1
-
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm2
- movq [rdi + rdx], xmm3
+ movq [rdi + rdx*2], xmm2
+ movq [rdi + rcx], xmm3
; begin epilog
pop rdi
- pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
-
+;void vp8_idct_dequant_dc_full_2x_sse2
+; (
+; short *qcoeff - 0
+; short *dequant - 1
+; unsigned char *dst - 2
+; int dst_stride - 3
+; short *dc - 4
+; )
global sym(vp8_idct_dequant_dc_full_2x_sse2)
sym(vp8_idct_dequant_dc_full_2x_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
+ SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
GET_GOT rbx
- push rsi
push rdi
; end prolog
; special case when 2 blocks have 0 or 1 coeffs
; dc is set as first coeff, so no need to load qcoeff
mov rax, arg(0) ; qcoeff
- mov rsi, arg(2) ; pre
- mov rdi, arg(3) ; dst
+ mov rdx, arg(1) ; dequant
+
+ mov rdi, arg(2) ; dst
; Zero out xmm7, for use unpacking
pxor xmm7, xmm7
- mov rdx, arg(1) ; dequant
; note the transpose of xmm1 and xmm2, necessary for shuffle
    ; to spit out sensible data
@@ -480,7 +475,7 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
pmullw xmm3, [rdx+16]
; DC component
- mov rdx, arg(5)
+ mov rdx, arg(4)
; repack so block 0 row x and block 1 row x are together
movdqa xmm4, xmm0
@@ -651,8 +646,10 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
pxor xmm7, xmm7
; Load up predict blocks
- movq xmm4, [rsi]
- movq xmm5, [rsi+16]
+ movsxd rdx, dword ptr arg(3) ; dst_stride
+ movq xmm4, [rdi]
+ movq xmm5, [rdi+rdx]
+ lea rcx, [rdx + rdx*2]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
@@ -660,8 +657,8 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
paddw xmm0, xmm4
paddw xmm1, xmm5
- movq xmm4, [rsi+32]
- movq xmm5, [rsi+48]
+ movq xmm4, [rdi+rdx*2]
+ movq xmm5, [rdi+rcx]
punpcklbw xmm4, xmm7
punpcklbw xmm5, xmm7
@@ -679,7 +676,7 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
; Load destination stride before writing out,
; doesn't need to persist
- movsxd rdx, dword ptr arg(4) ; dst_stride
+ movsxd rdx, dword ptr arg(3) ; dst_stride
; store blocks back out
movq [rdi], xmm0
@@ -693,7 +690,6 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
; begin epilog
pop rdi
- pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
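
[Note on the SSE2 hunks above] All four entry points lose the pre pointer and blk_stride argument; the prediction is read back from dst itself. A sketch of the new calling convention for two pairs of adjacent 4x4 luma blocks (q, dq, dst and stride as used throughout the decoder; each _2x call consumes two 16-coefficient blocks):

    vp8_idct_dequant_full_2x_sse2(q,      dq, dst,     stride);
    vp8_idct_dequant_full_2x_sse2(q + 32, dq, dst + 8, stride);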
diff --git a/vp8/common/x86/recon_mmx.asm b/vp8/common/x86/recon_mmx.asm
index e7211fccb..19c0faf3f 100644
--- a/vp8/common/x86/recon_mmx.asm
+++ b/vp8/common/x86/recon_mmx.asm
@@ -10,53 +10,6 @@
%include "vpx_ports/x86_abi_support.asm"
-;void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp8_recon_b_mmx)
-sym(vp8_recon_b_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rdi, arg(2) ;d
- mov rdx, arg(1) ;q
- movsxd rax, dword ptr arg(3) ;stride
- pxor mm0, mm0
-
- movd mm1, [rsi]
- punpcklbw mm1, mm0
- paddsw mm1, [rdx]
- packuswb mm1, mm0 ; pack and unpack to saturate
- movd [rdi], mm1
-
- movd mm2, [rsi+16]
- punpcklbw mm2, mm0
- paddsw mm2, [rdx+32]
- packuswb mm2, mm0 ; pack and unpack to saturate
- movd [rdi+rax], mm2
-
- movd mm3, [rsi+32]
- punpcklbw mm3, mm0
- paddsw mm3, [rdx+64]
- packuswb mm3, mm0 ; pack and unpack to saturate
- movd [rdi+2*rax], mm3
-
- add rdi, rax
- movd mm4, [rsi+48]
- punpcklbw mm4, mm0
- paddsw mm4, [rdx+96]
- packuswb mm4, mm0 ; pack and unpack to saturate
- movd [rdi+2*rax], mm4
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
;void copy_mem8x8_mmx(
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index f54cc4e7e..a82c1b4fd 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -10,121 +10,6 @@
%include "vpx_ports/x86_abi_support.asm"
-;void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp8_recon2b_sse2)
-sym(vp8_recon2b_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rdi, arg(2) ;d
- mov rdx, arg(1) ;q
- movsxd rax, dword ptr arg(3) ;stride
- pxor xmm0, xmm0
-
- movq xmm1, MMWORD PTR [rsi]
- punpcklbw xmm1, xmm0
- paddsw xmm1, XMMWORD PTR [rdx]
- packuswb xmm1, xmm0 ; pack and unpack to saturate
- movq MMWORD PTR [rdi], xmm1
-
-
- movq xmm2, MMWORD PTR [rsi+8]
- punpcklbw xmm2, xmm0
- paddsw xmm2, XMMWORD PTR [rdx+16]
- packuswb xmm2, xmm0 ; pack and unpack to saturate
- movq MMWORD PTR [rdi+rax], xmm2
-
-
- movq xmm3, MMWORD PTR [rsi+16]
- punpcklbw xmm3, xmm0
- paddsw xmm3, XMMWORD PTR [rdx+32]
- packuswb xmm3, xmm0 ; pack and unpack to saturate
- movq MMWORD PTR [rdi+rax*2], xmm3
-
- add rdi, rax
- movq xmm4, MMWORD PTR [rsi+24]
- punpcklbw xmm4, xmm0
- paddsw xmm4, XMMWORD PTR [rdx+48]
- packuswb xmm4, xmm0 ; pack and unpack to saturate
- movq MMWORD PTR [rdi+rax*2], xmm4
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp8_recon4b_sse2)
-sym(vp8_recon4b_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rdi, arg(2) ;d
- mov rdx, arg(1) ;q
- movsxd rax, dword ptr arg(3) ;stride
- pxor xmm0, xmm0
-
- movdqa xmm1, XMMWORD PTR [rsi]
- movdqa xmm5, xmm1
- punpcklbw xmm1, xmm0
- punpckhbw xmm5, xmm0
- paddsw xmm1, XMMWORD PTR [rdx]
- paddsw xmm5, XMMWORD PTR [rdx+16]
- packuswb xmm1, xmm5 ; pack and unpack to saturate
- movdqa XMMWORD PTR [rdi], xmm1
-
-
- movdqa xmm2, XMMWORD PTR [rsi+16]
- movdqa xmm6, xmm2
- punpcklbw xmm2, xmm0
- punpckhbw xmm6, xmm0
- paddsw xmm2, XMMWORD PTR [rdx+32]
- paddsw xmm6, XMMWORD PTR [rdx+48]
- packuswb xmm2, xmm6 ; pack and unpack to saturate
- movdqa XMMWORD PTR [rdi+rax], xmm2
-
-
- movdqa xmm3, XMMWORD PTR [rsi+32]
- movdqa xmm7, xmm3
- punpcklbw xmm3, xmm0
- punpckhbw xmm7, xmm0
- paddsw xmm3, XMMWORD PTR [rdx+64]
- paddsw xmm7, XMMWORD PTR [rdx+80]
- packuswb xmm3, xmm7 ; pack and unpack to saturate
- movdqa XMMWORD PTR [rdi+rax*2], xmm3
-
- add rdi, rax
- movdqa xmm4, XMMWORD PTR [rsi+48]
- movdqa xmm5, xmm4
- punpcklbw xmm4, xmm0
- punpckhbw xmm5, xmm0
- paddsw xmm4, XMMWORD PTR [rdx+96]
- paddsw xmm5, XMMWORD PTR [rdx+112]
- packuswb xmm4, xmm5 ; pack and unpack to saturate
- movdqa XMMWORD PTR [rdi+rax*2], xmm4
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
;void copy_mem16x16_sse2(
; unsigned char *src,
diff --git a/vp8/common/x86/recon_x86.h b/vp8/common/x86/recon_x86.h
index fe0f8f0bc..fbb3dcb63 100644
--- a/vp8/common/x86/recon_x86.h
+++ b/vp8/common/x86/recon_x86.h
@@ -20,16 +20,12 @@
*/
#if HAVE_MMX
-extern prototype_recon_block(vp8_recon_b_mmx);
extern prototype_copy_block(vp8_copy_mem8x8_mmx);
extern prototype_copy_block(vp8_copy_mem8x4_mmx);
extern prototype_copy_block(vp8_copy_mem16x16_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_recon_recon
-#define vp8_recon_recon vp8_recon_b_mmx
-
#undef vp8_recon_copy8x8
#define vp8_recon_copy8x8 vp8_copy_mem8x8_mmx
@@ -43,19 +39,11 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
#endif
#if HAVE_SSE2
-extern prototype_recon_block(vp8_recon2b_sse2);
-extern prototype_recon_block(vp8_recon4b_sse2);
extern prototype_copy_block(vp8_copy_mem16x16_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp8_recon_recon2
-#define vp8_recon_recon2 vp8_recon2b_sse2
-
-#undef vp8_recon_recon4
-#define vp8_recon_recon4 vp8_recon4b_sse2
-
#undef vp8_recon_copy16x16
#define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2
diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c
index 33a984b79..c4e616a67 100644
--- a/vp8/common/x86/x86_systemdependent.c
+++ b/vp8/common/x86/x86_systemdependent.c
@@ -37,7 +37,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
if (flags & HAS_MMX)
{
- rtcd->idct.idct1 = vp8_short_idct4x4llm_1_mmx;
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
@@ -45,7 +44,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
- rtcd->recon.recon = vp8_recon_b_mmx;
rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx;
rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_mmx;
@@ -81,8 +79,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
if (flags & HAS_SSE2)
{
- rtcd->recon.recon2 = vp8_recon2b_sse2;
- rtcd->recon.recon4 = vp8_recon4b_sse2;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_sse2;
rtcd->recon.build_intra_predictors_mbuv =
vp8_build_intra_predictors_mbuv_sse2;
diff --git a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
index 6bebda24f..19f94e089 100644
--- a/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
+++ b/vp8/decoder/arm/armv6/dequant_dc_idct_v6.asm
@@ -12,21 +12,19 @@
AREA |.text|, CODE, READONLY
-;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride, int Dc)
+;void vp8_dequant_dc_idct_add_v6(short *input, short *dq,
+; unsigned char *dest, int stride, int Dc)
; r0 = input
; r1 = dq
-; r2 = pred
-; r3 = dest
-; sp + 36 = pitch ; +4 = 40
-; sp + 40 = stride ; +4 = 44
-; sp + 44 = Dc ; +4 = 48
+; r2 = dst
+; r3 = stride
+; sp + 36 = Dc
|vp8_dequant_dc_idct_add_v6| PROC
stmdb sp!, {r4-r11, lr}
- ldr r6, [sp, #44]
+ ldr r6, [sp, #36]
ldr r4, [r0] ;input
ldr r5, [r1], #4 ;dq
@@ -149,7 +147,7 @@ vp8_dequant_dc_idct_loop2_v6
usub16 r1, r12, r8
uadd16 r8, r11, r6
ldr r9, c0x00040004
- ldr r12, [sp, #40]
+ ldr r12, [sp] ; get stride from stack
uadd16 r6, r10, r8
usub16 r7, r10, r8
uadd16 r7, r7, r9
@@ -158,7 +156,7 @@ vp8_dequant_dc_idct_loop2_v6
usub16 r1, r14, r1
uadd16 r10, r10, r9
uadd16 r1, r1, r9
- ldr r11, [r2], r12
+ ldr r11, [r2] ; load input from dst
mov r8, r7, asr #3
pkhtb r9, r8, r10, asr #19
mov r8, r1, asr #3
@@ -170,9 +168,7 @@ vp8_dequant_dc_idct_loop2_v6
usat16 r9, #8, r9
usat16 r8, #8, r8
orr r9, r8, r9, lsl #8
- ldr r11, [r2], r12
- ldr lr, [sp]
- ldr r12, [sp, #44]
+ ldr r11, [r2, r12] ; load input from dst
mov r7, r7, lsl #16
mov r1, r1, lsl #16
mov r10, r10, lsl #16
@@ -188,9 +184,8 @@ vp8_dequant_dc_idct_loop2_v6
usat16 r7, #8, r7
usat16 r1, #8, r1
orr r1, r1, r7, lsl #8
- str r9, [lr], r12
- str r1, [lr], r12
- str lr, [sp]
+ str r9, [r2], r12 ; store output to dst
+ str r1, [r2], r12 ; store output to dst
bne vp8_dequant_dc_idct_loop2_v6
; vpx_memset
diff --git a/vp8/decoder/arm/armv6/dequant_idct_v6.asm b/vp8/decoder/arm/armv6/dequant_idct_v6.asm
index 47b671ca6..2510ad838 100644
--- a/vp8/decoder/arm/armv6/dequant_idct_v6.asm
+++ b/vp8/decoder/arm/armv6/dequant_idct_v6.asm
@@ -10,15 +10,12 @@
EXPORT |vp8_dequant_idct_add_v6|
AREA |.text|, CODE, READONLY
-;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride)
-; r0 = input
+;void vp8_dequant_idct_add_v6(short *input, short *dq,
+; unsigned char *dest, int stride)
+; r0 = input
; r1 = dq
-; r2 = pred
-; r3 = dest
-; sp + 36 = pitch ; +4 = 40
-; sp + 40 = stride ; +4 = 44
-
+; r2 = dst
+; r3 = stride
|vp8_dequant_idct_add_v6| PROC
stmdb sp!, {r4-r11, lr}
@@ -127,7 +124,7 @@ vp8_dequant_idct_loop2_v6
usub16 r1, r12, r8
uadd16 r8, r11, r6
ldr r9, c0x00040004
- ldr r12, [sp, #40]
+ ldr r12, [sp] ; get stride from stack
uadd16 r6, r10, r8
usub16 r7, r10, r8
uadd16 r7, r7, r9
@@ -136,7 +133,7 @@ vp8_dequant_idct_loop2_v6
usub16 r1, r14, r1
uadd16 r10, r10, r9
uadd16 r1, r1, r9
- ldr r11, [r2], r12
+ ldr r11, [r2] ; load input from dst
mov r8, r7, asr #3
pkhtb r9, r8, r10, asr #19
mov r8, r1, asr #3
@@ -148,9 +145,7 @@ vp8_dequant_idct_loop2_v6
usat16 r9, #8, r9
usat16 r8, #8, r8
orr r9, r8, r9, lsl #8
- ldr r11, [r2], r12
- ldr lr, [sp]
- ldr r12, [sp, #44]
+ ldr r11, [r2, r12] ; load input from dst
mov r7, r7, lsl #16
mov r1, r1, lsl #16
mov r10, r10, lsl #16
@@ -166,9 +161,8 @@ vp8_dequant_idct_loop2_v6
usat16 r7, #8, r7
usat16 r1, #8, r1
orr r1, r1, r7, lsl #8
- str r9, [lr], r12
- str r1, [lr], r12
- str lr, [sp]
+ str r9, [r2], r12 ; store output to dst
+ str r1, [r2], r12 ; store output to dst
bne vp8_dequant_idct_loop2_v6
; vpx_memset
diff --git a/vp8/decoder/arm/armv6/idct_blk_v6.c b/vp8/decoder/arm/armv6/idct_blk_v6.c
index 5c7592f35..686bb737f 100644
--- a/vp8/decoder/arm/armv6/idct_blk_v6.c
+++ b/vp8/decoder/arm/armv6/idct_blk_v6.c
@@ -12,115 +12,121 @@
#include "vp8/common/idct.h"
#include "vp8/decoder/dequantize.h"
-void vp8_dequant_dc_idct_add_y_block_v6
- (short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs, short *dc)
+
+void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
+ unsigned char *dst, int stride,
+ char *eobs, short *dc)
{
int i;
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
- vp8_dequant_dc_idct_add_v6 (q, dq, pre, dst, 16, stride, dc[0]);
- else
- vp8_dc_only_idct_add_v6 (dc[0], pre, dst, 16, stride);
+ vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]);
+ else if (eobs[0] == 1)
+ vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride);
if (eobs[1] > 1)
- vp8_dequant_dc_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
- else
- vp8_dc_only_idct_add_v6 (dc[1], pre+4, dst+4, 16, stride);
+ {
+ vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]);
+ }
+ else if (eobs[1] == 1)
+ vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride);
if (eobs[2] > 1)
- vp8_dequant_dc_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
- else
- vp8_dc_only_idct_add_v6 (dc[2], pre+8, dst+8, 16, stride);
+ {
+ vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]);
+ }
+ else if (eobs[2] == 1)
+ vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride);
if (eobs[3] > 1)
- vp8_dequant_dc_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
- else
- vp8_dc_only_idct_add_v6 (dc[3], pre+12, dst+12, 16, stride);
+ {
+ vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]);
+ }
+ else if (eobs[3] == 1)
+ vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride);
q += 64;
dc += 4;
- pre += 64;
dst += 4*stride;
eobs += 4;
}
}
-void vp8_dequant_idct_add_y_block_v6
- (short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs)
+void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs)
{
int i;
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
- vp8_dequant_idct_add_v6 (q, dq, pre, dst, 16, stride);
- else
+ vp8_dequant_idct_add_v6 (q, dq, dst, stride);
+ else if (eobs[0] == 1)
{
- vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dst, 16, stride);
+ vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
- vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride);
- else
+ vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride);
+ else if (eobs[1] == 1)
{
- vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dst+4, 16, stride);
+ vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride);
((int *)(q+16))[0] = 0;
}
if (eobs[2] > 1)
- vp8_dequant_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride);
- else
+ vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride);
+ else if (eobs[2] == 1)
{
- vp8_dc_only_idct_add_v6 (q[32]*dq[0], pre+8, dst+8, 16, stride);
+ vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride);
((int *)(q+32))[0] = 0;
}
if (eobs[3] > 1)
- vp8_dequant_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride);
- else
+ vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride);
+ else if (eobs[3] == 1)
{
- vp8_dc_only_idct_add_v6 (q[48]*dq[0], pre+12, dst+12, 16, stride);
+            vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride, dst+12, stride);
((int *)(q+48))[0] = 0;
}
q += 64;
- pre += 64;
dst += 4*stride;
eobs += 4;
}
}
-void vp8_dequant_idct_add_uv_block_v6
- (short *q, short *dq, unsigned char *pre,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs)
{
int i;
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
- vp8_dequant_idct_add_v6 (q, dq, pre, dstu, 8, stride);
- else
+ vp8_dequant_idct_add_v6 (q, dq, dstu, stride);
+ else if (eobs[0] == 1)
{
- vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstu, 8, stride);
+ vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
- vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstu+4, 8, stride);
- else
+ vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride);
+ else if (eobs[1] == 1)
{
- vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstu+4, 8, stride);
+ vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride,
+ dstu+4, stride);
((int *)(q+16))[0] = 0;
}
q += 32;
- pre += 32;
dstu += 4*stride;
eobs += 2;
}
@@ -128,23 +134,23 @@ void vp8_dequant_idct_add_uv_block_v6
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
- vp8_dequant_idct_add_v6 (q, dq, pre, dstv, 8, stride);
- else
+ vp8_dequant_idct_add_v6 (q, dq, dstv, stride);
+ else if (eobs[0] == 1)
{
- vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstv, 8, stride);
+ vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
- vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstv+4, 8, stride);
- else
+ vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride);
+ else if (eobs[1] == 1)
{
- vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstv+4, 8, stride);
+ vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride,
+ dstv+4, stride);
((int *)(q+16))[0] = 0;
}
q += 32;
- pre += 32;
dstv += 4*stride;
eobs += 2;
}
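
[Note on the eobs dispatch above] Each block now has three cases instead of two, since a block with eob == 0 contributes nothing once the prediction already sits in dst. The per-block logic, reduced to a sketch:

    if (eobs[i] > 1)                     /* full dequant + 4x4 IDCT + add */
        vp8_dequant_idct_add_v6(q, dq, dst, stride);
    else if (eobs[i] == 1)               /* DC only: scalar broadcast add */
    {
        vp8_dc_only_idct_add_v6(q[0] * dq[0], dst, stride, dst, stride);
        ((int *)q)[0] = 0;               /* clear the lone coefficient    */
    }
    /* eobs[i] == 0: skip, dst already holds the prediction */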
diff --git a/vp8/decoder/arm/dequantize_arm.h b/vp8/decoder/arm/dequantize_arm.h
index b7d800d26..c020c8530 100644
--- a/vp8/decoder/arm/dequantize_arm.h
+++ b/vp8/decoder/arm/dequantize_arm.h
@@ -49,6 +49,7 @@ extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neo
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
+
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_dequant_block
#define vp8_dequant_block vp8_dequantize_b_neon
@@ -68,6 +69,7 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
#undef vp8_dequant_idct_add_uv_block
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
#endif
+
#endif
#endif
diff --git a/vp8/decoder/arm/neon/dequant_idct_neon.asm b/vp8/decoder/arm/neon/dequant_idct_neon.asm
index 4bf661857..602cce676 100644
--- a/vp8/decoder/arm/neon/dequant_idct_neon.asm
+++ b/vp8/decoder/arm/neon/dequant_idct_neon.asm
@@ -15,25 +15,24 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred,
-; unsigned char *dest, int pitch, int stride)
+;void vp8_dequant_idct_add_neon(short *input, short *dq,
+; unsigned char *dest, int stride)
; r0 short *input,
; r1 short *dq,
-; r2 unsigned char *pred
-; r3 unsigned char *dest
-; sp int pitch
-; sp+4 int stride
+; r2 unsigned char *dest
+; r3 int stride
|vp8_dequant_idct_add_neon| PROC
vld1.16 {q3, q4}, [r0]
vld1.16 {q5, q6}, [r1]
- ldr r1, [sp] ; pitch
- vld1.32 {d14[0]}, [r2], r1
- vld1.32 {d14[1]}, [r2], r1
- vld1.32 {d15[0]}, [r2], r1
- vld1.32 {d15[1]}, [r2]
- ldr r1, [sp, #4] ; stride
+ add r1, r2, r3 ; r1 = dest + stride
+ lsl r3, #1 ; 2x stride
+
+ vld1.32 {d14[0]}, [r2], r3
+ vld1.32 {d14[1]}, [r1], r3
+ vld1.32 {d15[0]}, [r2]
+ vld1.32 {d15[1]}, [r1]
adr r12, cospi8sqrt2minus1 ; pointer to the first constant
@@ -110,13 +109,16 @@
vaddw.u8 q1, q1, d14
vaddw.u8 q2, q2, d15
+ sub r2, r2, r3
+ sub r1, r1, r3
+
vqmovun.s16 d0, q1
vqmovun.s16 d1, q2
- vst1.32 {d0[0]}, [r3], r1
- vst1.32 {d0[1]}, [r3], r1
- vst1.32 {d1[0]}, [r3], r1
- vst1.32 {d1[1]}, [r3]
+ vst1.32 {d0[0]}, [r2], r3
+ vst1.32 {d0[1]}, [r1], r3
+ vst1.32 {d1[0]}, [r2]
+ vst1.32 {d1[1]}, [r1]
bx lr
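
[Note on the NEON loads above] The pitch-strided reads from pred are replaced by two pointers walking dst: r2 covers the even rows, r1 the odd rows, and both step by 2*stride, so the four 32-bit row loads carry no serial pointer dependence. A C model (load4() is a hypothetical stand-in for the vld1.32 lane loads):

    unsigned char *p0 = dst;            /* r2: rows 0 and 2 */
    unsigned char *p1 = dst + stride;   /* r1: rows 1 and 3 */
    uint32_t row0 = load4(p0); p0 += 2 * stride;   /* d14[0] */
    uint32_t row1 = load4(p1); p1 += 2 * stride;   /* d14[1] */
    uint32_t row2 = load4(p0);                     /* d15[0] */
    uint32_t row3 = load4(p1);                     /* d15[1] */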
diff --git a/vp8/decoder/arm/neon/idct_blk_neon.c b/vp8/decoder/arm/neon/idct_blk_neon.c
index f31654060..086293114 100644
--- a/vp8/decoder/arm/neon/idct_blk_neon.c
+++ b/vp8/decoder/arm/neon/idct_blk_neon.c
@@ -15,101 +15,118 @@
/* place these declarations here because we don't want to maintain them
* outside of this scope
*/
-void idct_dequant_dc_full_2x_neon
- (short *input, short *dq, unsigned char *pre, unsigned char *dst,
- int stride, short *dc);
-void idct_dequant_dc_0_2x_neon
- (short *dc, unsigned char *pre, unsigned char *dst, int stride);
-void idct_dequant_full_2x_neon
- (short *q, short *dq, unsigned char *pre, unsigned char *dst,
- int pitch, int stride);
-void idct_dequant_0_2x_neon
- (short *q, short dq, unsigned char *pre, int pitch,
- unsigned char *dst, int stride);
-
-void vp8_dequant_dc_idct_add_y_block_neon
- (short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs, short *dc)
+void idct_dequant_dc_full_2x_neon(short *input, short *dq,
+ unsigned char *dst,
+ int stride, short *dc);
+void idct_dequant_dc_0_2x_neon(short *input, short *dq,
+ unsigned char *dst,
+ int stride, short *dc);
+void idct_dequant_full_2x_neon(short *q, short *dq,
+ unsigned char *dst, int stride);
+void idct_dequant_0_2x_neon(short *q, short dq,
+ unsigned char *dst, int stride);
+
+void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs, short *dc)
{
int i;
for (i = 0; i < 4; i++)
{
- if (((short *)eobs)[0] & 0xfefe)
- idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
- else
- idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
-
- if (((short *)eobs)[1] & 0xfefe)
- idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
- else
- idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);
-
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc);
+ else
+ idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc);
+ }
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2);
+ else
+ idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2);
+ }
q += 64;
dc += 4;
- pre += 64;
dst += 4*stride;
eobs += 4;
}
}
-void vp8_dequant_idct_add_y_block_neon
- (short *q, short *dq, unsigned char *pre,
- unsigned char *dst, int stride, char *eobs)
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs)
{
int i;
for (i = 0; i < 4; i++)
{
- if (((short *)eobs)[0] & 0xfefe)
- idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
- else
- idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);
-
- if (((short *)eobs)[1] & 0xfefe)
- idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
- else
- idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);
-
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dst, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dst, stride);
+ }
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
+ else
+ idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
+ }
q += 64;
- pre += 64;
dst += 4*stride;
eobs += 4;
}
}
-void vp8_dequant_idct_add_uv_block_neon
- (short *q, short *dq, unsigned char *pre,
- unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs)
{
- if (((short *)eobs)[0] & 0xfefe)
- idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
- else
- idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)eobs)[0] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
q += 32;
- pre += 32;
dstu += 4*stride;
- if (((short *)eobs)[1] & 0xfefe)
- idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
- else
- idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)eobs)[1] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstu, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
+ }
q += 32;
- pre += 32;
- if (((short *)eobs)[2] & 0xfefe)
- idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
- else
- idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
+ if (((short *)(eobs))[2])
+ {
+ if (((short *)eobs)[2] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
q += 32;
- pre += 32;
dstv += 4*stride;
- if (((short *)eobs)[3] & 0xfefe)
- idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
- else
- idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
+ if (((short *)(eobs))[3])
+ {
+ if (((short *)eobs)[3] & 0xfefe)
+ idct_dequant_full_2x_neon (q, dq, dstv, stride);
+ else
+ idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
+ }
}
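
The eob tests above read two adjacent per-block end-of-block counts as one 16-bit word: a nonzero word means at least one block of the pair has coefficients, and masking with 0xfefe is true only when some count exceeds 1, since a count of exactly 1 sets only the lowest bit of its byte. A sketch of the dispatch (full_pair and dc_only_pair are hypothetical stand-ins for the NEON kernels):

    short pair = ((short *)eobs)[0];  /* eobs[0] and eobs[1] as one word */

    if (pair)                 /* at least one block is non-empty */
    {
        if (pair & 0xfefe)    /* some eob > 1: full dequant + IDCT */
            full_pair();
        else                  /* every eob is 0 or 1: DC-only path */
            dc_only_pair();
    }
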
diff --git a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
index 456f8e1d4..6c29c5586 100644
--- a/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_0_2x_neon.asm
@@ -14,38 +14,38 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
-; int pitch, unsigned char *dst, int stride);
+;void idct_dequant_0_2x_neon(short *q, short dq,
+; unsigned char *dst, int stride);
; r0 *q
; r1 dq
-; r2 *pre
-; r3 pitch
-; sp *dst
-; sp+4 stride
+; r2 *dst
+; r3 stride
|idct_dequant_0_2x_neon| PROC
+ push {r4, r5}
+
add r12, r2, #4
vld1.32 {d2[0]}, [r2], r3
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d4[0]}, [r2], r3
- vld1.32 {d4[1]}, [r2]
vld1.32 {d8[0]}, [r12], r3
+ vld1.32 {d2[1]}, [r2], r3
vld1.32 {d8[1]}, [r12], r3
+ vld1.32 {d4[0]}, [r2], r3
vld1.32 {d10[0]}, [r12], r3
- vld1.32 {d10[1]}, [r12]
+ vld1.32 {d4[1]}, [r2], r3
+ vld1.32 {d10[1]}, [r12], r3
ldrh r12, [r0] ; lo q
- ldrh r2, [r0, #32] ; hi q
- mov r3, #0
- strh r3, [r0]
- strh r3, [r0, #32]
+ ldrh r4, [r0, #32] ; hi q
+ mov r5, #0
+ strh r5, [r0]
+ strh r5, [r0, #32]
sxth r12, r12 ; lo
mul r0, r12, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q0, r0
- sxth r2, r2 ; hi
- mul r0, r2, r1
+ sxth r4, r4 ; hi
+ mul r0, r4, r1
add r0, r0, #4
asr r0, r0, #3
vdup.16 q3, r0
@@ -55,25 +55,25 @@
vaddw.u8 q4, q3, d8 ; hi
vaddw.u8 q5, q3, d10
- ldr r2, [sp] ; dst
- ldr r3, [sp, #4] ; stride
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
+ add r0, r2, #4
vqmovun.s16 d2, q1 ; lo
vqmovun.s16 d4, q2
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
- add r0, r2, #4
vst1.32 {d2[0]}, [r2], r3 ; lo
- vst1.32 {d2[1]}, [r2], r3
- vst1.32 {d4[0]}, [r2], r3
- vst1.32 {d4[1]}, [r2]
vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d2[1]}, [r2], r3
vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d4[0]}, [r2], r3
vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d4[1]}, [r2]
vst1.32 {d10[1]}, [r0]
- bx lr
+ pop {r4, r5}
+ bx lr
- ENDP ; |idct_dequant_0_2x_neon|
+ ENDP ; |idct_dequant_0_2x_neon|
END
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
index 0dc036acb..bf8d7ddcd 100644
--- a/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_0_2x_neon.asm
@@ -14,25 +14,29 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
+
+;void idct_dequant_dc_0_2x_neon(short *q, short *dq,
; unsigned char *dst, int stride);
-; r0 *dc
-; r1 *pre
-; r2 *dst
-; r3 stride
+; r0 *q,
+; r1 *dq,
+; r2 *dst
+; r3 stride
+; sp *dc
|idct_dequant_dc_0_2x_neon| PROC
- ldr r0, [r0] ; *dc
- mov r12, #16
- vld1.32 {d2[0]}, [r1], r12 ; lo
- vld1.32 {d2[1]}, [r1], r12
- vld1.32 {d4[0]}, [r1], r12
- vld1.32 {d4[1]}, [r1]
- sub r1, r1, #44
- vld1.32 {d8[0]}, [r1], r12 ; hi
- vld1.32 {d8[1]}, [r1], r12
- vld1.32 {d10[0]}, [r1], r12
- vld1.32 {d10[1]}, [r1]
+ ; no q- or dq-coeffs, so r0 and r1 are free to use
+ ldr r1, [sp] ; *dc
+ add r12, r2, #4
+ ldr r0, [r1]
+
+ vld1.32 {d2[0]}, [r2], r3 ; lo
+ vld1.32 {d8[0]}, [r12], r3 ; hi
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d8[1]}, [r12], r3
+ vld1.32 {d4[0]}, [r2], r3
+ vld1.32 {d10[0]}, [r12], r3
+ vld1.32 {d4[1]}, [r2], r3
+ vld1.32 {d10[1]}, [r12]
sxth r1, r0 ; lo *dc
add r1, r1, #4
@@ -53,14 +57,16 @@
vqmovun.s16 d8, q4 ; hi
vqmovun.s16 d10, q5
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
add r0, r2, #4
+
vst1.32 {d2[0]}, [r2], r3 ; lo
- vst1.32 {d2[1]}, [r2], r3
- vst1.32 {d4[0]}, [r2], r3
- vst1.32 {d4[1]}, [r2]
vst1.32 {d8[0]}, [r0], r3 ; hi
+ vst1.32 {d2[1]}, [r2], r3
vst1.32 {d8[1]}, [r0], r3
+ vst1.32 {d4[0]}, [r2], r3
vst1.32 {d10[0]}, [r0], r3
+ vst1.32 {d4[1]}, [r2]
vst1.32 {d10[1]}, [r0]
bx lr
diff --git a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
index 61fa66075..eea41f68c 100644
--- a/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_dc_full_2x_neon.asm
@@ -15,33 +15,34 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
+;void idct_dequant_dc_full_2x_neon(short *q, short *dq,
; unsigned char *dst, int stride, short *dc);
; r0 *q,
; r1 *dq,
-; r2 *pre
-; r3 *dst
-; sp stride
-; sp+4 *dc
+; r2 *dst
+; r3 stride
+; sp *dc
|idct_dequant_dc_full_2x_neon| PROC
+ push {r4}
+
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
- mov r1, #16 ; pitch
add r0, r0, #32
vld1.16 {q4, q5}, [r0] ; r q
add r12, r2, #4
+
; interleave the predictors
- vld1.32 {d28[0]}, [r2], r1 ; l pre
- vld1.32 {d28[1]}, [r12], r1 ; r pre
- vld1.32 {d29[0]}, [r2], r1
- vld1.32 {d29[1]}, [r12], r1
- vld1.32 {d30[0]}, [r2], r1
- vld1.32 {d30[1]}, [r12], r1
- vld1.32 {d31[0]}, [r2]
- ldr r1, [sp, #4]
+ vld1.32 {d28[0]}, [r2], r3 ; l pre
+ vld1.32 {d28[1]}, [r12], r3 ; r pre
+ vld1.32 {d29[0]}, [r2], r3
+ vld1.32 {d29[1]}, [r12], r3
+ vld1.32 {d30[0]}, [r2], r3
+ vld1.32 {d30[1]}, [r12], r3
+ vld1.32 {d31[0]}, [r2], r3
+ ldr r1, [sp, #4] ; *dc
vld1.32 {d31[1]}, [r12]
- adr r2, cospi8sqrt2minus1 ; pointer to the first constant
+ adr r4, cospi8sqrt2minus1 ; pointer to the first constant
ldrh r12, [r1], #2 ; lo *dc
ldrh r1, [r1] ; hi *dc
@@ -56,7 +57,7 @@
vmov.16 d4[0], r12
vmov.16 d8[0], r1
- vld1.16 {d0}, [r2]
+ vld1.16 {d0}, [r4]
; q2: l0r0 q3: l8r8
; q4: l4r4 q5: l12r12
@@ -176,26 +177,28 @@
sub r0, r0, #32
vst1.16 {q14, q15}, [r0] ; write over low input
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
+ add r1, r2, #4 ; hi
+
;saturate and narrow
vqmovun.s16 d0, q4 ; lo
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6 ; hi
vqmovun.s16 d3, q7
- ldr r1, [sp] ; stride
- add r2, r3, #4 ; hi
- vst1.32 {d0[0]}, [r3], r1 ; lo
- vst1.32 {d0[1]}, [r2], r1 ; hi
- vst1.32 {d1[0]}, [r3], r1
- vst1.32 {d1[1]}, [r2], r1
- vst1.32 {d2[0]}, [r3], r1
- vst1.32 {d2[1]}, [r2], r1
- vst1.32 {d3[0]}, [r3]
- vst1.32 {d3[1]}, [r2]
+ vst1.32 {d0[0]}, [r2], r3 ; lo
+ vst1.32 {d0[1]}, [r1], r3 ; hi
+ vst1.32 {d1[0]}, [r2], r3
+ vst1.32 {d1[1]}, [r1], r3
+ vst1.32 {d2[0]}, [r2], r3
+ vst1.32 {d2[1]}, [r1], r3
+ vst1.32 {d3[0]}, [r2]
+ vst1.32 {d3[1]}, [r1]
- bx lr
+ pop {r4}
+ bx lr
- ENDP ; |idct_dequant_dc_full_2x_neon|
+ ENDP ; |idct_dequant_dc_full_2x_neon|
; Constant Pool
cospi8sqrt2minus1 DCD 0x4e7b
diff --git a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
index 772ec4685..d5dce63f6 100644
--- a/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/vp8/decoder/arm/neon/idct_dequant_full_2x_neon.asm
@@ -15,32 +15,30 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
-;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
-; unsigned char *dst, int pitch, int stride);
+;void idct_dequant_full_2x_neon(short *q, short *dq,
+; unsigned char *dst, int stride);
; r0 *q,
; r1 *dq,
-; r2 *pre
-; r3 *dst
-; sp pitch
-; sp+4 stride
+; r2 *dst
+; r3 stride
|idct_dequant_full_2x_neon| PROC
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
vld1.16 {q2, q3}, [r0] ; l q
- ldr r1, [sp] ; pitch
add r0, r0, #32
vld1.16 {q4, q5}, [r0] ; r q
add r12, r2, #4
+
; interleave the predictors
- vld1.32 {d28[0]}, [r2], r1 ; l pre
- vld1.32 {d28[1]}, [r12], r1 ; r pre
- vld1.32 {d29[0]}, [r2], r1
- vld1.32 {d29[1]}, [r12], r1
- vld1.32 {d30[0]}, [r2], r1
- vld1.32 {d30[1]}, [r12], r1
- vld1.32 {d31[0]}, [r2]
+ vld1.32 {d28[0]}, [r2], r3 ; l pre
+ vld1.32 {d28[1]}, [r12], r3 ; r pre
+ vld1.32 {d29[0]}, [r2], r3
+ vld1.32 {d29[1]}, [r12], r3
+ vld1.32 {d30[0]}, [r2], r3
+ vld1.32 {d30[1]}, [r12], r3
+ vld1.32 {d31[0]}, [r2], r3
vld1.32 {d31[1]}, [r12]
- adr r2, cospi8sqrt2minus1 ; pointer to the first constant
+ adr r1, cospi8sqrt2minus1 ; pointer to the first constant
; dequant: q[i] = q[i] * dq[i]
vmul.i16 q2, q2, q0
@@ -48,7 +46,7 @@
vmul.i16 q4, q4, q0
vmul.i16 q5, q5, q1
- vld1.16 {d0}, [r2]
+ vld1.16 {d0}, [r1]
; q2: l0r0 q3: l8r8
; q4: l4r4 q5: l12r12
@@ -168,22 +166,23 @@
sub r0, r0, #32
vst1.16 {q14, q15}, [r0] ; write over low input
+ sub r2, r2, r3, lsl #2 ; dst - 4*stride
+ add r1, r2, #4 ; hi
+
;saturate and narrow
vqmovun.s16 d0, q4 ; lo
vqmovun.s16 d1, q5
vqmovun.s16 d2, q6 ; hi
vqmovun.s16 d3, q7
- ldr r1, [sp, #4] ; stride
- add r2, r3, #4 ; hi
- vst1.32 {d0[0]}, [r3], r1 ; lo
- vst1.32 {d0[1]}, [r2], r1 ; hi
- vst1.32 {d1[0]}, [r3], r1
- vst1.32 {d1[1]}, [r2], r1
- vst1.32 {d2[0]}, [r3], r1
- vst1.32 {d2[1]}, [r2], r1
- vst1.32 {d3[0]}, [r3]
- vst1.32 {d3[1]}, [r2]
+ vst1.32 {d0[0]}, [r2], r3 ; lo
+ vst1.32 {d0[1]}, [r1], r3 ; hi
+ vst1.32 {d1[0]}, [r2], r3
+ vst1.32 {d1[1]}, [r1], r3
+ vst1.32 {d2[0]}, [r2], r3
+ vst1.32 {d2[1]}, [r1], r3
+ vst1.32 {d3[0]}, [r2]
+ vst1.32 {d3[1]}, [r1]
bx lr
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 6bbc71f79..81f28db89 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -167,12 +167,12 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
/* do prediction */
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
- RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd);
+ RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
if (mode != B_PRED)
{
RECON_INVOKE(&pbi->common.rtcd.recon,
- build_intra_predictors_mby)(xd);
+ build_intra_predictors_mby_s)(xd);
} else {
vp8_intra_prediction_down_copy(xd);
}
@@ -211,20 +211,24 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
int b_mode = xd->mode_info_context->bmi[i].as_mode;
RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
- (b, b_mode, b->predictor);
+ (b, b_mode, *(b->base_dst) + b->dst, b->dst_stride);
- if (xd->eobs[i] > 1)
+ if (xd->eobs[i])
{
- DEQUANT_INVOKE(&pbi->dequant, idct_add)
- (b->qcoeff, b->dequant, b->predictor,
- *(b->base_dst) + b->dst, 16, b->dst_stride);
- }
- else
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
- (b->qcoeff[0] * b->dequant[0], b->predictor,
- *(b->base_dst) + b->dst, 16, b->dst_stride);
- ((int *)b->qcoeff)[0] = 0;
+ if (xd->eobs[i] > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, idct_add)
+ (b->qcoeff, b->dequant,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+ (b->qcoeff[0] * b->dequant[0],
+ *(b->base_dst) + b->dst, b->dst_stride,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ ((int *)b->qcoeff)[0] = 0;
+ }
}
}
@@ -233,18 +237,18 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
{
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
- xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
else
{
BLOCKD *b = &xd->block[24];
- DEQUANT_INVOKE(&pbi->dequant, block)(b);
-
/* do 2nd order transform on the dc block */
if (xd->eobs[24] > 1)
{
+ DEQUANT_INVOKE(&pbi->dequant, block)(b);
+
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
((int *)b->qcoeff)[0] = 0;
((int *)b->qcoeff)[1] = 0;
@@ -257,19 +261,20 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
}
else
{
+ b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
((int *)b->qcoeff)[0] = 0;
}
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
- xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->block[16].dequant,
- xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}
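
With the predictor buffer gone, the DC-only path passes the same destination pointer twice: the prediction is read from dst and the clamped sum is written straight back. A sketch of the call pattern, using the vp8_dc_only_idct_add_c signature declared elsewhere in this change:

    unsigned char *p = *(b->base_dst) + b->dst;

    vp8_dc_only_idct_add_c(b->qcoeff[0] * b->dequant[0],
                           p, b->dst_stride,   /* prediction, in dst */
                           p, b->dst_stride);  /* result, back to dst */
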
diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c
index a60442fe8..0861965eb 100644
--- a/vp8/decoder/dequantize.c
+++ b/vp8/decoder/dequantize.c
@@ -14,10 +14,6 @@
#include "vp8/common/idct.h"
#include "vpx_mem/vpx_mem.h"
-extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) ;
-extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-
-
void vp8_dequantize_b_c(BLOCKD *d)
{
int i;
@@ -31,12 +27,9 @@ void vp8_dequantize_b_c(BLOCKD *d)
}
}
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride)
+void vp8_dequant_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride)
{
- short output[16];
- short *diff_ptr = output;
- int r, c;
int i;
for (i = 0; i < 16; i++)
@@ -44,40 +37,17 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
input[i] = dq[i] * input[i];
}
- /* the idct halves ( >> 1) the pitch */
- vp8_short_idct4x4llm_c(input, output, 4 << 1);
+ vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
vpx_memset(input, 0, 32);
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 4;
- pred += pitch;
- }
}
-void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride,
+void vp8_dequant_dc_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride,
int Dc)
{
int i;
- short output[16];
- short *diff_ptr = output;
- int r, c;
input[0] = (short)Dc;
@@ -86,28 +56,8 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
input[i] = dq[i] * input[i];
}
- /* the idct halves ( >> 1) the pitch */
- vp8_short_idct4x4llm_c(input, output, 4 << 1);
+ vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
vpx_memset(input, 0, 32);
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- int a = diff_ptr[c] + pred[c];
-
- if (a < 0)
- a = 0;
-
- if (a > 255)
- a = 255;
-
- dest[c] = (unsigned char) a;
- }
-
- dest += stride;
- diff_ptr += 4;
- pred += pitch;
- }
}
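
The reconstruction loops deleted above are not lost; under this change the add-and-clamp is assumed to happen inside vp8_short_idct4x4llm_c itself, which now takes prediction and destination pointers with their strides. A sketch of the folded step (idct_out, pred_stride and dst_stride are illustrative names):

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            int a = idct_out[r * 4 + c] + pred[c]; /* residual + prediction */
            dst[c] = (a < 0) ? 0 : (a > 255) ? 255 : (unsigned char)a;
        }
        pred += pred_stride;
        dst  += dst_stride;
    }
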
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index 2e662a593..019b7f6d1 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -18,28 +18,28 @@
#define prototype_dequant_idct_add(sym) \
void sym(short *input, short *dq, \
- unsigned char *pred, unsigned char *output, \
- int pitch, int stride)
+ unsigned char *output, \
+ int stride)
#define prototype_dequant_dc_idct_add(sym) \
void sym(short *input, short *dq, \
- unsigned char *pred, unsigned char *output, \
- int pitch, int stride, \
+ unsigned char *dst, \
+ int stride, \
int dc)
#define prototype_dequant_dc_idct_add_y_block(sym) \
void sym(short *q, short *dq, \
- unsigned char *pre, unsigned char *dst, \
+ unsigned char *dst, \
int stride, char *eobs, short *dc)
#define prototype_dequant_idct_add_y_block(sym) \
void sym(short *q, short *dq, \
- unsigned char *pre, unsigned char *dst, \
+ unsigned char *dst, \
int stride, char *eobs)
#define prototype_dequant_idct_add_uv_block(sym) \
void sym(short *q, short *dq, \
- unsigned char *pre, unsigned char *dst_u, \
+ unsigned char *dst_u, \
unsigned char *dst_v, int stride, char *eobs)
#if ARCH_X86 || ARCH_X86_64
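
For reference, instantiating the updated macro yields the new four-argument form; e.g. prototype_dequant_idct_add(vp8_dequant_idct_add_c) now expands to:

    void vp8_dequant_idct_add_c(short *input, short *dq,
                                unsigned char *output,
                                int stride);
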
diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c
index 48f97b565..86fa191d3 100644
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c
@@ -621,9 +621,8 @@ void vp8_conceal_corrupt_mb(MACROBLOCKD *xd)
{
/* This macroblock has corrupt residual, use the motion compensated
image (predictor) for concealment */
- vp8_recon_copy16x16(xd->predictor, 16, xd->dst.y_buffer, xd->dst.y_stride);
- vp8_recon_copy8x8(xd->predictor + 256, 8,
- xd->dst.u_buffer, xd->dst.uv_stride);
- vp8_recon_copy8x8(xd->predictor + 320, 8,
- xd->dst.v_buffer, xd->dst.uv_stride);
+
+ /* The build predictor functions now output directly into the dst buffer,
+ * so the copies are no longer necessary */
+
}
diff --git a/vp8/decoder/idct_blk.c b/vp8/decoder/idct_blk.c
index 04bce665e..1c16b92a9 100644
--- a/vp8/decoder/idct_blk.c
+++ b/vp8/decoder/idct_blk.c
@@ -12,16 +12,17 @@
#include "vp8/common/idct.h"
#include "dequantize.h"
-void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride,
+void vp8_dequant_dc_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride,
int Dc);
-void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
- unsigned char *dest, int pitch, int stride);
-void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
- unsigned char *dst_ptr, int pitch, int stride);
+void vp8_dequant_idct_add_c(short *input, short *dq,
+ unsigned char *dest, int stride);
+void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
+ int pred_stride, unsigned char *dst_ptr,
+ int dst_stride);
void vp8_dequant_dc_idct_add_y_block_c
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dst, int stride, char *eobs, short *dc)
{
int i, j;
@@ -31,23 +32,21 @@ void vp8_dequant_dc_idct_add_y_block_c
for (j = 0; j < 4; j++)
{
if (*eobs++ > 1)
- vp8_dequant_dc_idct_add_c (q, dq, pre, dst, 16, stride, dc[0]);
+ vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]);
else
- vp8_dc_only_idct_add_c (dc[0], pre, dst, 16, stride);
+ vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride);
q += 16;
- pre += 4;
dst += 4;
dc ++;
}
- pre += 64 - 16;
dst += 4*stride - 16;
}
}
void vp8_dequant_idct_add_y_block_c
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
{
int i, j;
@@ -57,25 +56,23 @@ void vp8_dequant_idct_add_y_block_c
for (j = 0; j < 4; j++)
{
if (*eobs++ > 1)
- vp8_dequant_idct_add_c (q, dq, pre, dst, 16, stride);
+ vp8_dequant_idct_add_c (q, dq, dst, stride);
else
{
- vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dst, 16, stride);
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
((int *)q)[0] = 0;
}
q += 16;
- pre += 4;
dst += 4;
}
- pre += 64 - 16;
dst += 4*stride - 16;
}
}
void vp8_dequant_idct_add_uv_block_c
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
int i, j;
@@ -85,19 +82,17 @@ void vp8_dequant_idct_add_uv_block_c
for (j = 0; j < 2; j++)
{
if (*eobs++ > 1)
- vp8_dequant_idct_add_c (q, dq, pre, dstu, 8, stride);
+ vp8_dequant_idct_add_c (q, dq, dstu, stride);
else
{
- vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstu, 8, stride);
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
((int *)q)[0] = 0;
}
q += 16;
- pre += 4;
dstu += 4;
}
- pre += 32 - 8;
dstu += 4*stride - 8;
}
@@ -106,19 +101,17 @@ void vp8_dequant_idct_add_uv_block_c
for (j = 0; j < 2; j++)
{
if (*eobs++ > 1)
- vp8_dequant_idct_add_c (q, dq, pre, dstv, 8, stride);
+ vp8_dequant_idct_add_c (q, dq, dstv, stride);
else
{
- vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstv, 8, stride);
+ vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
((int *)q)[0] = 0;
}
q += 16;
- pre += 4;
dstv += 4;
}
- pre += 32 - 8;
dstv += 4*stride - 8;
}
}
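
Each helper above walks the plane block by block: 16 coefficients per 4x4 block, 4 pixels across, then a row correction at the end of each block row. A sketch of the traversal for the 16x16 luma case (process_block is a hypothetical stand-in for the per-block dispatch):

    for (i = 0; i < 4; i++)            /* rows of 4x4 blocks */
    {
        for (j = 0; j < 4; j++)        /* blocks within the row */
        {
            process_block(q, dst);
            q   += 16;                 /* next block's coefficients */
            dst += 4;                  /* 4 pixels to the right */
        }
        dst += 4 * stride - 16;        /* down 4 rows, back to column 0 */
    }
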
diff --git a/vp8/decoder/reconintra_mt.c b/vp8/decoder/reconintra_mt.c
index 9bba5b75f..bcb2636fd 100644
--- a/vp8/decoder/reconintra_mt.c
+++ b/vp8/decoder/reconintra_mt.c
@@ -606,6 +606,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
MACROBLOCKD *xd,
int b_mode,
unsigned char *predictor,
+ int stride,
int mb_row,
int mb_col,
int num)
@@ -662,7 +663,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
predictor[c] = expected_dc;
}
- predictor += 16;
+ predictor += stride;
}
}
break;
@@ -684,7 +685,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
predictor[c] = pred;
}
- predictor += 16;
+ predictor += stride;
}
}
break;
@@ -706,7 +707,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
predictor[c] = ap[c];
}
- predictor += 16;
+ predictor += stride;
}
}
@@ -729,29 +730,29 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
predictor[c] = lp[r];
}
- predictor += 16;
+ predictor += stride;
}
}
break;
case B_LD_PRED:
{
unsigned char *ptr = Above;
- predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
- predictor[0 * 16 + 1] =
- predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
- predictor[0 * 16 + 2] =
- predictor[1 * 16 + 1] =
- predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
- predictor[0 * 16 + 3] =
- predictor[1 * 16 + 2] =
- predictor[2 * 16 + 1] =
- predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
- predictor[1 * 16 + 3] =
- predictor[2 * 16 + 2] =
- predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
- predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
+ predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
+ predictor[0 * stride + 1] =
+ predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
+ predictor[0 * stride + 2] =
+ predictor[1 * stride + 1] =
+ predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
+ predictor[0 * stride + 3] =
+ predictor[1 * stride + 2] =
+ predictor[2 * stride + 1] =
+ predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
+ predictor[1 * stride + 3] =
+ predictor[2 * stride + 2] =
+ predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
+ predictor[2 * stride + 3] =
+ predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
+ predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
}
break;
@@ -770,22 +771,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
pp[7] = Above[2];
pp[8] = Above[3];
- predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[3 * 16 + 1] =
- predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[3 * 16 + 2] =
- predictor[2 * 16 + 1] =
- predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * 16 + 3] =
- predictor[2 * 16 + 2] =
- predictor[1 * 16 + 1] =
- predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[1 * 16 + 2] =
- predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[1 * 16 + 3] =
- predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+ predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[3 * stride + 1] =
+ predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[3 * stride + 2] =
+ predictor[2 * stride + 1] =
+ predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[3 * stride + 3] =
+ predictor[2 * stride + 2] =
+ predictor[1 * stride + 1] =
+ predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * stride + 3] =
+ predictor[1 * stride + 2] =
+ predictor[0 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[1 * stride + 3] =
+ predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
}
break;
@@ -805,22 +806,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
pp[8] = Above[3];
- predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[3 * 16 + 1] =
- predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 1] =
- predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
- predictor[3 * 16 + 2] =
- predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
- predictor[3 * 16 + 3] =
- predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
- predictor[2 * 16 + 3] =
- predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
- predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
+ predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[3 * stride + 1] =
+ predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * stride + 1] =
+ predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1;
+ predictor[3 * stride + 2] =
+ predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[2 * stride + 2] =
+ predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1;
+ predictor[3 * stride + 3] =
+ predictor[1 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[2 * stride + 3] =
+ predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1;
+ predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
+ predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1;
}
break;
@@ -829,22 +830,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
unsigned char *pp = Above;
- predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * 16 + 0] =
- predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
- predictor[1 * 16 + 1] =
- predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 1] =
- predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
- predictor[3 * 16 + 1] =
- predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[0 * 16 + 3] =
- predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[2 * stride + 0] =
+ predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[1 * stride + 1] =
+ predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * stride + 1] =
+ predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[3 * stride + 1] =
+ predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[0 * stride + 3] =
+ predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1;
+ predictor[1 * stride + 3] =
+ predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
}
break;
@@ -862,22 +863,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
pp[8] = Above[3];
- predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[2 * 16 + 0] =
- predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
- predictor[2 * 16 + 1] =
- predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[2 * 16 + 3] =
- predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
- predictor[1 * 16 + 2] =
- predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
- predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
- predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
+ predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[2 * stride + 0] =
+ predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[2 * stride + 1] =
+ predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * stride + 2] =
+ predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[2 * stride + 3] =
+ predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
+ predictor[1 * stride + 2] =
+ predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1;
+ predictor[1 * stride + 3] =
+ predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
+ predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
+ predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
}
break;
@@ -885,22 +886,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
case B_HU_PRED:
{
unsigned char *pp = Left;
- predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
- predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
- predictor[0 * 16 + 2] =
- predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
- predictor[0 * 16 + 3] =
- predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
- predictor[1 * 16 + 2] =
- predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
- predictor[1 * 16 + 3] =
- predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
- predictor[2 * 16 + 2] =
- predictor[2 * 16 + 3] =
- predictor[3 * 16 + 0] =
- predictor[3 * 16 + 1] =
- predictor[3 * 16 + 2] =
- predictor[3 * 16 + 3] = pp[3];
+ predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
+ predictor[0 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
+ predictor[0 * stride + 2] =
+ predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1;
+ predictor[0 * stride + 3] =
+ predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
+ predictor[1 * stride + 2] =
+ predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
+ predictor[1 * stride + 3] =
+ predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
+ predictor[2 * stride + 2] =
+ predictor[2 * stride + 3] =
+ predictor[3 * stride + 0] =
+ predictor[3 * stride + 1] =
+ predictor[3 * stride + 2] =
+ predictor[3 * stride + 3] = pp[3];
}
break;
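
Parameterizing the predictors by stride lets them write directly into the frame buffer rather than a fixed 16-byte-wide scratch block. A minimal sketch of the convention, using the DC fill from the first case above:

    /* Every mode now addresses the output as r * stride + c. */
    for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++)
            predictor[r * stride + c] = expected_dc;
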
diff --git a/vp8/decoder/reconintra_mt.h b/vp8/decoder/reconintra_mt.h
index d401295b2..4576a8064 100644
--- a/vp8/decoder/reconintra_mt.h
+++ b/vp8/decoder/reconintra_mt.h
@@ -19,7 +19,7 @@ extern void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, i
extern void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
extern void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
-extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int mb_row, int mb_col, int num);
+extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int stride, int mb_row, int mb_col, int num);
extern void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
#endif
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index bfe09735c..eba5830d5 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -138,11 +138,11 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
/* do prediction */
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
{
- vp8mt_build_intra_predictors_mbuv(pbi, xd, mb_row, mb_col);
+ vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col);
if (xd->mode_info_context->mbmi.mode != B_PRED)
{
- vp8mt_build_intra_predictors_mby(pbi, xd, mb_row, mb_col);
+ vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col);
} else {
vp8mt_intra_prediction_down_copy(pbi, xd, mb_row, mb_col);
}
@@ -201,7 +201,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
- xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
}
else if (xd->mode_info_context->mbmi.mode == B_PRED)
@@ -211,19 +211,21 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
BLOCKD *b = &xd->block[i];
int b_mode = xd->mode_info_context->bmi[i].as_mode;
- vp8mt_predict_intra4x4(pbi, xd, b_mode, b->predictor, mb_row, mb_col, i);
+ vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,
+ b->dst_stride, mb_row, mb_col, i);
if (xd->eobs[i] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)
- (b->qcoeff, b->dequant, b->predictor,
- *(b->base_dst) + b->dst, 16, b->dst_stride);
+ (b->qcoeff, b->dequant,
+ *(b->base_dst) + b->dst, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
- (b->qcoeff[0] * b->dequant[0], b->predictor,
- *(b->base_dst) + b->dst, 16, b->dst_stride);
+ (b->qcoeff[0] * b->dequant[0],
+ *(b->base_dst) + b->dst, b->dst_stride,
+ *(b->base_dst) + b->dst, b->dst_stride);
((int *)b->qcoeff)[0] = 0;
}
}
@@ -232,13 +234,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
{
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
- xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_buffer,
xd->dst.y_stride, xd->eobs);
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
(xd->qcoeff+16*16, xd->block[16].dequant,
- xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.u_buffer, xd->dst.v_buffer,
xd->dst.uv_stride, xd->eobs+16);
}
diff --git a/vp8/decoder/x86/dequantize_mmx.asm b/vp8/decoder/x86/dequantize_mmx.asm
index 0d6133a46..648bde4c5 100644
--- a/vp8/decoder/x86/dequantize_mmx.asm
+++ b/vp8/decoder/x86/dequantize_mmx.asm
@@ -50,14 +50,17 @@ sym(vp8_dequantize_b_impl_mmx):
ret
-;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
+;void dequant_idct_add_mmx(
+;short *input, 0
+;short *dq, 1
+;unsigned char *dest, 2
+;int stride) 3
global sym(vp8_dequant_idct_add_mmx)
sym(vp8_dequant_idct_add_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
+ SHADOW_ARGS_TO_STACK 4
GET_GOT rbx
- push rsi
push rdi
; end prolog
@@ -77,8 +80,8 @@ sym(vp8_dequant_idct_add_mmx):
movq mm3, [rax+24]
pmullw mm3, [rdx+24]
- mov rdx, arg(3) ;dest
- mov rsi, arg(2) ;pred
+ mov rdx, arg(2) ;dest
+
pxor mm7, mm7
@@ -89,8 +92,7 @@ sym(vp8_dequant_idct_add_mmx):
movq [rax+24],mm7
- movsxd rax, dword ptr arg(4) ;pitch
- movsxd rdi, dword ptr arg(5) ;stride
+ movsxd rdi, dword ptr arg(3) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
@@ -211,28 +213,27 @@ sym(vp8_dequant_idct_add_mmx):
pxor mm7, mm7
- movd mm4, [rsi]
+ movd mm4, [rdx]
punpcklbw mm4, mm7
paddsw mm0, mm4
packuswb mm0, mm7
movd [rdx], mm0
- movd mm4, [rsi+rax]
+ movd mm4, [rdx+rdi]
punpcklbw mm4, mm7
paddsw mm1, mm4
packuswb mm1, mm7
movd [rdx+rdi], mm1
- movd mm4, [rsi+2*rax]
+ movd mm4, [rdx+2*rdi]
punpcklbw mm4, mm7
paddsw mm2, mm4
packuswb mm2, mm7
movd [rdx+rdi*2], mm2
add rdx, rdi
- add rsi, rax
- movd mm4, [rsi+2*rax]
+ movd mm4, [rdx+2*rdi]
punpcklbw mm4, mm7
paddsw mm5, mm4
packuswb mm5, mm7
@@ -240,22 +241,24 @@ sym(vp8_dequant_idct_add_mmx):
; begin epilog
pop rdi
- pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
-;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
+;void dequant_dc_idct_add_mmx(
+;short *input, 0
+;short *dq, 1
+;unsigned char *dest, 2
+;int stride, 3
+;int Dc) 4
global sym(vp8_dequant_dc_idct_add_mmx)
sym(vp8_dequant_dc_idct_add_mmx):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
+ SHADOW_ARGS_TO_STACK 5
GET_GOT rbx
- push rsi
- push rdi
; end prolog
mov rax, arg(0) ;input
@@ -273,8 +276,7 @@ sym(vp8_dequant_dc_idct_add_mmx):
movq mm3, [rax+24]
pmullw mm3, [rdx+24]
- mov rdx, arg(3) ;dest
- mov rsi, arg(2) ;pred
+ mov rdx, arg(2) ;dest
pxor mm7, mm7
@@ -286,13 +288,12 @@ sym(vp8_dequant_dc_idct_add_mmx):
; move lower word of Dc to lower word of mm0
psrlq mm0, 16
- movzx rcx, word ptr arg(6) ;Dc
+ movzx rcx, word ptr arg(4) ;Dc
psllq mm0, 16
movq mm7, rcx
por mm0, mm7
- movsxd rax, dword ptr arg(4) ;pitch
- movsxd rdi, dword ptr arg(5) ;stride
+ movsxd rax, dword ptr arg(3) ;stride
psubw mm0, mm2 ; b1= 0-2
paddw mm2, mm2 ;
@@ -413,36 +414,33 @@ sym(vp8_dequant_dc_idct_add_mmx):
pxor mm7, mm7
- movd mm4, [rsi]
+ movd mm4, [rdx]
punpcklbw mm4, mm7
paddsw mm0, mm4
packuswb mm0, mm7
movd [rdx], mm0
- movd mm4, [rsi+rax]
+ movd mm4, [rdx+rax]
punpcklbw mm4, mm7
paddsw mm1, mm4
packuswb mm1, mm7
- movd [rdx+rdi], mm1
+ movd [rdx+rax], mm1
- movd mm4, [rsi+2*rax]
+ movd mm4, [rdx+2*rax]
punpcklbw mm4, mm7
paddsw mm2, mm4
packuswb mm2, mm7
- movd [rdx+rdi*2], mm2
+ movd [rdx+rax*2], mm2
- add rdx, rdi
- add rsi, rax
+ add rdx, rax
- movd mm4, [rsi+2*rax]
+ movd mm4, [rdx+2*rax]
punpcklbw mm4, mm7
paddsw mm5, mm4
packuswb mm5, mm7
- movd [rdx+rdi*2], mm5
+ movd [rdx+rax*2], mm5
; begin epilog
- pop rdi
- pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
diff --git a/vp8/decoder/x86/idct_blk_mmx.c b/vp8/decoder/x86/idct_blk_mmx.c
index 558dbaf7e..37de5b9fd 100644
--- a/vp8/decoder/x86/idct_blk_mmx.c
+++ b/vp8/decoder/x86/idct_blk_mmx.c
@@ -13,7 +13,7 @@
#include "vp8/decoder/dequantize.h"
void vp8_dequant_dc_idct_add_y_block_mmx
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dst, int stride, char *eobs, short *dc)
{
int i;
@@ -21,35 +21,34 @@ void vp8_dequant_dc_idct_add_y_block_mmx
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
- vp8_dequant_dc_idct_add_mmx (q, dq, pre, dst, 16, stride, dc[0]);
- else
- vp8_dc_only_idct_add_mmx (dc[0], pre, dst, 16, stride);
+ vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]);
+ else if (eobs[0] == 1)
+ vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride);
if (eobs[1] > 1)
- vp8_dequant_dc_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
- else
- vp8_dc_only_idct_add_mmx (dc[1], pre+4, dst+4, 16, stride);
+ vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]);
+ else if (eobs[1] == 1)
+ vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride);
if (eobs[2] > 1)
- vp8_dequant_dc_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
- else
- vp8_dc_only_idct_add_mmx (dc[2], pre+8, dst+8, 16, stride);
+ vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]);
+ else if (eobs[2] == 1)
+ vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride);
if (eobs[3] > 1)
- vp8_dequant_dc_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
- else
- vp8_dc_only_idct_add_mmx (dc[3], pre+12, dst+12, 16, stride);
+ vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]);
+ else if (eobs[3] == 1)
+ vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride);
q += 64;
dc += 4;
- pre += 64;
dst += 4*stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_y_block_mmx
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
{
int i;
@@ -57,46 +56,48 @@ void vp8_dequant_idct_add_y_block_mmx
for (i = 0; i < 4; i++)
{
if (eobs[0] > 1)
- vp8_dequant_idct_add_mmx (q, dq, pre, dst, 16, stride);
- else
+ vp8_dequant_idct_add_mmx (q, dq, dst, stride);
+ else if (eobs[0] == 1)
{
- vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dst, 16, stride);
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
- vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride);
- else
+ vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
+ else if (eobs[1] == 1)
{
- vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dst+4, 16, stride);
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
+ dst+4, stride);
((int *)(q+16))[0] = 0;
}
if (eobs[2] > 1)
- vp8_dequant_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride);
- else
+ vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
+ else if (eobs[2] == 1)
{
- vp8_dc_only_idct_add_mmx (q[32]*dq[0], pre+8, dst+8, 16, stride);
+ vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
+ dst+8, stride);
((int *)(q+32))[0] = 0;
}
if (eobs[3] > 1)
- vp8_dequant_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride);
- else
+ vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
+ else if (eobs[3] == 1)
{
- vp8_dc_only_idct_add_mmx (q[48]*dq[0], pre+12, dst+12, 16, stride);
+ vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
+ dst+12, stride);
((int *)(q+48))[0] = 0;
}
q += 64;
- pre += 64;
dst += 4*stride;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_mmx
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
int i;
@@ -104,23 +105,23 @@ void vp8_dequant_idct_add_uv_block_mmx
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
- vp8_dequant_idct_add_mmx (q, dq, pre, dstu, 8, stride);
- else
+ vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
+ else if (eobs[0] == 1)
{
- vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstu, 8, stride);
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
- vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstu+4, 8, stride);
- else
+ vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
+ else if (eobs[1] == 1)
{
- vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstu+4, 8, stride);
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
+ dstu+4, stride);
((int *)(q+16))[0] = 0;
}
q += 32;
- pre += 32;
dstu += 4*stride;
eobs += 2;
}
@@ -128,23 +129,23 @@ void vp8_dequant_idct_add_uv_block_mmx
for (i = 0; i < 2; i++)
{
if (eobs[0] > 1)
- vp8_dequant_idct_add_mmx (q, dq, pre, dstv, 8, stride);
- else
+ vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
+ else if (eobs[0] == 1)
{
- vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstv, 8, stride);
+ vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
((int *)q)[0] = 0;
}
if (eobs[1] > 1)
- vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstv+4, 8, stride);
- else
+ vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
+ else if (eobs[1] == 1)
{
- vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstv+4, 8, stride);
+ vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
+ dstv+4, stride);
((int *)(q+16))[0] = 0;
}
q += 32;
- pre += 32;
dstv += 4*stride;
eobs += 2;
}
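
The rewritten wrappers split the old two-way branch into three: blocks whose eob is 0 are now skipped outright instead of taking the DC shortcut with a zero coefficient. The per-block pattern, restated as a sketch:

    if (eobs[0] > 1)           /* several coefficients: full path */
        vp8_dequant_idct_add_mmx(q, dq, dst, stride);
    else if (eobs[0] == 1)     /* DC only: scalar add, then clear q */
    {
        vp8_dc_only_idct_add_mmx(q[0] * dq[0], dst, stride, dst, stride);
        ((int *)q)[0] = 0;
    }
    /* eobs[0] == 0: empty block, nothing to add */
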
diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c
index a6a720639..0495b0610 100644
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/decoder/x86/idct_blk_sse2.c
@@ -13,102 +13,115 @@
#include "vp8/decoder/dequantize.h"
void vp8_idct_dequant_dc_0_2x_sse2
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dst, int dst_stride, short *dc);
void vp8_idct_dequant_dc_full_2x_sse2
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dst, int dst_stride, short *dc);
void vp8_idct_dequant_0_2x_sse2
- (short *q, short *dq ,unsigned char *pre,
- unsigned char *dst, int dst_stride, int blk_stride);
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
void vp8_idct_dequant_full_2x_sse2
- (short *q, short *dq ,unsigned char *pre,
- unsigned char *dst, int dst_stride, int blk_stride);
+ (short *q, short *dq ,
+ unsigned char *dst, int dst_stride);
void vp8_dequant_dc_idct_add_y_block_sse2
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dst, int stride, char *eobs, short *dc)
{
int i;
for (i = 0; i < 4; i++)
{
- if (((short *)(eobs))[0] & 0xfefe)
- vp8_idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc);
- else
- vp8_idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc);
-
- if (((short *)(eobs))[1] & 0xfefe)
- vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
- else
- vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
-
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc);
+ else
+ vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc);
+ }
+
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
+ else
+ vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
+ }
q += 64;
dc += 4;
- pre += 64;
dst += stride*4;
eobs += 4;
}
}
void vp8_dequant_idct_add_y_block_sse2
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dst, int stride, char *eobs)
{
int i;
for (i = 0; i < 4; i++)
{
- if (((short *)(eobs))[0] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16);
-
- if (((short *)(eobs))[1] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
- else
- vp8_idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
-
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
+ }
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
+ }
q += 64;
- pre += 64;
dst += stride*4;
eobs += 4;
}
}
void vp8_dequant_idct_add_uv_block_sse2
- (short *q, short *dq, unsigned char *pre,
+ (short *q, short *dq,
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
{
- if (((short *)(eobs))[0] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
-
+ if (((short *)(eobs))[0])
+ {
+ if (((short *)(eobs))[0] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
q += 32;
- pre += 32;
dstu += stride*4;
- if (((short *)(eobs))[1] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
-
+ if (((short *)(eobs))[1])
+ {
+ if (((short *)(eobs))[1] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
+ }
q += 32;
- pre += 32;
-
- if (((short *)(eobs))[2] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
+ if (((short *)(eobs))[2])
+ {
+ if (((short *)(eobs))[2] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
q += 32;
- pre += 32;
dstv += stride*4;
- if (((short *)(eobs))[3] & 0xfefe)
- vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
- else
- vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
+ if (((short *)(eobs))[3])
+ {
+ if (((short *)(eobs))[3] & 0xfefe)
+ vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
+ else
+ vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
+ }
}
diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
index c00375e88..c061b2fab 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
@@ -65,7 +65,7 @@
numparts_loop
ldr r10, [sp, #40] ; ptr
ldr r5, [sp, #36] ; move mb_rows to the counting section
- sub r5, r5, r11 ; move start point with each partition
+ subs r5, r5, r11 ; move start point with each partition
; mb_rows starts at i
str r5, [sp, #12]
@@ -80,6 +80,8 @@ numparts_loop
str r2, [r0, #vp8_writer_pos]
str r10, [r0, #vp8_writer_buffer]
+ ble end_partition ; if (mb_rows <= 0) end partition
+
mb_row_loop
ldr r1, [r7, #tokenlist_start]
@@ -344,6 +346,7 @@ check_p_lt_stop
str r6, [sp, #12]
bgt mb_row_loop
+end_partition
mov r12, #32
stop_encode_loop
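
In C terms, the subs/ble pair adds an early-out so a partition with no macroblock rows writes its header and skips the row loop entirely. A sketch (encode_mb_row is a hypothetical stand-in for the loop body):

    mb_rows -= i;               /* start point moves with each partition */
    if (mb_rows <= 0)
        goto end_partition;     /* empty partition: no rows to pack */

    do
        encode_mb_row();
    while (--mb_rows > 0);
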
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 510e4cc98..7f2b46daa 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -456,7 +456,7 @@ void encode_mb_row(VP8_COMP *cpi,
vp8_activity_masking(cpi, x);
// Is segmentation enabled
- // MB level adjutment to quantizer
+ // MB level adjustment to quantizer
if (xd->segmentation_enabled)
{
// Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
@@ -505,7 +505,8 @@ void encode_mb_row(VP8_COMP *cpi,
// Special case code for cyclic refresh
// If cyclic update enabled then copy xd->mbmi.segment_id; (which may have been updated based on mode
// during vp8cx_encode_inter_macroblock()) back into the global segmentation map
- if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+ if ((cpi->current_layer == 0) &&
+ (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled))
{
cpi->segmentation_map[map_index+mb_col] = xd->mode_info_context->mbmi.segment_id;
@@ -648,6 +649,30 @@ void init_encode_frame_mb_context(VP8_COMP *cpi)
+ vp8_cost_one(255)
+ vp8_cost_one(128);
}
+ else if ((cpi->oxcf.number_of_layers > 1) &&
+ (cpi->ref_frame_flags == VP8_GOLD_FLAG))
+ {
+ xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_zero(1);
+ xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(1)
+ + vp8_cost_zero(255);
+ xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(1)
+ + vp8_cost_one(255);
+ }
+ else if ((cpi->oxcf.number_of_layers > 1) &&
+ (cpi->ref_frame_flags == VP8_ALT_FLAG))
+ {
+ xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_zero(1);
+ xd->ref_frame_cost[GOLDEN_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(1)
+ + vp8_cost_zero(1);
+ xd->ref_frame_cost[ALTREF_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
+ + vp8_cost_one(1)
+ + vp8_cost_one(1);
+ }
else
{
xd->ref_frame_cost[LAST_FRAME] = vp8_cost_one(cpi->prob_intra_coded)
@@ -937,7 +962,8 @@ void vp8_encode_frame(VP8_COMP *cpi)
// Adjust the projected reference frame usage probability numbers to reflect
// what we have just seen. This may be useful when we make multiple iterations
// of the recode loop rather than continuing to use values from the previous frame.
- if ((cm->frame_type != KEY_FRAME) && !cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)
+ if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
+ (!cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)))
{
const int *const rfct = cpi->count_mb_ref_frame_usage;
const int rf_intra = rfct[INTRA_FRAME];
@@ -1220,7 +1246,7 @@ int vp8cx_encode_inter_macroblock
if (xd->segmentation_enabled)
{
// If cyclic update enabled
- if (cpi->cyclic_refresh_mode_enabled)
+ if (cpi->current_layer == 0 && cpi->cyclic_refresh_mode_enabled)
{
// Clear segment_id back to 0 if not coded (last frame 0,0)
if ((xd->mode_info_context->mbmi.segment_id == 1) &&
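
The new branches reuse the reference-frame tree costing: vp8_cost_zero(p) and vp8_cost_one(p) give the bit cost of coding a zero or a one at probability p, and the pinned probabilities 1 and 255 steer the tree toward the only reference a layer is allowed to use. The general shape, as a sketch (prob_last and prob_gf stand for the two branch probabilities that the code above pins to 1 or 255):

    cost[LAST_FRAME]   = vp8_cost_one(prob_intra) + vp8_cost_zero(prob_last);
    cost[GOLDEN_FRAME] = vp8_cost_one(prob_intra) + vp8_cost_one(prob_last)
                       + vp8_cost_zero(prob_gf);
    cost[ALTREF_FRAME] = vp8_cost_one(prob_intra) + vp8_cost_one(prob_last)
                       + vp8_cost_one(prob_gf);
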
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index 4a77c1ff3..74e40323d 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -64,7 +64,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
BLOCK *be = &x->block[ib];
RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
- (b, b->bmi.as_mode, b->predictor);
+ (b, b->bmi.as_mode, b->predictor, 16);
ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
@@ -72,9 +72,8 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
x->quantize_b(be, b);
- vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);
+ vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16);
- RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
}
void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
@@ -106,9 +105,6 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- RECON_INVOKE(&rtcd->common->recon, recon_mby)
- (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
-
}
void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
@@ -126,5 +122,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index ff9e3e6ee..b3c7df502 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -577,9 +577,70 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
}
}
+static void recon_dcblock(MACROBLOCKD *x)
+{
+ BLOCKD *b = &x->block[24];
+ int i;
+
+ for (i = 0; i < 16; i++)
+ {
+ x->block[i].dqcoeff[0] = b->diff[i];
+ }
+
+}
+
+
+static void inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *x)
+{
+ int i;
+
+ if (x->mode_info_context->mbmi.mode != B_PRED &&
+ x->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ /* do 2nd order transform on the dc block */
+
+ IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
+ recon_dcblock(x);
+ }
+
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &x->block[i];
+
+ if (b->eob > 1)
+ {
+ IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, 16,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, 16,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ }
+
+
+ for (i = 16; i < 24; i++)
+ {
+ BLOCKD *b = &x->block[i];
+
+ if (b->eob > 1)
+ {
+ IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, 8,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, 8,
+ *(b->base_dst) + b->dst, b->dst_stride);
+ }
+ }
+
+}
void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
{
- vp8_build_inter_predictors_mb(&x->e_mbd);
+ vp8_build_inter_predictors_mb_e(&x->e_mbd);
vp8_subtract_mb(rtcd, x);
@@ -590,10 +651,8 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
if (x->optimize)
optimize_mb(x, rtcd);
- vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+ inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- RECON_INVOKE(&rtcd->common->recon, recon_mb)
- (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}
@@ -612,6 +671,4 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
- RECON_INVOKE(&rtcd->common->recon, recon_mby)
- (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
}
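
The separate recon calls disappear above because reconstruction is now fused
into the inverse transform: idct16 and idct1_scalar_add take the predictor
(plus its stride) and write reconstructed pixels directly to the destination.
A minimal C sketch of the DC-only entry, assuming the usual VP8
(dc + 4) >> 3 rounding and the argument order visible in inverse_transform_mb:

    static void idct1_scalar_add_sketch(short dc,
                                        unsigned char *pred, int pred_stride,
                                        unsigned char *dst, int dst_stride)
    {
        /* A single dequantized DC coefficient adds the same offset to
         * every pixel of the 4x4 block; results are clamped to 8 bits. */
        int a = (dc + 4) >> 3;
        int r, c;

        for (r = 0; r < 4; r++)
        {
            for (c = 0; c < 4; c++)
            {
                int v = pred[c] + a;
                dst[c] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred += pred_stride;
            dst += dst_stride;
        }
    }
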
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index cac92057c..43c971480 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -237,6 +237,79 @@ void vp8_initialize()
extern FILE *vpxlogc;
#endif
+static void save_layer_context(VP8_COMP *cpi)
+{
+ LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer];
+
+ // Save layer dependent coding state
+ lc->target_bandwidth = cpi->target_bandwidth;
+ //lc->target_bandwidth = cpi->oxcf.target_bandwidth;
+ lc->starting_buffer_level = cpi->oxcf.starting_buffer_level;
+ lc->optimal_buffer_level = cpi->oxcf.optimal_buffer_level;
+ lc->maximum_buffer_size = cpi->oxcf.maximum_buffer_size;
+ lc->buffer_level = cpi->buffer_level;
+ lc->bits_off_target = cpi->bits_off_target;
+ lc->total_actual_bits = cpi->total_actual_bits;
+ lc->worst_quality = cpi->worst_quality;
+ lc->active_worst_quality = cpi->active_worst_quality;
+ lc->best_quality = cpi->best_quality;
+ lc->active_best_quality = cpi->active_best_quality;
+ lc->ni_av_qi = cpi->ni_av_qi;
+ lc->ni_tot_qi = cpi->ni_tot_qi;
+ lc->ni_frames = cpi->ni_frames;
+ lc->avg_frame_qindex = cpi->avg_frame_qindex;
+ lc->rate_correction_factor = cpi->rate_correction_factor;
+ lc->key_frame_rate_correction_factor = cpi->key_frame_rate_correction_factor;
+ lc->gf_rate_correction_factor = cpi->gf_rate_correction_factor;
+ lc->zbin_over_quant = cpi->zbin_over_quant;
+ lc->inter_frame_target = cpi->inter_frame_target;
+ lc->total_byte_count = cpi->total_byte_count;
+ lc->filter_level = cpi->common.filter_level;
+
+ lc->last_frame_percent_intra = cpi->last_frame_percent_intra;
+
+ memcpy (lc->count_mb_ref_frame_usage,
+ cpi->count_mb_ref_frame_usage,
+ sizeof(cpi->count_mb_ref_frame_usage));
+}
+
+static void restore_layer_context(VP8_COMP *cpi, const int layer)
+{
+ LAYER_CONTEXT *lc = &cpi->layer_context[layer];
+
+ // Restore layer dependent coding state
+ cpi->current_layer = layer;
+ cpi->target_bandwidth = lc->target_bandwidth;
+ cpi->oxcf.target_bandwidth = lc->target_bandwidth;
+ cpi->oxcf.starting_buffer_level = lc->starting_buffer_level;
+ cpi->oxcf.optimal_buffer_level = lc->optimal_buffer_level;
+ cpi->oxcf.maximum_buffer_size = lc->maximum_buffer_size;
+ cpi->buffer_level = lc->buffer_level;
+ cpi->bits_off_target = lc->bits_off_target;
+ cpi->total_actual_bits = lc->total_actual_bits;
+ //cpi->worst_quality = lc->worst_quality;
+ cpi->active_worst_quality = lc->active_worst_quality;
+ //cpi->best_quality = lc->best_quality;
+ cpi->active_best_quality = lc->active_best_quality;
+ cpi->ni_av_qi = lc->ni_av_qi;
+ cpi->ni_tot_qi = lc->ni_tot_qi;
+ cpi->ni_frames = lc->ni_frames;
+ cpi->avg_frame_qindex = lc->avg_frame_qindex;
+ cpi->rate_correction_factor = lc->rate_correction_factor;
+ cpi->key_frame_rate_correction_factor = lc->key_frame_rate_correction_factor;
+ cpi->gf_rate_correction_factor = lc->gf_rate_correction_factor;
+ cpi->zbin_over_quant = lc->zbin_over_quant;
+ cpi->inter_frame_target = lc->inter_frame_target;
+ cpi->total_byte_count = lc->total_byte_count;
+ cpi->common.filter_level = lc->filter_level;
+
+ cpi->last_frame_percent_intra = lc->last_frame_percent_intra;
+
+ memcpy (cpi->count_mb_ref_frame_usage,
+ lc->count_mb_ref_frame_usage,
+ sizeof(cpi->count_mb_ref_frame_usage));
+}
+
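
These two helpers bracket every encoded frame once multiple layers are in
use; condensed from the vp8_get_compressed_data changes later in this patch
(a sketch of the calling pattern, not additional code):

    if (cpi->oxcf.number_of_layers > 1)
    {
        /* Map this frame to its layer via the periodicity pattern,
         * then adopt that layer's rate-control state. */
        restore_layer_context(cpi,
            cpi->oxcf.layer_id[cm->current_video_frame %
                               cpi->oxcf.periodicity]);
    }

    /* ... encode the frame ... */

    if (cpi->oxcf.number_of_layers > 1)
        save_layer_context(cpi);   /* write the updated state back */
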
static void setup_features(VP8_COMP *cpi)
{
// Set up default state for MB feature flags
@@ -510,7 +583,7 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
    // Delete segmentation map
- vpx_free(seg_map);
+ vpx_free(seg_map);
seg_map = 0;
@@ -1397,11 +1470,13 @@ void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
if(framerate < .1)
framerate = 30;
- cpi->oxcf.frame_rate = framerate;
- cpi->output_frame_rate = cpi->oxcf.frame_rate;
- cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
- cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
- cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+ cpi->oxcf.frame_rate = framerate;
+ cpi->output_frame_rate = cpi->oxcf.frame_rate;
+ cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth /
+ cpi->output_frame_rate);
+ cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
+ cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
// Set Maximum gf/arf interval
cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
@@ -1472,6 +1547,65 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cpi->total_actual_bits = 0;
cpi->total_target_vs_actual = 0;
+    // Temporal scalability
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+        double prev_layer_frame_rate = 0;
+
+ for (i=0; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+
+ // Layer configuration
+ lc->frame_rate =
+ cpi->output_frame_rate / cpi->oxcf.rate_decimator[i];
+ lc->target_bandwidth = cpi->oxcf.target_bitrate[i] * 1000;
+
+ lc->starting_buffer_level =
+ rescale(oxcf->starting_buffer_level,
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->optimal_buffer_level == 0)
+ lc->optimal_buffer_level = lc->target_bandwidth / 8;
+ else
+ lc->optimal_buffer_level =
+ rescale(oxcf->optimal_buffer_level,
+ lc->target_bandwidth, 1000);
+
+ if (oxcf->maximum_buffer_size == 0)
+ lc->maximum_buffer_size = lc->target_bandwidth / 8;
+ else
+ lc->maximum_buffer_size =
+ rescale(oxcf->maximum_buffer_size,
+ lc->target_bandwidth, 1000);
+
+ // Work out the average size of a frame within this layer
+ if (i > 0)
+ lc->avg_frame_size_for_layer = (cpi->oxcf.target_bitrate[i] -
+ cpi->oxcf.target_bitrate[i-1]) * 1000 /
+ (lc->frame_rate - prev_layer_frame_rate);
+
+ lc->active_worst_quality = cpi->oxcf.worst_allowed_q;
+ lc->active_best_quality = cpi->oxcf.best_allowed_q;
+ lc->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+
+ lc->buffer_level = lc->starting_buffer_level;
+ lc->bits_off_target = lc->starting_buffer_level;
+
+ lc->total_actual_bits = 0;
+ lc->ni_av_qi = 0;
+ lc->ni_tot_qi = 0;
+ lc->ni_frames = 0;
+ lc->rate_correction_factor = 1.0;
+ lc->key_frame_rate_correction_factor = 1.0;
+ lc->gf_rate_correction_factor = 1.0;
+        lc->inter_frame_target = 0;
+
+ prev_layer_frame_rate = lc->frame_rate;
+ }
+ }
+
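
A worked example with hypothetical numbers: at output_frame_rate = 30 with
rate_decimator = {4, 2, 1} and target_bitrate = {100, 200, 400} kbps, the
layer frame rates come out to 7.5, 15 and 30 fps, and layer 1's average
frame size is (200 - 100) * 1000 / (15 - 7.5) = 13333 bits; layer 2 works
out the same, (400 - 200) * 1000 / (30 - 15) = 13333 bits. Each enhancement
layer is budgeted only for the bits it adds on top of the layers below it.
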
#if VP8_TEMPORAL_ALT_REF
{
int i;
@@ -1693,11 +1827,11 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
- cm->Width = cpi->oxcf.Width ;
- cm->Height = cpi->oxcf.Height ;
+ cm->Width = cpi->oxcf.Width;
+ cm->Height = cpi->oxcf.Height;
cm->horiz_scale = cpi->horiz_scale;
- cm->vert_scale = cpi->vert_scale ;
+ cm->vert_scale = cpi->vert_scale;
// VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
if (cpi->oxcf.Sharpness > 7)
@@ -1828,7 +1962,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->prob_gf_coded = 128;
cpi->prob_intra_coded = 63;
- // Prime the recent reference frame useage counters.
+ // Prime the recent reference frame usage counters.
// Hereafter they will be maintained as a sort of moving average
cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
@@ -2143,35 +2277,106 @@ void vp8_remove_compressor(VP8_PTR *ptr)
FILE *f = fopen("opsnr.stt", "a");
double time_encoded = (cpi->last_end_time_stamp_seen
- cpi->first_time_stamp_ever) / 10000000.000;
- double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
- double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded;
+ double total_encode_time = (cpi->time_receive_data +
+ cpi->time_compress_data) / 1000.000;
+ double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded;
if (cpi->b_calculate_psnr)
{
- YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
- double samples = 3.0 / 2 * cpi->count * lst_yv12->y_width * lst_yv12->y_height;
- double total_psnr = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error);
- double total_psnr2 = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error2);
- double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
-
- fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t Time(us)\n");
- fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n",
- dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
- total_encode_time);
+ YV12_BUFFER_CONFIG *lst_yv12 =
+ &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ fprintf(f, "Layer\tBitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
+ "GLPsnrP\tVPXSSIM\t\n");
+ for (i=0; i<cpi->oxcf.number_of_layers; i++)
+ {
+ double dr = (double)cpi->bytes_in_layer[i] *
+ 8.0 / 1000.0 / time_encoded;
+ double samples = 3.0 / 2 * cpi->frames_in_layer[i] *
+ lst_yv12->y_width * lst_yv12->y_height;
+ double total_psnr = vp8_mse2psnr(samples, 255.0,
+ cpi->total_error2[i]);
+ double total_psnr2 = vp8_mse2psnr(samples, 255.0,
+ cpi->total_error2_p[i]);
+ double total_ssim = 100 * pow(cpi->sum_ssim[i] /
+ cpi->sum_weights[i], 8.0);
+
+ fprintf(f, "%5d\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%7.3f\n",
+ i, dr,
+ cpi->sum_psnr[i] / cpi->frames_in_layer[i],
+ total_psnr,
+ cpi->sum_psnr_p[i] / cpi->frames_in_layer[i],
+ total_psnr2, total_ssim);
+ }
+ }
+ else
+ {
+ double samples = 3.0 / 2 * cpi->count *
+ lst_yv12->y_width * lst_yv12->y_height;
+ double total_psnr = vp8_mse2psnr(samples, 255.0,
+ cpi->total_sq_error);
+ double total_psnr2 = vp8_mse2psnr(samples, 255.0,
+ cpi->total_sq_error2);
+ double total_ssim = 100 * pow(cpi->summed_quality /
+ cpi->summed_weights, 8.0);
+
+ fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\t"
+ "GLPsnrP\tVPXSSIM\t Time(us)\n");
+ fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+ "%7.3f\t%8.0f\n",
+ dr, cpi->total / cpi->count, total_psnr,
+ cpi->totalp / cpi->count, total_psnr2,
+ total_ssim, total_encode_time);
+ }
}
if (cpi->b_calculate_ssimg)
{
- fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t Time(us)\n");
- fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
- cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
- cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ fprintf(f, "Layer\tBitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
+ "Time(us)\n");
+ for (i=0; i<cpi->oxcf.number_of_layers; i++)
+ {
+ double dr = (double)cpi->bytes_in_layer[i] *
+ 8.0 / 1000.0 / time_encoded;
+ fprintf(f, "%5d\t%7.3f\t%6.4f\t"
+ "%6.4f\t%6.4f\t%6.4f\t%8.0f\n",
+ i, dr,
+ cpi->total_ssimg_y_in_layer[i] /
+ cpi->frames_in_layer[i],
+ cpi->total_ssimg_u_in_layer[i] /
+ cpi->frames_in_layer[i],
+ cpi->total_ssimg_v_in_layer[i] /
+ cpi->frames_in_layer[i],
+ cpi->total_ssimg_all_in_layer[i] /
+ cpi->frames_in_layer[i],
+ total_encode_time);
+ }
+ }
+ else
+ {
+ fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
+ "Time(us)\n");
+ fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+ cpi->total_ssimg_y / cpi->count,
+ cpi->total_ssimg_u / cpi->count,
+ cpi->total_ssimg_v / cpi->count,
+ cpi->total_ssimg_all / cpi->count, total_encode_time);
+ }
}
fclose(f);
#if 0
f = fopen("qskip.stt", "a");
- fprintf(f, "minq:%d -maxq:%d skipture:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount);
+ fprintf(f, "minq:%d -maxq:%d skiptrue:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount);
fclose(f);
#endif
@@ -2841,10 +3046,41 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
}
else if (!(rf_intra + rf_inter))
{
- // This is a trap in case this function is called with cpi->recent_ref_frame_usage[] blank.
- cpi->prob_intra_coded = 63;
- cpi->prob_last_coded = 128;
- cpi->prob_gf_coded = 128;
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ if (cpi->ref_frame_flags == VP8_LAST_FLAG)
+ {
+ cpi->prob_intra_coded = 63;
+ cpi->prob_last_coded = 255;
+ cpi->prob_gf_coded = 128;
+ }
+ else if (cpi->ref_frame_flags == VP8_GOLD_FLAG)
+ {
+ cpi->prob_intra_coded = 63;
+ cpi->prob_last_coded = 1;
+ cpi->prob_gf_coded = 255;
+ }
+ else if (cpi->ref_frame_flags == VP8_ALT_FLAG)
+ {
+ cpi->prob_intra_coded = 63;
+ cpi->prob_last_coded = 1;
+ cpi->prob_gf_coded = 1;
+ }
+ else
+ {
+ cpi->prob_intra_coded = 63;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
+ }
+ else
+ {
+ // This is a trap in case this function is called with
+ // cpi->recent_ref_frame_usage[] blank.
+ cpi->prob_intra_coded = 63;
+ cpi->prob_last_coded = 128;
+ cpi->prob_gf_coded = 128;
+ }
}
else
{
@@ -2866,32 +3102,33 @@ static void update_rd_ref_frame_probs(VP8_COMP *cpi)
}
// update reference frame costs since we can do better than what we got last frame.
-
- if (cpi->common.refresh_alt_ref_frame)
- {
- cpi->prob_intra_coded += 40;
- cpi->prob_last_coded = 200;
- cpi->prob_gf_coded = 1;
- }
- else if (cpi->common.frames_since_golden == 0)
+ if (cpi->oxcf.number_of_layers == 1)
{
- cpi->prob_last_coded = 214;
- cpi->prob_gf_coded = 1;
- }
- else if (cpi->common.frames_since_golden == 1)
- {
- cpi->prob_last_coded = 192;
- cpi->prob_gf_coded = 220;
- }
- else if (cpi->source_alt_ref_active)
- {
- //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden;
- cpi->prob_gf_coded -= 20;
+ if (cpi->common.refresh_alt_ref_frame)
+ {
+ cpi->prob_intra_coded += 40;
+ cpi->prob_last_coded = 200;
+ cpi->prob_gf_coded = 1;
+ }
+ else if (cpi->common.frames_since_golden == 0)
+ {
+ cpi->prob_last_coded = 214;
+ cpi->prob_gf_coded = 1;
+ }
+ else if (cpi->common.frames_since_golden == 1)
+ {
+ cpi->prob_last_coded = 192;
+ cpi->prob_gf_coded = 220;
+ }
+ else if (cpi->source_alt_ref_active)
+ {
+ //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden;
+ cpi->prob_gf_coded -= 20;
- if (cpi->prob_gf_coded < 10)
- cpi->prob_gf_coded = 10;
+ if (cpi->prob_gf_coded < 10)
+ cpi->prob_gf_coded = 10;
+ }
}
-
#endif
}
@@ -3283,7 +3520,6 @@ static void encode_frame_to_data_rate
// Enable or disable mode based tweaking of the zbin
// For 2 Pass Only used where GF/ARF prediction quality
// is above a threshold
- cpi->zbin_mode_boost = 0;
cpi->zbin_mode_boost_enabled = TRUE;
if (cpi->pass == 2)
{
@@ -3432,6 +3668,19 @@ static void encode_frame_to_data_rate
cpi->buffer_level = cpi->bits_off_target;
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ // Propagate bits saved by dropping the frame to higher layers
+ for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+ {
+ cpi->layer_context[i].bits_off_target
+ += cpi->av_per_frame_bandwidth;
+            cpi->layer_context[i].buffer_level =
+                cpi->layer_context[i].bits_off_target;
+ }
+ }
+
return;
}
else
@@ -3478,7 +3727,7 @@ static void encode_frame_to_data_rate
}
// Set an active best quality and if necessary active worst quality
- // There is some odd behaviour for one pass here that needs attention.
+ // There is some odd behavior for one pass here that needs attention.
if ( (cpi->pass == 2) || (cpi->ni_frames > 150))
{
vp8_clear_system_state();
@@ -3510,13 +3759,14 @@ static void encode_frame_to_data_rate
cpi->active_best_quality = kf_high_motion_minq[Q];
}
- else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
+ else if (cpi->oxcf.number_of_layers==1 &&
+ (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame))
{
// Use the lower of cpi->active_worst_quality and recent
// average Q as basis for GF/ARF Q limit unless last frame was
// a key frame.
if ( (cpi->frames_since_key > 1) &&
- (cpi->avg_frame_qindex < cpi->active_worst_quality) )
+ (cpi->avg_frame_qindex < cpi->active_worst_quality) )
{
Q = cpi->avg_frame_qindex;
}
@@ -3617,13 +3867,17 @@ static void encode_frame_to_data_rate
// Set highest allowed value for Zbin over quant
if (cm->frame_type == KEY_FRAME)
zbin_oq_high = 0; //ZBIN_OQ_MAX/16
- else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
- zbin_oq_high = 16;
+ else if ((cpi->oxcf.number_of_layers == 1) && ((cm->refresh_alt_ref_frame ||
+ (cm->refresh_golden_frame && !cpi->source_alt_ref_active))))
+ {
+ zbin_oq_high = 16;
+ }
else
zbin_oq_high = ZBIN_OQ_MAX;
- // Setup background Q adjustment for error resilliant mode
- if (cpi->cyclic_refresh_mode_enabled)
+ // Setup background Q adjustment for error resilient mode.
+ // For multi-layer encodes only enable this for the base layer.
+ if (cpi->cyclic_refresh_mode_enabled && (cpi->current_layer==0))
cyclic_background_refresh(cpi, Q, 0);
vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
@@ -3756,10 +4010,8 @@ static void encode_frame_to_data_rate
if (cpi->prob_skip_false > 250)
cpi->prob_skip_false = 250;
- if (cpi->is_src_frame_alt_ref)
+ if (cpi->oxcf.number_of_layers == 1 && cpi->is_src_frame_alt_ref)
cpi->prob_skip_false = 1;
-
-
}
#if 0
@@ -4111,9 +4363,10 @@ static void encode_frame_to_data_rate
}
    // Update the GF usage maps.
- // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
- // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
- vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
+ // This is done after completing the compression of a frame when all
+ // modes etc. are finalized but before loop filter
+ if (cpi->oxcf.number_of_layers == 1)
+ vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
if (cm->frame_type == KEY_FRAME)
cm->refresh_last_frame = 1;
@@ -4179,6 +4432,13 @@ static void encode_frame_to_data_rate
cpi->total_byte_count += (*size);
cpi->projected_frame_size = (*size) << 3;
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+ for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+ cpi->layer_context[i].total_byte_count += (*size);
+ }
+
if (!active_worst_qchanged)
vp8_update_rate_correction_factors(cpi, 2);
@@ -4194,7 +4454,8 @@ static void encode_frame_to_data_rate
cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
// Keep a record from which we can calculate the average Q excluding GF updates and key frames
- if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame)
+ if ((cm->frame_type != KEY_FRAME) && ((cpi->oxcf.number_of_layers > 1) ||
+ (!cm->refresh_golden_frame && !cm->refresh_alt_ref_frame)))
{
cpi->ni_frames++;
@@ -4245,7 +4506,7 @@ static void encode_frame_to_data_rate
#endif
- // Set the count for maximum consequative dropped frames based upon the ratio of
+ // Set the count for maximum consecutive dropped frames based upon the ratio of
// this frame size to the target average per frame bandwidth.
// (cpi->av_per_frame_bandwidth > 0) is just a sanity check to prevent / 0.
if (cpi->drop_frames_allowed && (cpi->av_per_frame_bandwidth > 0))
@@ -4270,13 +4531,32 @@ static void encode_frame_to_data_rate
cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
// Actual bits spent
- cpi->total_actual_bits += cpi->projected_frame_size;
+ cpi->total_actual_bits += cpi->projected_frame_size;
// Debug stats
cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
cpi->buffer_level = cpi->bits_off_target;
+ // Propagate values to higher temporal layers
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
+ {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+            int bits_off_for_this_layer =
+                (int)(lc->target_bandwidth / lc->frame_rate
+                      - cpi->projected_frame_size);
+
+ lc->bits_off_target += bits_off_for_this_layer;
+
+ lc->total_actual_bits += cpi->projected_frame_size;
+ lc->total_target_vs_actual += bits_off_for_this_layer;
+ lc->buffer_level = lc->bits_off_target;
+ }
+ }
+
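
Continuing the example above: if a frame comes in at projected_frame_size =
10000 bits, layer 1 (200 kbps at 15 fps, a 13333-bit per-frame budget) is
credited 13333 - 10000 = 3333 bits, and layer 2 (400 kbps at 30 fps)
likewise 3333 bits, since every lower-layer frame is also part of each
higher layer's stream.
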
// Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
if (cm->frame_type == KEY_FRAME)
{
@@ -4322,7 +4602,7 @@ static void encode_frame_to_data_rate
vp8_clear_system_state(); //__asm emms;
- if (cpi->twopass.total_left_stats.coded_error != 0.0)
+ if (cpi->twopass.total_left_stats->coded_error != 0.0)
fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
"%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
"%10.3f %8d\n",
@@ -4340,9 +4620,9 @@ static void encode_frame_to_data_rate
cm->frame_type, cpi->gfu_boost,
cpi->twopass.est_max_qcorrection_factor,
(int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats.coded_error,
+ cpi->twopass.total_left_stats->coded_error,
(double)cpi->twopass.bits_left /
- cpi->twopass.total_left_stats.coded_error,
+ cpi->twopass.total_left_stats->coded_error,
cpi->tot_recode_hits);
else
fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
@@ -4362,7 +4642,7 @@ static void encode_frame_to_data_rate
cm->frame_type, cpi->gfu_boost,
cpi->twopass.est_max_qcorrection_factor,
(int)cpi->twopass.bits_left,
- cpi->twopass.total_left_stats.coded_error,
+ cpi->twopass.total_left_stats->coded_error,
cpi->tot_recode_hits);
fclose(f);
@@ -4675,7 +4955,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
cm->refresh_golden_frame = 0;
cm->refresh_last_frame = 0;
cm->show_frame = 0;
- cpi->source_alt_ref_pending = FALSE; // Clear Pending altf Ref flag.
+ cpi->source_alt_ref_pending = FALSE; // Clear Pending alt Ref flag.
cpi->is_src_frame_alt_ref = 0;
}
}
@@ -4727,6 +5007,13 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
return -1;
}
+ // Restore layer specific context if necessary
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ restore_layer_context (cpi,
+ cpi->oxcf.layer_id[cm->current_video_frame % cpi->oxcf.periodicity]);
+ }
+
if (cpi->source->ts_start < cpi->first_time_stamp_ever)
{
cpi->first_time_stamp_ever = cpi->source->ts_start;
@@ -4734,7 +5021,16 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
}
// adjust frame rates based on timestamps given
- if (!cm->refresh_alt_ref_frame)
+    if (cpi->oxcf.number_of_layers > 1)
+ {
+ vp8_new_frame_rate (
+ cpi, cpi->layer_context[cpi->current_layer].frame_rate);
+
+ cpi->last_time_stamp_seen = cpi->source->ts_start;
+ cpi->last_end_time_stamp_seen = cpi->source->ts_end;
+
+ }
+ else if (!cm->refresh_alt_ref_frame)
{
int64_t this_duration;
int step = 0;
@@ -4786,7 +5082,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
if (cpi->compressor_speed == 2)
{
- check_gf_quality(cpi);
+ if (cpi->oxcf.number_of_layers == 1)
+ check_gf_quality(cpi);
vpx_usec_timer_start(&tsctimer);
vpx_usec_timer_start(&ticktimer);
}
@@ -4893,6 +5190,10 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
}
+ // Save layer specific state
+ if (cpi->oxcf.number_of_layers > 1)
+ save_layer_context (cpi);
+
vpx_usec_timer_mark(&cmptimer);
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
@@ -4922,7 +5223,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
int y_samples = orig->y_height * orig->y_width ;
int uv_samples = orig->uv_height * orig->uv_width ;
int t_samples = y_samples + 2 * uv_samples;
- int64_t sq_error;
+ int64_t sq_error, sq_error2;
ye = calc_plane_error(orig->y_buffer, orig->y_stride,
recon->y_buffer, recon->y_stride, orig->y_width, orig->y_height,
@@ -4964,14 +5265,14 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
pp->v_buffer, pp->uv_stride, orig->uv_width, orig->uv_height,
IF_RTCD(&cpi->rtcd.variance));
- sq_error = ye + ue + ve;
+ sq_error2 = ye + ue + ve;
- frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error);
+ frame_psnr2 = vp8_mse2psnr(t_samples, 255.0, sq_error2);
cpi->totalp_y += vp8_mse2psnr(y_samples, 255.0, ye);
cpi->totalp_u += vp8_mse2psnr(uv_samples, 255.0, ue);
cpi->totalp_v += vp8_mse2psnr(uv_samples, 255.0, ve);
- cpi->total_sq_error2 += sq_error;
+ cpi->total_sq_error2 += sq_error2;
cpi->totalp += frame_psnr2;
frame_ssim2 = vp8_calc_ssim(cpi->Source,
@@ -4981,6 +5282,24 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
cpi->summed_quality += frame_ssim2 * weight;
cpi->summed_weights += weight;
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ for (i=cpi->current_layer;
+ i<cpi->oxcf.number_of_layers; i++)
+ {
+ cpi->frames_in_layer[i]++;
+
+ cpi->bytes_in_layer[i] += *size;
+ cpi->sum_psnr[i] += frame_psnr;
+ cpi->sum_psnr_p[i] += frame_psnr2;
+ cpi->total_error2[i] += sq_error;
+ cpi->total_error2_p[i] += sq_error2;
+ cpi->sum_ssim[i] += frame_ssim2 * weight;
+ cpi->sum_weights[i] += weight;
+ }
+ }
}
}
@@ -4989,10 +5308,30 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon
double y, u, v, frame_all;
frame_all = vp8_calc_ssimg(cpi->Source, cm->frame_to_show,
&y, &u, &v, IF_RTCD(&cpi->rtcd.variance));
- cpi->total_ssimg_y += y;
- cpi->total_ssimg_u += u;
- cpi->total_ssimg_v += v;
- cpi->total_ssimg_all += frame_all;
+
+ if (cpi->oxcf.number_of_layers > 1)
+ {
+ int i;
+
+ for (i=cpi->current_layer;
+ i<cpi->oxcf.number_of_layers; i++)
+ {
+ if (!cpi->b_calculate_psnr)
+ cpi->frames_in_layer[i]++;
+
+ cpi->total_ssimg_y_in_layer[i] += y;
+ cpi->total_ssimg_u_in_layer[i] += u;
+ cpi->total_ssimg_v_in_layer[i] += v;
+ cpi->total_ssimg_all_in_layer[i] += frame_all;
+ }
+ }
+ else
+ {
+ cpi->total_ssimg_y += y;
+ cpi->total_ssimg_u += u;
+ cpi->total_ssimg_v += v;
+ cpi->total_ssimg_all += frame_all;
+ }
}
}
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index ee519fad0..6678c15fb 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -56,6 +56,8 @@
#define VP8_TEMPORAL_ALT_REF 1
#endif
+#define MAX_PERIODICITY 16
+
typedef struct
{
int kf_indicated;
@@ -238,6 +240,52 @@ enum
BLOCK_MAX_SEGMENTS
};
+typedef struct
+{
+ // Layer configuration
+ double frame_rate;
+ int target_bandwidth;
+
+ // Layer specific coding parameters
+ int starting_buffer_level;
+ int optimal_buffer_level;
+ int maximum_buffer_size;
+
+ int avg_frame_size_for_layer;
+
+ int buffer_level;
+ int bits_off_target;
+
+ long long total_actual_bits;
+ int total_target_vs_actual;
+
+ int worst_quality;
+ int active_worst_quality;
+ int best_quality;
+ int active_best_quality;
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex;
+
+ double rate_correction_factor;
+ double key_frame_rate_correction_factor;
+ double gf_rate_correction_factor;
+
+ int zbin_over_quant;
+
+ int inter_frame_target;
+ INT64 total_byte_count;
+
+ int filter_level;
+
+ int last_frame_percent_intra;
+
+ int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+} LAYER_CONTEXT;
+
typedef struct VP8_COMP
{
@@ -368,7 +416,7 @@ typedef struct VP8_COMP
int buffered_mode;
- int buffer_level;
+ int64_t buffer_level;
int bits_off_target;
int rolling_target_bits;
@@ -610,6 +658,25 @@ typedef struct VP8_COMP
int force_next_frame_intra; /* force next frame to intra when kf_auto says so */
int droppable;
+
+ // Coding layer state variables
+ unsigned int current_layer;
+ LAYER_CONTEXT layer_context[MAX_LAYERS];
+
+ long long frames_in_layer[MAX_LAYERS];
+ long long bytes_in_layer[MAX_LAYERS];
+ double sum_psnr[MAX_LAYERS];
+ double sum_psnr_p[MAX_LAYERS];
+ double total_error2[MAX_LAYERS];
+ double total_error2_p[MAX_LAYERS];
+ double sum_ssim[MAX_LAYERS];
+ double sum_weights[MAX_LAYERS];
+
+ double total_ssimg_y_in_layer[MAX_LAYERS];
+ double total_ssimg_u_in_layer[MAX_LAYERS];
+ double total_ssimg_v_in_layer[MAX_LAYERS];
+ double total_ssimg_all_in_layer[MAX_LAYERS];
+
} VP8_COMP;
void control_data_rate(VP8_COMP *cpi);
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 1e602138f..62e644dea 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -157,7 +157,7 @@ static int pick_intra4x4block(
rate = mode_costs[mode];
RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
- (b, mode, b->predictor);
+ (b, mode, b->predictor, 16);
distortion = get_prediction_error(be, b, &rtcd->variance);
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
@@ -471,7 +471,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
else
skip_mode[GOLDEN_FRAME] = 1;
- if (cpi->ref_frame_flags & VP8_ALT_FLAG && cpi->source_alt_ref_active)
+ if ((cpi->ref_frame_flags & VP8_ALT_FLAG) &&
+ (cpi->source_alt_ref_active || cpi->oxcf.number_of_layers > 1))
{
YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];
y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 46e1d9dd9..1ac905021 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -436,7 +436,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi)
}
-// Do the best we can to define the parameteres for the next GF based on what information we have available.
+// Do the best we can to define the parameters for the next GF based on what
+// information we have available.
static void calc_gf_params(VP8_COMP *cpi)
{
int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
@@ -607,6 +608,11 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
{
int min_frame_target;
int Adjustment;
+ int old_per_frame_bandwidth = cpi->per_frame_bandwidth;
+
+    if (cpi->current_layer > 0)
+ cpi->per_frame_bandwidth =
+ cpi->layer_context[cpi->current_layer].avg_frame_size_for_layer;
min_frame_target = 0;
@@ -622,7 +628,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
// Special alt reference frame case
- if (cpi->common.refresh_alt_ref_frame)
+ if((cpi->common.refresh_alt_ref_frame) && (cpi->oxcf.number_of_layers == 1))
{
if (cpi->pass == 2)
{
@@ -789,7 +795,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
// Decide whether or not we need to adjust the frame data rate target.
//
// If we are are below the optimal buffer fullness level and adherence
- // to buffering contraints is important to the end useage then adjust
+ // to buffering constraints is important to the end usage then adjust
// the per frame target.
if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
(cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
@@ -812,12 +818,12 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
percent_low = 0;
// lower the target bandwidth for this frame.
- cpi->this_frame_target -= (cpi->this_frame_target * percent_low)
- / 200;
+ cpi->this_frame_target -=
+ (cpi->this_frame_target * percent_low) / 200;
        // Are we allowing control of active_worst_allowed_q
// according to buffer level.
- if (cpi->auto_worst_q)
+ if (cpi->auto_worst_q && cpi->ni_frames > 150)
{
int critical_buffer_level;
@@ -834,7 +840,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
(cpi->buffer_level < cpi->bits_off_target)
? cpi->buffer_level : cpi->bits_off_target;
}
- // For local file playback short term buffering contraints
+ // For local file playback short term buffering constraints
// are less of an issue
else
{
@@ -905,11 +911,11 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
percent_high = 0;
cpi->this_frame_target += (cpi->this_frame_target *
- percent_high) / 200;
-
+ percent_high) / 200;
- // Are we allowing control of active_worst_allowed_q according to bufferl level.
- if (cpi->auto_worst_q)
+ // Are we allowing control of active_worst_allowed_q according
+ // to buffer level.
+ if (cpi->auto_worst_q && cpi->ni_frames > 150)
{
// When using the relaxed buffer model stick to the user specified value
cpi->active_worst_quality = cpi->ni_av_qi;
@@ -1112,6 +1118,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
}
}
+
+ cpi->per_frame_bandwidth = old_per_frame_bandwidth;
}
@@ -1421,8 +1429,14 @@ void vp8_adjust_key_frame_context(VP8_COMP *cpi)
* bits allocated than those following other gfs.
*/
overspend = (cpi->projected_frame_size - cpi->per_frame_bandwidth);
- cpi->kf_overspend_bits += overspend * 7 / 8;
- cpi->gf_overspend_bits += overspend * 1 / 8;
+
+ if (cpi->oxcf.number_of_layers > 1)
+ cpi->kf_overspend_bits += overspend;
+ else
+ {
+ cpi->kf_overspend_bits += overspend * 7 / 8;
+ cpi->gf_overspend_bits += overspend * 1 / 8;
+ }
/* Work out how much to try and recover per frame. */
cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits
@@ -1452,7 +1466,9 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
}
else
{
- if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+ if (cpi->oxcf.number_of_layers > 1 ||
+ cpi->common.refresh_alt_ref_frame ||
+ cpi->common.refresh_golden_frame)
{
*frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
*frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 124cfe564..fdb519c19 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -631,7 +631,7 @@ static int rd_pick_intra4x4block(
rate = bmode_costs[mode];
RECON_INVOKE(&cpi->rtcd.common->recon, intra4x4_predict)
- (b, mode, b->predictor);
+ (b, mode, b->predictor, 16);
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
x->quantize_b(be, b);
@@ -660,8 +660,8 @@ static int rd_pick_intra4x4block(
}
b->bmi.as_mode = (B_PREDICTION_MODE)(*best_mode);
- IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32);
- RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff,
+ best_predictor, 16, *(b->base_dst) + b->dst, b->dst_stride);
return best_rd;
}
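
The same fusion applied in encodemb.c shows up here: the two-step sequence
of an IDCT into b->diff followed by a separate recon add becomes a single
call that reads the predictor at stride 16 and writes the reconstructed
pixels to the destination. Schematically (a sketch using the names above):

    /* before: two passes over the block */
    idct16(best_dqcoeff, b->diff, 32);               /* diff = IDCT(q)    */
    recon(best_predictor, b->diff, dst, dst_stride); /* dst = pred + diff */

    /* after: one fused pass */
    idct16(best_dqcoeff, best_predictor, 16, dst, dst_stride);
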
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 5f2e6a354..85b8113d7 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -64,7 +64,6 @@ VP8_COMMON_SRCS-yes += common/mbpitch.c
VP8_COMMON_SRCS-yes += common/modecont.c
VP8_COMMON_SRCS-yes += common/modecontext.c
VP8_COMMON_SRCS-yes += common/quant_common.c
-VP8_COMMON_SRCS-yes += common/recon.c
VP8_COMMON_SRCS-yes += common/reconinter.c
VP8_COMMON_SRCS-yes += common/reconintra.c
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
@@ -125,7 +124,6 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/recon_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM)
@@ -143,16 +141,10 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilter_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilter_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon2b_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon4b_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/reconb_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_1_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict4x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x4_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM)
-VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index ca4e505dc..f8336240c 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -218,6 +218,25 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
}
#endif
+ RANGE_CHECK(cfg, ts_number_layers, 1, 5);
+
+ if (cfg->ts_number_layers > 1)
+ {
+ int i;
+ RANGE_CHECK_HI(cfg, ts_periodicity, 16);
+
+ for (i=1; i<cfg->ts_number_layers; i++)
+ if (cfg->ts_target_bitrate[i] <= cfg->ts_target_bitrate[i-1])
+ ERROR("ts_target_bitrate entries are not strictly increasing");
+
+ RANGE_CHECK(cfg, ts_rate_decimator[cfg->ts_number_layers-1], 1, 1);
+ for (i=cfg->ts_number_layers-2; i>0; i--)
+ if (cfg->ts_rate_decimator[i-1] != 2*cfg->ts_rate_decimator[i])
+ ERROR("ts_rate_decimator factors are not powers of 2");
+
+        for (i=0; i<cfg->ts_periodicity; i++)
+            RANGE_CHECK_HI(cfg, ts_layer_id[i], cfg->ts_number_layers-1);
+ }
+
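
A hypothetical three-layer configuration that satisfies all of these checks
(field names are the ts_* members referenced above; the 0-2-1-2 pattern
repeats every four frames):

    cfg.ts_number_layers     = 3;
    cfg.ts_periodicity       = 4;
    cfg.ts_target_bitrate[0] = 100;  /* kbps, strictly increasing   */
    cfg.ts_target_bitrate[1] = 200;
    cfg.ts_target_bitrate[2] = 400;
    cfg.ts_rate_decimator[0] = 4;    /* halves per layer, ends at 1 */
    cfg.ts_rate_decimator[1] = 2;
    cfg.ts_rate_decimator[2] = 1;
    cfg.ts_layer_id[0] = 0;          /* base   */
    cfg.ts_layer_id[1] = 2;          /* top    */
    cfg.ts_layer_id[2] = 1;          /* middle */
    cfg.ts_layer_id[3] = 2;          /* top    */
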
return VPX_CODEC_OK;
}
@@ -253,14 +272,15 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
oxcf->Width = cfg.g_w;
oxcf->Height = cfg.g_h;
/* guess a frame rate if out of whack, use 30 */
- oxcf->frame_rate = (double)(cfg.g_timebase.den) / (double)(cfg.g_timebase.num);
+ oxcf->frame_rate = (double)(cfg.g_timebase.den) /
+ (double)(cfg.g_timebase.num);
if (oxcf->frame_rate > 180)
{
oxcf->frame_rate = 30;
}
- oxcf->error_resilient_mode = cfg.g_error_resilient;
+ oxcf->error_resilient_mode = cfg.g_error_resilient;
switch (cfg.g_pass)
{
@@ -277,13 +297,13 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
if (cfg.g_pass == VPX_RC_FIRST_PASS)
{
- oxcf->allow_lag = 0;
- oxcf->lag_in_frames = 0;
+ oxcf->allow_lag = 0;
+ oxcf->lag_in_frames = 0;
}
else
{
- oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
- oxcf->lag_in_frames = cfg.g_lag_in_frames;
+ oxcf->allow_lag = (cfg.g_lag_in_frames) > 0;
+ oxcf->lag_in_frames = cfg.g_lag_in_frames;
}
oxcf->allow_df = (cfg.rc_dropframe_thresh > 0);
@@ -295,59 +315,71 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
if (cfg.rc_end_usage == VPX_VBR)
{
- oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
+ oxcf->end_usage = USAGE_LOCAL_FILE_PLAYBACK;
}
else if (cfg.rc_end_usage == VPX_CBR)
{
- oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+ oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
}
else if (cfg.rc_end_usage == VPX_CQ)
{
- oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
+ oxcf->end_usage = USAGE_CONSTRAINED_QUALITY;
}
- oxcf->target_bandwidth = cfg.rc_target_bitrate;
+ oxcf->target_bandwidth = cfg.rc_target_bitrate;
oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct;
- oxcf->best_allowed_q = cfg.rc_min_quantizer;
- oxcf->worst_allowed_q = cfg.rc_max_quantizer;
- oxcf->cq_level = vp8_cfg.cq_level;
+ oxcf->best_allowed_q = cfg.rc_min_quantizer;
+ oxcf->worst_allowed_q = cfg.rc_max_quantizer;
+ oxcf->cq_level = vp8_cfg.cq_level;
oxcf->fixed_q = -1;
- oxcf->under_shoot_pct = cfg.rc_undershoot_pct;
- oxcf->over_shoot_pct = cfg.rc_overshoot_pct;
+ oxcf->under_shoot_pct = cfg.rc_undershoot_pct;
+ oxcf->over_shoot_pct = cfg.rc_overshoot_pct;
- oxcf->maximum_buffer_size = cfg.rc_buf_sz;
- oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
- oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
+ oxcf->maximum_buffer_size = cfg.rc_buf_sz;
+ oxcf->starting_buffer_level = cfg.rc_buf_initial_sz;
+ oxcf->optimal_buffer_level = cfg.rc_buf_optimal_sz;
- oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrbias = cfg.rc_2pass_vbr_bias_pct;
oxcf->two_pass_vbrmin_section = cfg.rc_2pass_vbr_minsection_pct;
oxcf->two_pass_vbrmax_section = cfg.rc_2pass_vbr_maxsection_pct;
- oxcf->auto_key = cfg.kf_mode == VPX_KF_AUTO
- && cfg.kf_min_dist != cfg.kf_max_dist;
- //oxcf->kf_min_dist = cfg.kf_min_dis;
- oxcf->key_freq = cfg.kf_max_dist;
+ oxcf->auto_key = cfg.kf_mode == VPX_KF_AUTO
+ && cfg.kf_min_dist != cfg.kf_max_dist;
+ //oxcf->kf_min_dist = cfg.kf_min_dis;
+ oxcf->key_freq = cfg.kf_max_dist;
+
+ oxcf->number_of_layers = cfg.ts_number_layers;
+ oxcf->periodicity = cfg.ts_periodicity;
+
+ if (oxcf->number_of_layers > 1)
+ {
+ memcpy (oxcf->target_bitrate, cfg.ts_target_bitrate,
+ sizeof(cfg.ts_target_bitrate));
+ memcpy (oxcf->rate_decimator, cfg.ts_rate_decimator,
+ sizeof(cfg.ts_rate_decimator));
+ memcpy (oxcf->layer_id, cfg.ts_layer_id, sizeof(cfg.ts_layer_id));
+ }
//oxcf->delete_first_pass_file = cfg.g_delete_firstpassfile;
//strcpy(oxcf->first_pass_file, cfg.g_firstpass_file);
- oxcf->cpu_used = vp8_cfg.cpu_used;
- oxcf->encode_breakout = vp8_cfg.static_thresh;
- oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
- oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
- oxcf->Sharpness = vp8_cfg.Sharpness;
- oxcf->token_partitions = vp8_cfg.token_partitions;
+ oxcf->cpu_used = vp8_cfg.cpu_used;
+ oxcf->encode_breakout = vp8_cfg.static_thresh;
+ oxcf->play_alternate = vp8_cfg.enable_auto_alt_ref;
+ oxcf->noise_sensitivity = vp8_cfg.noise_sensitivity;
+ oxcf->Sharpness = vp8_cfg.Sharpness;
+ oxcf->token_partitions = vp8_cfg.token_partitions;
- oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
- oxcf->output_pkt_list = vp8_cfg.pkt_list;
+ oxcf->two_pass_stats_in = cfg.rc_twopass_stats_in;
+ oxcf->output_pkt_list = vp8_cfg.pkt_list;
- oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
- oxcf->arnr_strength = vp8_cfg.arnr_strength;
- oxcf->arnr_type = vp8_cfg.arnr_type;
+ oxcf->arnr_max_frames = vp8_cfg.arnr_max_frames;
+ oxcf->arnr_strength = vp8_cfg.arnr_strength;
+ oxcf->arnr_type = vp8_cfg.arnr_type;
- oxcf->tuning = vp8_cfg.tuning;
+ oxcf->tuning = vp8_cfg.tuning;
/*
printf("Current VP8 Settings: \n");
@@ -515,7 +547,7 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx)
cfg = &ctx->priv->alg_priv->cfg;
- /* Select the extra vp6 configuration table based on the current
+ /* Select the extra vp8 configuration table based on the current
* usage value. If the current usage value isn't found, use the
* values for usage case 0.
*/
@@ -1143,6 +1175,12 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
1, /* g_delete_first_pass_file */
"vp8.fpf" /* first pass filename */
#endif
+
+ 1, /* ts_number_layers */
+ {0}, /* ts_target_bitrate */
+ {0}, /* ts_rate_decimator */
+ 0, /* ts_periodicity */
+ {0}, /* ts_layer_id */
}},
{ -1, {NOT_IMPLEMENTED}}
};