From fcccbcbb395ce4cf31b54ce1245cc28e5e3ef4c1 Mon Sep 17 00:00:00 2001
From: John Koleszar
Date: Tue, 27 Nov 2012 13:59:17 -0800
Subject: Add vp9_ prefix to all vp9 files

Support for gyp, which doesn't allow multiple objects in the same static
library to share a basename.

Change-Id: Ib947eefbaf68f8b177a796d23f875ccdfa6bc9dc
---
 vp9/common/alloccommon.c | 223 ---
 vp9/common/alloccommon.h | 26 -
 vp9/common/arm/arm_systemdependent.c | 91 -
 vp9/common/arm/armv6/bilinearfilter_v6.asm | 237 ---
 vp9/common/arm/armv6/copymem16x16_v6.asm | 186 --
 vp9/common/arm/armv6/copymem8x4_v6.asm | 128 --
 vp9/common/arm/armv6/copymem8x8_v6.asm | 128 --
 vp9/common/arm/armv6/dc_only_idct_add_v6.asm | 67 -
 vp9/common/arm/armv6/filter_v6.asm | 624 -------
 vp9/common/arm/armv6/idct_v6.asm | 345 ----
 vp9/common/arm/armv6/iwalsh_v6.asm | 152 --
 vp9/common/arm/armv6/loopfilter_v6.asm | 1282 --------------
 vp9/common/arm/armv6/recon_v6.asm | 281 ---
 vp9/common/arm/armv6/simpleloopfilter_v6.asm | 286 ----
 vp9/common/arm/armv6/sixtappredict8x4_v6.asm | 273 ---
 vp9/common/arm/armv6/vp9_bilinearfilter_v6.asm | 237 +++
 vp9/common/arm/armv6/vp9_copymem16x16_v6.asm | 186 ++
 vp9/common/arm/armv6/vp9_copymem8x4_v6.asm | 128 ++
 vp9/common/arm/armv6/vp9_copymem8x8_v6.asm | 128 ++
 vp9/common/arm/armv6/vp9_dc_only_idct_add_v6.asm | 67 +
 vp9/common/arm/armv6/vp9_filter_v6.asm | 624 +++++++
 vp9/common/arm/armv6/vp9_idct_v6.asm | 345 ++++
 vp9/common/arm/armv6/vp9_iwalsh_v6.asm | 152 ++
 vp9/common/arm/armv6/vp9_loopfilter_v6.asm | 1282 ++++++++++++++
 vp9/common/arm/armv6/vp9_recon_v6.asm | 281 +++
 vp9/common/arm/armv6/vp9_simpleloopfilter_v6.asm | 286 ++++
 vp9/common/arm/armv6/vp9_sixtappredict8x4_v6.asm | 273 +++
 vp9/common/arm/bilinearfilter_arm.c | 108 --
 vp9/common/arm/bilinearfilter_arm.h | 35 -
 vp9/common/arm/filter_arm.c | 198 ---
 vp9/common/arm/idct_arm.h | 65 -
 vp9/common/arm/loopfilter_arm.c | 166 --
 vp9/common/arm/loopfilter_arm.h | 41 -
 vp9/common/arm/neon/bilinearpredict16x16_neon.asm | 357 ----
 vp9/common/arm/neon/bilinearpredict4x4_neon.asm | 130 --
 vp9/common/arm/neon/bilinearpredict8x4_neon.asm | 135 --
 vp9/common/arm/neon/bilinearpredict8x8_neon.asm | 183 --
 .../arm/neon/buildintrapredictorsmby_neon.asm | 584 -------
 vp9/common/arm/neon/copymem16x16_neon.asm | 59 -
 vp9/common/arm/neon/copymem8x4_neon.asm | 34 -
 vp9/common/arm/neon/copymem8x8_neon.asm | 43 -
 vp9/common/arm/neon/dc_only_idct_add_neon.asm | 49 -
 vp9/common/arm/neon/iwalsh_neon.asm | 80 -
 vp9/common/arm/neon/loopfilter_neon.asm | 397 -----
 .../neon/loopfiltersimplehorizontaledge_neon.asm | 117 --
 .../arm/neon/loopfiltersimpleverticaledge_neon.asm | 154 --
 vp9/common/arm/neon/mbloopfilter_neon.asm | 469 -----
 vp9/common/arm/neon/recon16x16mb_neon.asm | 131 --
 vp9/common/arm/neon/recon2b_neon.asm | 54 -
 vp9/common/arm/neon/recon4b_neon.asm | 69 -
 vp9/common/arm/neon/recon_neon.c | 29 -
 vp9/common/arm/neon/reconb_neon.asm | 61 -
 vp9/common/arm/neon/save_neon_reg.asm | 36 -
 vp9/common/arm/neon/shortidct4x4llm_1_neon.asm | 67 -
 vp9/common/arm/neon/shortidct4x4llm_neon.asm | 122 --
 vp9/common/arm/neon/sixtappredict16x16_neon.asm | 490 ------
 vp9/common/arm/neon/sixtappredict4x4_neon.asm | 422 -----
 vp9/common/arm/neon/sixtappredict8x4_neon.asm | 473 ------
 vp9/common/arm/neon/sixtappredict8x8_neon.asm | 524 ------
 .../arm/neon/vp9_bilinearpredict16x16_neon.asm | 357 ++++
 .../arm/neon/vp9_bilinearpredict4x4_neon.asm | 130 ++
 .../arm/neon/vp9_bilinearpredict8x4_neon.asm | 135 ++
 .../arm/neon/vp9_bilinearpredict8x8_neon.asm | 183 ++
 .../arm/neon/vp9_buildintrapredictorsmby_neon.asm | 584 +++++++
 vp9/common/arm/neon/vp9_copymem16x16_neon.asm | 59 +
 vp9/common/arm/neon/vp9_copymem8x4_neon.asm | 34 +
 vp9/common/arm/neon/vp9_copymem8x8_neon.asm | 43 +
 vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm | 49 +
 vp9/common/arm/neon/vp9_iwalsh_neon.asm | 80 +
 vp9/common/arm/neon/vp9_loopfilter_neon.asm | 397 +++++
 .../vp9_loopfiltersimplehorizontaledge_neon.asm | 117 ++
 .../neon/vp9_loopfiltersimpleverticaledge_neon.asm | 154 ++
 vp9/common/arm/neon/vp9_mbloopfilter_neon.asm | 469 +++++
 vp9/common/arm/neon/vp9_recon16x16mb_neon.asm | 131 ++
 vp9/common/arm/neon/vp9_recon2b_neon.asm | 54 +
 vp9/common/arm/neon/vp9_recon4b_neon.asm | 69 +
 vp9/common/arm/neon/vp9_recon_neon.c | 29 +
 vp9/common/arm/neon/vp9_reconb_neon.asm | 61 +
 vp9/common/arm/neon/vp9_save_neon_reg.asm | 36 +
 vp9/common/arm/neon/vp9_shortidct4x4llm_1_neon.asm | 67 +
 vp9/common/arm/neon/vp9_shortidct4x4llm_neon.asm | 122 ++
 .../arm/neon/vp9_sixtappredict16x16_neon.asm | 490 ++++++
 vp9/common/arm/neon/vp9_sixtappredict4x4_neon.asm | 422 +++++
 vp9/common/arm/neon/vp9_sixtappredict8x4_neon.asm | 473 ++++++
 vp9/common/arm/neon/vp9_sixtappredict8x8_neon.asm | 524 ++++++
 vp9/common/arm/recon_arm.h | 90 -
 vp9/common/arm/reconintra_arm.c | 62 -
 vp9/common/arm/subpixel_arm.h | 89 -
 vp9/common/arm/vp9_arm_systemdependent.c | 91 +
 vp9/common/arm/vp9_bilinearfilter_arm.c | 108 ++
 vp9/common/arm/vp9_bilinearfilter_arm.h | 35 +
 vp9/common/arm/vp9_filter_arm.c | 198 +++
 vp9/common/arm/vp9_idct_arm.h | 65 +
 vp9/common/arm/vp9_loopfilter_arm.c | 166 ++
 vp9/common/arm/vp9_loopfilter_arm.h | 41 +
 vp9/common/arm/vp9_recon_arm.h | 90 +
 vp9/common/arm/vp9_reconintra_arm.c | 62 +
 vp9/common/arm/vp9_subpixel_arm.h | 89 +
 vp9/common/asm_com_offsets.c | 40 -
 vp9/common/blockd.c | 29 -
 vp9/common/blockd.h | 568 -------
 vp9/common/coefupdateprobs.h | 16 -
 vp9/common/common.h | 41 -
 vp9/common/common_types.h | 18 -
 vp9/common/context.c | 397 -----
 vp9/common/debugmodes.c | 146 --
 vp9/common/default_coef_probs.h | 1377 ---------------
 vp9/common/entropy.c | 447 -----
 vp9/common/entropy.h | 114 --
 vp9/common/entropymode.c | 713 --------
 vp9/common/entropymode.h | 117 --
 vp9/common/entropymv.c | 469 -----
 vp9/common/entropymv.h | 129 --
 vp9/common/extend.c | 169 --
 vp9/common/extend.h | 27 -
 vp9/common/filter.c | 1159 -------------
 vp9/common/filter.h | 28 -
 vp9/common/findnearmv.c | 296 ----
 vp9/common/findnearmv.h | 183 --
 vp9/common/generic/systemdependent.c | 46 -
 vp9/common/generic/vp9_systemdependent.c | 46 +
 vp9/common/header.h | 42 -
 vp9/common/idctllm.c | 1784 --------------------
 vp9/common/implicit_segmentation.c | 255 ---
 vp9/common/invtrans.c | 120 --
 vp9/common/invtrans.h | 41 -
 vp9/common/loopfilter.c | 524 ------
 vp9/common/loopfilter.h | 104 --
 vp9/common/loopfilter_filters.c | 480 ------
 vp9/common/maskingmv.c | 806 ---------
 vp9/common/mbpitch.c | 124 --
 vp9/common/modecont.c | 31 -
 vp9/common/modecont.h | 17 -
 vp9/common/modecontext.c | 147 --
 vp9/common/mv.h | 26 -
 vp9/common/mvref_common.c | 398 -----
 vp9/common/mvref_common.h | 28 -
 vp9/common/onyx.h | 225 ---
 vp9/common/onyxc_int.h | 323 ----
 vp9/common/onyxd.h | 68 -
 vp9/common/postproc.c | 1031 ----------
 vp9/common/postproc.h | 128 --
 vp9/common/ppc/copy_altivec.asm | 47 -
 vp9/common/ppc/filter_altivec.asm | 1013 ----------
 vp9/common/ppc/filter_bilinear_altivec.asm | 677 --------
 vp9/common/ppc/idctllm_altivec.asm | 189 ---
 vp9/common/ppc/loopfilter_altivec.c | 127 --
 vp9/common/ppc/loopfilter_filters_altivec.asm | 1253 --------------
 vp9/common/ppc/platform_altivec.asm | 59 -
 vp9/common/ppc/recon_altivec.asm | 175 --
 vp9/common/ppc/systemdependent.c | 166 --
 vp9/common/ppc/vp9_copy_altivec.asm | 47 +
 vp9/common/ppc/vp9_filter_altivec.asm | 1013 ++++++++++
 vp9/common/ppc/vp9_filter_bilinear_altivec.asm | 677 ++++++++
 vp9/common/ppc/vp9_idctllm_altivec.asm | 189 +++
 vp9/common/ppc/vp9_loopfilter_altivec.c | 127 ++
 vp9/common/ppc/vp9_loopfilter_filters_altivec.asm | 1253 ++++++++++++++
 vp9/common/ppc/vp9_platform_altivec.asm | 59 +
 vp9/common/ppc/vp9_recon_altivec.asm | 175 ++
 vp9/common/ppc/vp9_systemdependent.c | 166 ++
 vp9/common/ppflags.h | 38 -
 vp9/common/pragmas.h | 19 -
 vp9/common/pred_common.c | 463 -----
 vp9/common/pred_common.h | 56 -
 vp9/common/quant_common.c | 125 --
 vp9/common/quant_common.h | 22 -
 vp9/common/recon.c | 197 ---
 vp9/common/reconinter.c | 1140 -------------
 vp9/common/reconinter.h | 78 -
 vp9/common/reconintra.c | 819 ---------
 vp9/common/reconintra.h | 36 -
 vp9/common/reconintra4x4.c | 472 ------
 vp9/common/reconintra4x4.h | 17 -
 vp9/common/rtcd.c | 21 -
 vp9/common/rtcd_defs.sh | 689 --------
 vp9/common/sadmxn.h | 37 -
 vp9/common/seg_common.c | 104 --
 vp9/common/seg_common.h | 64 -
 vp9/common/setupintrarecon.c | 31 -
 vp9/common/setupintrarecon.h | 13 -
 vp9/common/subpelvar.h | 147 --
 vp9/common/subpixel.h | 21 -
 vp9/common/swapyv12buffer.c | 32 -
 vp9/common/swapyv12buffer.h | 19 -
 vp9/common/systemdependent.h | 21 -
 vp9/common/tapify.py | 106 --
 vp9/common/textblit.c | 117 --
 vp9/common/textblit.h | 19 -
 vp9/common/treecoder.c | 138 --
 vp9/common/treecoder.h | 83 -
 vp9/common/type_aliases.h | 120 --
 vp9/common/vp9_alloccommon.c | 223 +++
 vp9/common/vp9_alloccommon.h | 26 +
 vp9/common/vp9_asm_com_offsets.c | 40 +
 vp9/common/vp9_blockd.c | 29 +
 vp9/common/vp9_blockd.h | 568 +++++++
 vp9/common/vp9_coefupdateprobs.h | 16 +
 vp9/common/vp9_common.h | 41 +
 vp9/common/vp9_common_types.h | 18 +
 vp9/common/vp9_context.c | 397 +++++
 vp9/common/vp9_debugmodes.c | 146 ++
 vp9/common/vp9_default_coef_probs.h | 1377 +++++++++++++++
 vp9/common/vp9_entropy.c | 447 +++++
 vp9/common/vp9_entropy.h | 114 ++
 vp9/common/vp9_entropymode.c | 713 ++++++++
 vp9/common/vp9_entropymode.h | 117 ++
 vp9/common/vp9_entropymv.c | 469 +++++
 vp9/common/vp9_entropymv.h | 129 ++
 vp9/common/vp9_extend.c | 169 ++
 vp9/common/vp9_extend.h | 27 +
 vp9/common/vp9_filter.c | 1159 +++++++++++++
 vp9/common/vp9_filter.h | 28 +
 vp9/common/vp9_findnearmv.c | 296 ++++
 vp9/common/vp9_findnearmv.h | 183 ++
 vp9/common/vp9_header.h | 42 +
 vp9/common/vp9_idctllm.c | 1784 ++++++++++++++++++++
 vp9/common/vp9_implicit_segmentation.c | 255 +++
 vp9/common/vp9_invtrans.c | 120 ++
 vp9/common/vp9_invtrans.h | 41 +
 vp9/common/vp9_loopfilter.c | 524 ++++++
 vp9/common/vp9_loopfilter.h | 104 ++
 vp9/common/vp9_loopfilter_filters.c | 480 ++++++
 vp9/common/vp9_maskingmv.c | 806 +++++++++
 vp9/common/vp9_mbpitch.c | 124 ++
 vp9/common/vp9_modecont.c | 31 +
 vp9/common/vp9_modecont.h | 17 +
 vp9/common/vp9_modecontext.c | 147 ++
 vp9/common/vp9_mv.h | 26 +
 vp9/common/vp9_mvref_common.c | 398 +++++
 vp9/common/vp9_mvref_common.h | 28 +
 vp9/common/vp9_onyx.h | 225 +++
 vp9/common/vp9_onyxc_int.h | 323 ++++
 vp9/common/vp9_onyxd.h | 68 +
 vp9/common/vp9_postproc.c | 1031 +++++++++++
 vp9/common/vp9_postproc.h | 128 ++
 vp9/common/vp9_ppflags.h | 38 +
 vp9/common/vp9_pragmas.h | 19 +
 vp9/common/vp9_pred_common.c | 463 +++++
 vp9/common/vp9_pred_common.h | 56 +
 vp9/common/vp9_quant_common.c | 125 ++
 vp9/common/vp9_quant_common.h | 22 +
 vp9/common/vp9_recon.c | 197 +++
 vp9/common/vp9_reconinter.c | 1140 +++++++++++++
 vp9/common/vp9_reconinter.h | 78 +
 vp9/common/vp9_reconintra.c | 819 +++++++++
 vp9/common/vp9_reconintra.h | 36 +
 vp9/common/vp9_reconintra4x4.c | 472 ++++++
 vp9/common/vp9_reconintra4x4.h | 17 +
 vp9/common/vp9_rtcd.c | 21 +
 vp9/common/vp9_rtcd_defs.sh | 689 ++++++++
 vp9/common/vp9_sadmxn.h | 37 +
 vp9/common/vp9_seg_common.c | 104 ++
 vp9/common/vp9_seg_common.h | 64 +
 vp9/common/vp9_setupintrarecon.c | 31 +
 vp9/common/vp9_setupintrarecon.h | 13 +
 vp9/common/vp9_subpelvar.h | 147 ++
 vp9/common/vp9_subpixel.h | 21 +
 vp9/common/vp9_swapyv12buffer.c | 32 +
 vp9/common/vp9_swapyv12buffer.h | 19 +
 vp9/common/vp9_systemdependent.h | 21 +
 vp9/common/vp9_tapify.py | 106 ++
 vp9/common/vp9_textblit.c | 117 ++
 vp9/common/vp9_textblit.h | 19 +
 vp9/common/vp9_treecoder.c | 138 ++
 vp9/common/vp9_treecoder.h | 83 +
 vp9/common/vp9_type_aliases.h | 120 ++
 vp9/common/x86/filter_sse2.c | 289 ----
 vp9/common/x86/filter_sse4.c | 362 ----
 vp9/common/x86/idct_x86.h | 64 -
 vp9/common/x86/idctllm_mmx.asm | 241 ---
 vp9/common/x86/idctllm_sse2.asm | 712 --------
 vp9/common/x86/iwalsh_mmx.asm | 173 --
 vp9/common/x86/iwalsh_sse2.asm | 119 --
 vp9/common/x86/loopfilter_mmx.asm | 969 ----------
 vp9/common/x86/loopfilter_sse2.asm | 1238 --------------
 vp9/common/x86/loopfilter_x86.c | 547 ------
 vp9/common/x86/loopfilter_x86.h | 43 -
 vp9/common/x86/mask_sse3.asm | 484 ------
 vp9/common/x86/postproc_mmx.asm | 534 ------
 vp9/common/x86/postproc_sse2.asm | 695 --------
 vp9/common/x86/postproc_x86.h | 64 -
 vp9/common/x86/recon_mmx.asm | 321 ----
 vp9/common/x86/recon_sse2.asm | 688 --------
 vp9/common/x86/recon_wrapper_sse2.c | 101 --
 vp9/common/x86/sadmxn_x86.c | 99 --
 vp9/common/x86/subpixel_8t_ssse3.asm | 550 ------
 vp9/common/x86/subpixel_mmx.asm | 727 --------
 vp9/common/x86/subpixel_sse2.asm | 1372 ---------------
 vp9/common/x86/subpixel_ssse3.asm | 1515 -----------------
 vp9/common/x86/subpixel_x86.h | 122 --
 vp9/common/x86/vp8_asm_stubs.c | 602 -------
 vp9/common/x86/vp9_filter_sse2.c | 289 ++++
 vp9/common/x86/vp9_filter_sse4.c | 362 ++++
 vp9/common/x86/vp9_idct_x86.h | 64 +
 vp9/common/x86/vp9_idctllm_mmx.asm | 241 +++
 vp9/common/x86/vp9_idctllm_sse2.asm | 712 ++++++++
 vp9/common/x86/vp9_iwalsh_mmx.asm | 173 ++
 vp9/common/x86/vp9_iwalsh_sse2.asm | 119 ++
 vp9/common/x86/vp9_loopfilter_mmx.asm | 969 +++++++++++
 vp9/common/x86/vp9_loopfilter_sse2.asm | 1238 ++++++++++++++
 vp9/common/x86/vp9_loopfilter_x86.c | 547 ++++++
 vp9/common/x86/vp9_loopfilter_x86.h | 43 +
 vp9/common/x86/vp9_mask_sse3.asm | 484 ++++++
 vp9/common/x86/vp9_postproc_mmx.asm | 534 ++++++
 vp9/common/x86/vp9_postproc_sse2.asm | 695 ++++++++
 vp9/common/x86/vp9_postproc_x86.h | 64 +
 vp9/common/x86/vp9_recon_mmx.asm | 321 ++++
 vp9/common/x86/vp9_recon_sse2.asm | 688 ++++++++
 vp9/common/x86/vp9_recon_wrapper_sse2.c | 101 ++
 vp9/common/x86/vp9_sadmxn_x86.c | 99 ++
 vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 550 ++++++
 vp9/common/x86/vp9_subpixel_mmx.asm | 727 ++++++++
 vp9/common/x86/vp9_subpixel_sse2.asm | 1372 +++++++++++++++
 vp9/common/x86/vp9_subpixel_ssse3.asm | 1515 +++++++++++++++++
 vp9/common/x86/vp9_subpixel_x86.h | 122 ++
 vp9/common/x86/vp9_vp8_asm_stubs.c | 602 +++++++
 vp9/common/x86/vp9_x86_systemdependent.c | 74 +
 vp9/common/x86/x86_systemdependent.c | 74 -
 318 files changed, 45358 insertions(+), 45358 deletions(-)
delete mode 100644 vp9/common/alloccommon.c
delete mode 100644 vp9/common/alloccommon.h
delete mode 100644 vp9/common/arm/arm_systemdependent.c
delete mode 100644
vp9/common/arm/armv6/bilinearfilter_v6.asm delete mode 100644 vp9/common/arm/armv6/copymem16x16_v6.asm delete mode 100644 vp9/common/arm/armv6/copymem8x4_v6.asm delete mode 100644 vp9/common/arm/armv6/copymem8x8_v6.asm delete mode 100644 vp9/common/arm/armv6/dc_only_idct_add_v6.asm delete mode 100644 vp9/common/arm/armv6/filter_v6.asm delete mode 100644 vp9/common/arm/armv6/idct_v6.asm delete mode 100644 vp9/common/arm/armv6/iwalsh_v6.asm delete mode 100644 vp9/common/arm/armv6/loopfilter_v6.asm delete mode 100644 vp9/common/arm/armv6/recon_v6.asm delete mode 100644 vp9/common/arm/armv6/simpleloopfilter_v6.asm delete mode 100644 vp9/common/arm/armv6/sixtappredict8x4_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_bilinearfilter_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_copymem16x16_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_copymem8x4_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_copymem8x8_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_dc_only_idct_add_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_filter_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_idct_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_iwalsh_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_loopfilter_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_recon_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_simpleloopfilter_v6.asm create mode 100644 vp9/common/arm/armv6/vp9_sixtappredict8x4_v6.asm delete mode 100644 vp9/common/arm/bilinearfilter_arm.c delete mode 100644 vp9/common/arm/bilinearfilter_arm.h delete mode 100644 vp9/common/arm/filter_arm.c delete mode 100644 vp9/common/arm/idct_arm.h delete mode 100644 vp9/common/arm/loopfilter_arm.c delete mode 100644 vp9/common/arm/loopfilter_arm.h delete mode 100644 vp9/common/arm/neon/bilinearpredict16x16_neon.asm delete mode 100644 vp9/common/arm/neon/bilinearpredict4x4_neon.asm delete mode 100644 vp9/common/arm/neon/bilinearpredict8x4_neon.asm delete mode 100644 vp9/common/arm/neon/bilinearpredict8x8_neon.asm delete mode 100644 vp9/common/arm/neon/buildintrapredictorsmby_neon.asm delete mode 100644 vp9/common/arm/neon/copymem16x16_neon.asm delete mode 100644 vp9/common/arm/neon/copymem8x4_neon.asm delete mode 100644 vp9/common/arm/neon/copymem8x8_neon.asm delete mode 100644 vp9/common/arm/neon/dc_only_idct_add_neon.asm delete mode 100644 vp9/common/arm/neon/iwalsh_neon.asm delete mode 100644 vp9/common/arm/neon/loopfilter_neon.asm delete mode 100644 vp9/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm delete mode 100644 vp9/common/arm/neon/loopfiltersimpleverticaledge_neon.asm delete mode 100644 vp9/common/arm/neon/mbloopfilter_neon.asm delete mode 100644 vp9/common/arm/neon/recon16x16mb_neon.asm delete mode 100644 vp9/common/arm/neon/recon2b_neon.asm delete mode 100644 vp9/common/arm/neon/recon4b_neon.asm delete mode 100644 vp9/common/arm/neon/recon_neon.c delete mode 100644 vp9/common/arm/neon/reconb_neon.asm delete mode 100644 vp9/common/arm/neon/save_neon_reg.asm delete mode 100644 vp9/common/arm/neon/shortidct4x4llm_1_neon.asm delete mode 100644 vp9/common/arm/neon/shortidct4x4llm_neon.asm delete mode 100644 vp9/common/arm/neon/sixtappredict16x16_neon.asm delete mode 100644 vp9/common/arm/neon/sixtappredict4x4_neon.asm delete mode 100644 vp9/common/arm/neon/sixtappredict8x4_neon.asm delete mode 100644 vp9/common/arm/neon/sixtappredict8x8_neon.asm create mode 100644 vp9/common/arm/neon/vp9_bilinearpredict16x16_neon.asm create mode 100644 vp9/common/arm/neon/vp9_bilinearpredict4x4_neon.asm create mode 100644 
vp9/common/arm/neon/vp9_bilinearpredict8x4_neon.asm create mode 100644 vp9/common/arm/neon/vp9_bilinearpredict8x8_neon.asm create mode 100644 vp9/common/arm/neon/vp9_buildintrapredictorsmby_neon.asm create mode 100644 vp9/common/arm/neon/vp9_copymem16x16_neon.asm create mode 100644 vp9/common/arm/neon/vp9_copymem8x4_neon.asm create mode 100644 vp9/common/arm/neon/vp9_copymem8x8_neon.asm create mode 100644 vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm create mode 100644 vp9/common/arm/neon/vp9_iwalsh_neon.asm create mode 100644 vp9/common/arm/neon/vp9_loopfilter_neon.asm create mode 100644 vp9/common/arm/neon/vp9_loopfiltersimplehorizontaledge_neon.asm create mode 100644 vp9/common/arm/neon/vp9_loopfiltersimpleverticaledge_neon.asm create mode 100644 vp9/common/arm/neon/vp9_mbloopfilter_neon.asm create mode 100644 vp9/common/arm/neon/vp9_recon16x16mb_neon.asm create mode 100644 vp9/common/arm/neon/vp9_recon2b_neon.asm create mode 100644 vp9/common/arm/neon/vp9_recon4b_neon.asm create mode 100644 vp9/common/arm/neon/vp9_recon_neon.c create mode 100644 vp9/common/arm/neon/vp9_reconb_neon.asm create mode 100644 vp9/common/arm/neon/vp9_save_neon_reg.asm create mode 100644 vp9/common/arm/neon/vp9_shortidct4x4llm_1_neon.asm create mode 100644 vp9/common/arm/neon/vp9_shortidct4x4llm_neon.asm create mode 100644 vp9/common/arm/neon/vp9_sixtappredict16x16_neon.asm create mode 100644 vp9/common/arm/neon/vp9_sixtappredict4x4_neon.asm create mode 100644 vp9/common/arm/neon/vp9_sixtappredict8x4_neon.asm create mode 100644 vp9/common/arm/neon/vp9_sixtappredict8x8_neon.asm delete mode 100644 vp9/common/arm/recon_arm.h delete mode 100644 vp9/common/arm/reconintra_arm.c delete mode 100644 vp9/common/arm/subpixel_arm.h create mode 100644 vp9/common/arm/vp9_arm_systemdependent.c create mode 100644 vp9/common/arm/vp9_bilinearfilter_arm.c create mode 100644 vp9/common/arm/vp9_bilinearfilter_arm.h create mode 100644 vp9/common/arm/vp9_filter_arm.c create mode 100644 vp9/common/arm/vp9_idct_arm.h create mode 100644 vp9/common/arm/vp9_loopfilter_arm.c create mode 100644 vp9/common/arm/vp9_loopfilter_arm.h create mode 100644 vp9/common/arm/vp9_recon_arm.h create mode 100644 vp9/common/arm/vp9_reconintra_arm.c create mode 100644 vp9/common/arm/vp9_subpixel_arm.h delete mode 100644 vp9/common/asm_com_offsets.c delete mode 100644 vp9/common/blockd.c delete mode 100644 vp9/common/blockd.h delete mode 100644 vp9/common/coefupdateprobs.h delete mode 100644 vp9/common/common.h delete mode 100644 vp9/common/common_types.h delete mode 100644 vp9/common/context.c delete mode 100644 vp9/common/debugmodes.c delete mode 100644 vp9/common/default_coef_probs.h delete mode 100644 vp9/common/entropy.c delete mode 100644 vp9/common/entropy.h delete mode 100644 vp9/common/entropymode.c delete mode 100644 vp9/common/entropymode.h delete mode 100644 vp9/common/entropymv.c delete mode 100644 vp9/common/entropymv.h delete mode 100644 vp9/common/extend.c delete mode 100644 vp9/common/extend.h delete mode 100644 vp9/common/filter.c delete mode 100644 vp9/common/filter.h delete mode 100644 vp9/common/findnearmv.c delete mode 100644 vp9/common/findnearmv.h delete mode 100644 vp9/common/generic/systemdependent.c create mode 100644 vp9/common/generic/vp9_systemdependent.c delete mode 100644 vp9/common/header.h delete mode 100644 vp9/common/idctllm.c delete mode 100644 vp9/common/implicit_segmentation.c delete mode 100644 vp9/common/invtrans.c delete mode 100644 vp9/common/invtrans.h delete mode 100644 vp9/common/loopfilter.c delete mode 
100644 vp9/common/loopfilter.h delete mode 100644 vp9/common/loopfilter_filters.c delete mode 100644 vp9/common/maskingmv.c delete mode 100644 vp9/common/mbpitch.c delete mode 100644 vp9/common/modecont.c delete mode 100644 vp9/common/modecont.h delete mode 100644 vp9/common/modecontext.c delete mode 100644 vp9/common/mv.h delete mode 100644 vp9/common/mvref_common.c delete mode 100644 vp9/common/mvref_common.h delete mode 100644 vp9/common/onyx.h delete mode 100644 vp9/common/onyxc_int.h delete mode 100644 vp9/common/onyxd.h delete mode 100644 vp9/common/postproc.c delete mode 100644 vp9/common/postproc.h delete mode 100644 vp9/common/ppc/copy_altivec.asm delete mode 100644 vp9/common/ppc/filter_altivec.asm delete mode 100644 vp9/common/ppc/filter_bilinear_altivec.asm delete mode 100644 vp9/common/ppc/idctllm_altivec.asm delete mode 100644 vp9/common/ppc/loopfilter_altivec.c delete mode 100644 vp9/common/ppc/loopfilter_filters_altivec.asm delete mode 100644 vp9/common/ppc/platform_altivec.asm delete mode 100644 vp9/common/ppc/recon_altivec.asm delete mode 100644 vp9/common/ppc/systemdependent.c create mode 100644 vp9/common/ppc/vp9_copy_altivec.asm create mode 100644 vp9/common/ppc/vp9_filter_altivec.asm create mode 100644 vp9/common/ppc/vp9_filter_bilinear_altivec.asm create mode 100644 vp9/common/ppc/vp9_idctllm_altivec.asm create mode 100644 vp9/common/ppc/vp9_loopfilter_altivec.c create mode 100644 vp9/common/ppc/vp9_loopfilter_filters_altivec.asm create mode 100644 vp9/common/ppc/vp9_platform_altivec.asm create mode 100644 vp9/common/ppc/vp9_recon_altivec.asm create mode 100644 vp9/common/ppc/vp9_systemdependent.c delete mode 100644 vp9/common/ppflags.h delete mode 100644 vp9/common/pragmas.h delete mode 100644 vp9/common/pred_common.c delete mode 100644 vp9/common/pred_common.h delete mode 100644 vp9/common/quant_common.c delete mode 100644 vp9/common/quant_common.h delete mode 100644 vp9/common/recon.c delete mode 100644 vp9/common/reconinter.c delete mode 100644 vp9/common/reconinter.h delete mode 100644 vp9/common/reconintra.c delete mode 100644 vp9/common/reconintra.h delete mode 100644 vp9/common/reconintra4x4.c delete mode 100644 vp9/common/reconintra4x4.h delete mode 100644 vp9/common/rtcd.c delete mode 100644 vp9/common/rtcd_defs.sh delete mode 100644 vp9/common/sadmxn.h delete mode 100644 vp9/common/seg_common.c delete mode 100644 vp9/common/seg_common.h delete mode 100644 vp9/common/setupintrarecon.c delete mode 100644 vp9/common/setupintrarecon.h delete mode 100644 vp9/common/subpelvar.h delete mode 100644 vp9/common/subpixel.h delete mode 100644 vp9/common/swapyv12buffer.c delete mode 100644 vp9/common/swapyv12buffer.h delete mode 100644 vp9/common/systemdependent.h delete mode 100644 vp9/common/tapify.py delete mode 100644 vp9/common/textblit.c delete mode 100644 vp9/common/textblit.h delete mode 100644 vp9/common/treecoder.c delete mode 100644 vp9/common/treecoder.h delete mode 100644 vp9/common/type_aliases.h create mode 100644 vp9/common/vp9_alloccommon.c create mode 100644 vp9/common/vp9_alloccommon.h create mode 100644 vp9/common/vp9_asm_com_offsets.c create mode 100644 vp9/common/vp9_blockd.c create mode 100644 vp9/common/vp9_blockd.h create mode 100644 vp9/common/vp9_coefupdateprobs.h create mode 100644 vp9/common/vp9_common.h create mode 100644 vp9/common/vp9_common_types.h create mode 100644 vp9/common/vp9_context.c create mode 100644 vp9/common/vp9_debugmodes.c create mode 100644 vp9/common/vp9_default_coef_probs.h create mode 100644 vp9/common/vp9_entropy.c 
create mode 100644 vp9/common/vp9_entropy.h create mode 100644 vp9/common/vp9_entropymode.c create mode 100644 vp9/common/vp9_entropymode.h create mode 100644 vp9/common/vp9_entropymv.c create mode 100644 vp9/common/vp9_entropymv.h create mode 100644 vp9/common/vp9_extend.c create mode 100644 vp9/common/vp9_extend.h create mode 100644 vp9/common/vp9_filter.c create mode 100644 vp9/common/vp9_filter.h create mode 100644 vp9/common/vp9_findnearmv.c create mode 100644 vp9/common/vp9_findnearmv.h create mode 100644 vp9/common/vp9_header.h create mode 100644 vp9/common/vp9_idctllm.c create mode 100644 vp9/common/vp9_implicit_segmentation.c create mode 100644 vp9/common/vp9_invtrans.c create mode 100644 vp9/common/vp9_invtrans.h create mode 100644 vp9/common/vp9_loopfilter.c create mode 100644 vp9/common/vp9_loopfilter.h create mode 100644 vp9/common/vp9_loopfilter_filters.c create mode 100644 vp9/common/vp9_maskingmv.c create mode 100644 vp9/common/vp9_mbpitch.c create mode 100644 vp9/common/vp9_modecont.c create mode 100644 vp9/common/vp9_modecont.h create mode 100644 vp9/common/vp9_modecontext.c create mode 100644 vp9/common/vp9_mv.h create mode 100644 vp9/common/vp9_mvref_common.c create mode 100644 vp9/common/vp9_mvref_common.h create mode 100644 vp9/common/vp9_onyx.h create mode 100644 vp9/common/vp9_onyxc_int.h create mode 100644 vp9/common/vp9_onyxd.h create mode 100644 vp9/common/vp9_postproc.c create mode 100644 vp9/common/vp9_postproc.h create mode 100644 vp9/common/vp9_ppflags.h create mode 100644 vp9/common/vp9_pragmas.h create mode 100644 vp9/common/vp9_pred_common.c create mode 100644 vp9/common/vp9_pred_common.h create mode 100644 vp9/common/vp9_quant_common.c create mode 100644 vp9/common/vp9_quant_common.h create mode 100644 vp9/common/vp9_recon.c create mode 100644 vp9/common/vp9_reconinter.c create mode 100644 vp9/common/vp9_reconinter.h create mode 100644 vp9/common/vp9_reconintra.c create mode 100644 vp9/common/vp9_reconintra.h create mode 100644 vp9/common/vp9_reconintra4x4.c create mode 100644 vp9/common/vp9_reconintra4x4.h create mode 100644 vp9/common/vp9_rtcd.c create mode 100644 vp9/common/vp9_rtcd_defs.sh create mode 100644 vp9/common/vp9_sadmxn.h create mode 100644 vp9/common/vp9_seg_common.c create mode 100644 vp9/common/vp9_seg_common.h create mode 100644 vp9/common/vp9_setupintrarecon.c create mode 100644 vp9/common/vp9_setupintrarecon.h create mode 100644 vp9/common/vp9_subpelvar.h create mode 100644 vp9/common/vp9_subpixel.h create mode 100644 vp9/common/vp9_swapyv12buffer.c create mode 100644 vp9/common/vp9_swapyv12buffer.h create mode 100644 vp9/common/vp9_systemdependent.h create mode 100644 vp9/common/vp9_tapify.py create mode 100644 vp9/common/vp9_textblit.c create mode 100644 vp9/common/vp9_textblit.h create mode 100644 vp9/common/vp9_treecoder.c create mode 100644 vp9/common/vp9_treecoder.h create mode 100644 vp9/common/vp9_type_aliases.h delete mode 100644 vp9/common/x86/filter_sse2.c delete mode 100644 vp9/common/x86/filter_sse4.c delete mode 100644 vp9/common/x86/idct_x86.h delete mode 100644 vp9/common/x86/idctllm_mmx.asm delete mode 100644 vp9/common/x86/idctllm_sse2.asm delete mode 100644 vp9/common/x86/iwalsh_mmx.asm delete mode 100644 vp9/common/x86/iwalsh_sse2.asm delete mode 100644 vp9/common/x86/loopfilter_mmx.asm delete mode 100644 vp9/common/x86/loopfilter_sse2.asm delete mode 100644 vp9/common/x86/loopfilter_x86.c delete mode 100644 vp9/common/x86/loopfilter_x86.h delete mode 100644 vp9/common/x86/mask_sse3.asm delete mode 100644 
vp9/common/x86/postproc_mmx.asm delete mode 100644 vp9/common/x86/postproc_sse2.asm delete mode 100644 vp9/common/x86/postproc_x86.h delete mode 100644 vp9/common/x86/recon_mmx.asm delete mode 100644 vp9/common/x86/recon_sse2.asm delete mode 100644 vp9/common/x86/recon_wrapper_sse2.c delete mode 100644 vp9/common/x86/sadmxn_x86.c delete mode 100644 vp9/common/x86/subpixel_8t_ssse3.asm delete mode 100644 vp9/common/x86/subpixel_mmx.asm delete mode 100644 vp9/common/x86/subpixel_sse2.asm delete mode 100644 vp9/common/x86/subpixel_ssse3.asm delete mode 100644 vp9/common/x86/subpixel_x86.h delete mode 100644 vp9/common/x86/vp8_asm_stubs.c create mode 100644 vp9/common/x86/vp9_filter_sse2.c create mode 100644 vp9/common/x86/vp9_filter_sse4.c create mode 100644 vp9/common/x86/vp9_idct_x86.h create mode 100644 vp9/common/x86/vp9_idctllm_mmx.asm create mode 100644 vp9/common/x86/vp9_idctllm_sse2.asm create mode 100644 vp9/common/x86/vp9_iwalsh_mmx.asm create mode 100644 vp9/common/x86/vp9_iwalsh_sse2.asm create mode 100644 vp9/common/x86/vp9_loopfilter_mmx.asm create mode 100644 vp9/common/x86/vp9_loopfilter_sse2.asm create mode 100644 vp9/common/x86/vp9_loopfilter_x86.c create mode 100644 vp9/common/x86/vp9_loopfilter_x86.h create mode 100644 vp9/common/x86/vp9_mask_sse3.asm create mode 100644 vp9/common/x86/vp9_postproc_mmx.asm create mode 100644 vp9/common/x86/vp9_postproc_sse2.asm create mode 100644 vp9/common/x86/vp9_postproc_x86.h create mode 100644 vp9/common/x86/vp9_recon_mmx.asm create mode 100644 vp9/common/x86/vp9_recon_sse2.asm create mode 100644 vp9/common/x86/vp9_recon_wrapper_sse2.c create mode 100644 vp9/common/x86/vp9_sadmxn_x86.c create mode 100644 vp9/common/x86/vp9_subpixel_8t_ssse3.asm create mode 100644 vp9/common/x86/vp9_subpixel_mmx.asm create mode 100644 vp9/common/x86/vp9_subpixel_sse2.asm create mode 100644 vp9/common/x86/vp9_subpixel_ssse3.asm create mode 100644 vp9/common/x86/vp9_subpixel_x86.h create mode 100644 vp9/common/x86/vp9_vp8_asm_stubs.c create mode 100644 vp9/common/x86/vp9_x86_systemdependent.c delete mode 100644 vp9/common/x86/x86_systemdependent.c (limited to 'vp9/common') diff --git a/vp9/common/alloccommon.c b/vp9/common/alloccommon.c deleted file mode 100644 index 33d322099..000000000 --- a/vp9/common/alloccommon.c +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vpx_ports/config.h" -#include "blockd.h" -#include "vpx_mem/vpx_mem.h" -#include "onyxc_int.h" -#include "findnearmv.h" -#include "entropymode.h" -#include "entropymv.h" -#include "systemdependent.h" - - -void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) { - int stride = cpi->mode_info_stride; - int i; - - // Clear down top border row - vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride); - - // Clear left border column - for (i = 1; i < cpi->mb_rows + 1; i++) { - vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO)); - } -} - -void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) { - int i, j; - - // For each in image mode_info element set the in image flag to 1 - for (i = 0; i < cpi->mb_rows; i++) { - for (j = 0; j < cpi->mb_cols; j++) { - mi->mbmi.mb_in_image = 1; - mi++; // Next element in the row - } - - mi++; // Step over border element at start of next row - } -} - -void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) { - int i; - - for (i = 0; i < NUM_YV12_BUFFERS; i++) - vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]); - - vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame); - vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer); - - vpx_free(oci->above_context); - vpx_free(oci->mip); - vpx_free(oci->prev_mip); - - oci->above_context = 0; - oci->mip = 0; - oci->prev_mip = 0; - -} - -int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { - int i; - - vp9_de_alloc_frame_buffers(oci); - - /* our internal buffers are always multiples of 16 */ - if ((width & 0xf) != 0) - width += 16 - (width & 0xf); - - if ((height & 0xf) != 0) - height += 16 - (height & 0xf); - - - for (i = 0; i < NUM_YV12_BUFFERS; i++) { - oci->fb_idx_ref_cnt[i] = 0; - oci->yv12_fb[i].flags = 0; - if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, - VP9BORDERINPIXELS) < 0) { - vp9_de_alloc_frame_buffers(oci); - return 1; - } - } - - oci->new_fb_idx = 0; - oci->lst_fb_idx = 1; - oci->gld_fb_idx = 2; - oci->alt_fb_idx = 3; - - oci->fb_idx_ref_cnt[0] = 1; - oci->fb_idx_ref_cnt[1] = 1; - oci->fb_idx_ref_cnt[2] = 1; - oci->fb_idx_ref_cnt[3] = 1; - - if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, - VP9BORDERINPIXELS) < 0) { - vp9_de_alloc_frame_buffers(oci); - return 1; - } - - if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, - VP9BORDERINPIXELS) < 0) { - vp9_de_alloc_frame_buffers(oci); - return 1; - } - - oci->mb_rows = height >> 4; - oci->mb_cols = width >> 4; - oci->MBs = oci->mb_rows * oci->mb_cols; - oci->mode_info_stride = oci->mb_cols + 1; - oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); - - if (!oci->mip) { - vp9_de_alloc_frame_buffers(oci); - return 1; - } - - oci->mi = oci->mip + oci->mode_info_stride + 1; - - /* allocate memory for last frame MODE_INFO array */ - - oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); - - if (!oci->prev_mip) { - vp9_de_alloc_frame_buffers(oci); - return 1; - } - - oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1; - - oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1); - - if (!oci->above_context) { - vp9_de_alloc_frame_buffers(oci); - return 1; - } - - vp9_update_mode_info_border(oci, oci->mip); - vp9_update_mode_info_in_image(oci, oci->mi); - - return 0; -} -void vp9_setup_version(VP9_COMMON *cm) { - if (cm->version & 0x4) { - if (!CONFIG_EXPERIMENTAL) - vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, - "Bitstream was 
created by an experimental " - "encoder"); - cm->experimental = 1; - } - - switch (cm->version & 0x3) { - case 0: - cm->no_lpf = 0; - cm->filter_type = NORMAL_LOOPFILTER; - cm->use_bilinear_mc_filter = 0; - cm->full_pixel = 0; - break; - case 1: - cm->no_lpf = 0; - cm->filter_type = SIMPLE_LOOPFILTER; - cm->use_bilinear_mc_filter = 1; - cm->full_pixel = 0; - break; - case 2: - case 3: - cm->no_lpf = 1; - cm->filter_type = NORMAL_LOOPFILTER; - cm->use_bilinear_mc_filter = 1; - cm->full_pixel = 0; - break; - // Full pel only code deprecated in experimental code base - // case 3: - // cm->no_lpf = 1; - // cm->filter_type = SIMPLE_LOOPFILTER; - // cm->use_bilinear_mc_filter = 1; - // cm->full_pixel = 1; - // break; - } -} -void vp9_create_common(VP9_COMMON *oci) { - vp9_machine_specific_config(oci); - - vp9_init_mbmode_probs(oci); - - vp9_default_bmode_probs(oci->fc.bmode_prob); - - oci->txfm_mode = ONLY_4X4; - oci->mb_no_coeff_skip = 1; - oci->comp_pred_mode = HYBRID_PREDICTION; - oci->no_lpf = 0; - oci->filter_type = NORMAL_LOOPFILTER; - oci->use_bilinear_mc_filter = 0; - oci->full_pixel = 0; - oci->clr_type = REG_YUV; - oci->clamp_type = RECON_CLAMP_REQUIRED; - - /* Initialise reference frame sign bias structure to defaults */ - vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); - - /* Default disable buffer to buffer copying */ - oci->copy_buffer_to_gf = 0; - oci->copy_buffer_to_arf = 0; - oci->kf_ymode_probs_update = 0; -} - -void vp9_remove_common(VP9_COMMON *oci) { - vp9_de_alloc_frame_buffers(oci); -} - -void vp9_initialize_common() { - vp9_coef_tree_initialize(); - - vp9_entropy_mode_init(); - - vp9_entropy_mv_init(); -} diff --git a/vp9/common/alloccommon.h b/vp9/common/alloccommon.h deleted file mode 100644 index 490b869b7..000000000 --- a/vp9/common/alloccommon.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_ALLOCCOMMON_H -#define __INC_ALLOCCOMMON_H - -#include "onyxc_int.h" - -void vp9_create_common(VP9_COMMON *oci); -void vp9_remove_common(VP9_COMMON *oci); -void vp9_de_alloc_frame_buffers(VP9_COMMON *oci); -int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height); -void vp9_setup_version(VP9_COMMON *oci); - -void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base); -void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi); - -#endif diff --git a/vp9/common/arm/arm_systemdependent.c b/vp9/common/arm/arm_systemdependent.c deleted file mode 100644 index 0a0e8098e..000000000 --- a/vp9/common/arm/arm_systemdependent.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vpx_ports/config.h" -#include "vpx_ports/arm.h" -#include "vp9/common/pragmas.h" -#include "vp9/common/subpixel.h" -#include "vp9/common/loopfilter.h" -#include "vp9/common/recon.h" -#include "vp9/common/onyxc_int.h" - -void vp9_arch_arm_common_init(VP9_COMMON *ctx) { -#if CONFIG_RUNTIME_CPU_DETECT - VP9_COMMON_RTCD *rtcd = &ctx->rtcd; - int flags = arm_cpu_caps(); - rtcd->flags = flags; - - /* Override default functions with fastest ones for this CPU. */ -#if HAVE_ARMV5TE - if (flags & HAS_EDSP) { - } -#endif - -// The commented functions need to be re-written for vpx. -#if HAVE_ARMV6 - if (flags & HAS_MEDIA) { - rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_armv6; - rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_armv6; - rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_armv6; - rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_armv6; - - rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6; - rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_armv6; - rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_armv6; - rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_armv6; - - // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_v6; - // rtcd->idct.idct16 = vp9_short_idct4x4llm_v6_dual; - // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_v6; - // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_v6; - - rtcd->recon.copy16x16 = vp9_copy_mem16x16_v6; - rtcd->recon.copy8x8 = vp9_copy_mem8x8_v6; - rtcd->recon.copy8x4 = vp9_copy_mem8x4_v6; - rtcd->recon.recon = vp9_recon_b_armv6; - rtcd->recon.recon2 = vp9_recon2b_armv6; - rtcd->recon.recon4 = vp9_recon4b_armv6; - } -#endif - -#if HAVE_ARMV7 - if (flags & HAS_NEON) { - rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_neon; - rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_neon; - rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_neon; - rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_neon; - - rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon; - rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_neon; - rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_neon; - rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_neon; - - // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_neon; - // rtcd->idct.idct16 = vp9_short_idct4x4llm_neon; - // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_neon; - // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_neon; - - rtcd->recon.copy16x16 = vp9_copy_mem16x16_neon; - rtcd->recon.copy8x8 = vp9_copy_mem8x8_neon; - rtcd->recon.copy8x4 = vp9_copy_mem8x4_neon; - rtcd->recon.recon = vp9_recon_b_neon; - rtcd->recon.recon2 = vp9_recon2b_neon; - rtcd->recon.recon4 = vp9_recon4b_neon; - rtcd->recon.recon_mb = vp9_recon_mb_neon; - rtcd->recon.build_intra_predictors_mby = - vp9_build_intra_predictors_mby_neon; - rtcd->recon.build_intra_predictors_mby_s = - vp9_build_intra_predictors_mby_s_neon; - } -#endif - -#endif -} diff --git a/vp9/common/arm/armv6/bilinearfilter_v6.asm b/vp9/common/arm/armv6/bilinearfilter_v6.asm deleted file mode 100644 index 36e391e2b..000000000 --- a/vp9/common/arm/armv6/bilinearfilter_v6.asm +++ /dev/null @@ -1,237 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp9_filter_block2d_bil_first_pass_armv6| - EXPORT |vp9_filter_block2d_bil_second_pass_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - -;------------------------------------- -; r0 unsigned char *src_ptr, -; r1 unsigned short *dst_ptr, -; r2 unsigned int src_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *vp9_filter -;------------------------------------- -; The output is transposed stroed in output array to make it easy for second pass filtering. -|vp9_filter_block2d_bil_first_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r4, [sp, #36] ; width - - mov r12, r3 ; outer-loop counter - - add r7, r2, r4 ; preload next row - pld [r0, r7] - - sub r2, r2, r4 ; src increment for height loop - - ldr r5, [r11] ; load up filter coefficients - - mov r3, r3, lsl #1 ; height*2 - add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) - - mov r11, r1 ; save dst_ptr for each row - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_1st_filter - -|bil_height_loop_1st_v6| - ldrb r6, [r0] ; load source data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - mov lr, r4, lsr #2 ; 4-in-parellel loop counter - -|bil_width_loop_1st_v6| - ldrb r9, [r0, #3] - ldrb r10, [r0, #4] - - pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] - pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] - - smuad r6, r6, r5 ; apply the filter - pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] - smuad r7, r7, r5 - pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] - - smuad r8, r8, r5 - smuad r9, r9, r5 - - add r0, r0, #4 - subs lr, lr, #1 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #16, r6, asr #7 - usat r7, #16, r7, asr #7 - - strh r6, [r1], r3 ; result is transposed and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strh r7, [r1], r3 - add r9, r9, #0x40 - usat r8, #16, r8, asr #7 - usat r9, #16, r9, asr #7 - - strh r8, [r1], r3 ; result is transposed and stored - - ldrneb r6, [r0] ; load source data - strh r9, [r1], r3 - - ldrneb r7, [r0, #1] - ldrneb r8, [r0, #2] - - bne bil_width_loop_1st_v6 - - add r0, r0, r2 ; move to next input row - subs r12, r12, #1 - - add r9, r2, r4, lsl #1 ; adding back block width - pld [r0, r9] ; preload next row - - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_1st_v6 - - ldmia sp!, {r4 - r11, pc} - -|bil_null_1st_filter| -|bil_height_loop_null_1st| - mov lr, r4, lsr #2 ; loop counter - -|bil_width_loop_null_1st| - ldrb r6, [r0] ; load data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - ldrb r9, [r0, #3] - - strh r6, [r1], r3 ; store it to immediate buffer - add r0, r0, #4 - strh r7, [r1], r3 - subs lr, lr, #1 - strh r8, [r1], r3 - strh r9, [r1], r3 - - bne bil_width_loop_null_1st - - subs r12, r12, #1 - add r0, r0, r2 ; move to next input line - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_null_1st - - ldmia sp!, {r4 - r11, pc} - - ENDP ; |vp9_filter_block2d_bil_first_pass_armv6| - - -;--------------------------------- -; r0 unsigned short *src_ptr, -; r1 unsigned char *dst_ptr, -; r2 int dst_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *vp9_filter -;--------------------------------- -|vp9_filter_block2d_bil_second_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r4, [sp, #36] ; width - - ldr r5, [r11] ; load up filter coefficients - mov r12, r4 ; outer-loop 
counter = width, since we work on transposed data matrix - mov r11, r1 - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_2nd_filter - -|bil_height_loop_2nd| - ldr r6, [r0] ; load the data - ldr r8, [r0, #4] - ldrh r10, [r0, #8] - mov lr, r3, lsr #2 ; loop counter - -|bil_width_loop_2nd| - pkhtb r7, r6, r8 ; src[1] | src[2] - pkhtb r9, r8, r10 ; src[3] | src[4] - - smuad r6, r6, r5 ; apply filter - smuad r8, r8, r5 ; apply filter - - subs lr, lr, #1 - - smuadx r7, r7, r5 ; apply filter - smuadx r9, r9, r5 ; apply filter - - add r0, r0, #8 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #8, r6, asr #7 - usat r7, #8, r7, asr #7 - strb r6, [r1], r2 ; the result is transposed back and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strb r7, [r1], r2 - add r9, r9, #0x40 - usat r8, #8, r8, asr #7 - usat r9, #8, r9, asr #7 - strb r8, [r1], r2 ; the result is transposed back and stored - - ldrne r6, [r0] ; load data - strb r9, [r1], r2 - ldrne r8, [r0, #4] - ldrneh r10, [r0, #8] - - bne bil_width_loop_2nd - - subs r12, r12, #1 - add r0, r0, #4 ; update src for next row - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_2nd - ldmia sp!, {r4 - r11, pc} - -|bil_null_2nd_filter| -|bil_height_loop_null_2nd| - mov lr, r3, lsr #2 - -|bil_width_loop_null_2nd| - ldr r6, [r0], #4 ; load data - subs lr, lr, #1 - ldr r8, [r0], #4 - - strb r6, [r1], r2 ; store data - mov r7, r6, lsr #16 - strb r7, [r1], r2 - mov r9, r8, lsr #16 - strb r8, [r1], r2 - strb r9, [r1], r2 - - bne bil_width_loop_null_2nd - - subs r12, r12, #1 - add r0, r0, #4 - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_null_2nd - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_filter_block2d_second_pass_armv6| - - END diff --git a/vp9/common/arm/armv6/copymem16x16_v6.asm b/vp9/common/arm/armv6/copymem16x16_v6.asm deleted file mode 100644 index 44c3c492f..000000000 --- a/vp9/common/arm/armv6/copymem16x16_v6.asm +++ /dev/null @@ -1,186 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp9_copy_mem16x16_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem16x16_v6| PROC - stmdb sp!, {r4 - r7} - ;push {r4-r7} - - ;preload - pld [r0, #31] ; preload for next 16x16 block - - ands r4, r0, #15 - beq copy_mem16x16_fast - - ands r4, r0, #7 - beq copy_mem16x16_8 - - ands r4, r0, #3 - beq copy_mem16x16_4 - - ;copy one byte each time - ldrb r4, [r0] - ldrb r5, [r0, #1] - ldrb r6, [r0, #2] - ldrb r7, [r0, #3] - - mov r12, #16 - -copy_mem16x16_1_loop - strb r4, [r2] - strb r5, [r2, #1] - strb r6, [r2, #2] - strb r7, [r2, #3] - - ldrb r4, [r0, #4] - ldrb r5, [r0, #5] - ldrb r6, [r0, #6] - ldrb r7, [r0, #7] - - subs r12, r12, #1 - - strb r4, [r2, #4] - strb r5, [r2, #5] - strb r6, [r2, #6] - strb r7, [r2, #7] - - ldrb r4, [r0, #8] - ldrb r5, [r0, #9] - ldrb r6, [r0, #10] - ldrb r7, [r0, #11] - - strb r4, [r2, #8] - strb r5, [r2, #9] - strb r6, [r2, #10] - strb r7, [r2, #11] - - ldrb r4, [r0, #12] - ldrb r5, [r0, #13] - ldrb r6, [r0, #14] - ldrb r7, [r0, #15] - - add r0, r0, r1 - - strb r4, [r2, #12] - strb r5, [r2, #13] - strb r6, [r2, #14] - strb r7, [r2, #15] - - add r2, r2, r3 - - ldrneb r4, [r0] - ldrneb r5, [r0, #1] - ldrneb r6, [r0, #2] - ldrneb r7, [r0, #3] - - pld [r0, #31] ; preload for next 16x16 block - - bne copy_mem16x16_1_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - -;copy 4 bytes each time -copy_mem16x16_4 - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] - - mov r12, #16 - -copy_mem16x16_4_loop - subs r12, r12, #1 - add r0, r0, r1 - - str r4, [r2] - str r5, [r2, #4] - str r6, [r2, #8] - str r7, [r2, #12] - - add r2, r2, r3 - - ldrne r4, [r0] - ldrne r5, [r0, #4] - ldrne r6, [r0, #8] - ldrne r7, [r0, #12] - - pld [r0, #31] ; preload for next 16x16 block - - bne copy_mem16x16_4_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - -;copy 8 bytes each time -copy_mem16x16_8 - sub r1, r1, #16 - sub r3, r3, #16 - - mov r12, #16 - -copy_mem16x16_8_loop - ldmia r0!, {r4-r5} - ;ldm r0, {r4-r5} - ldmia r0!, {r6-r7} - - add r0, r0, r1 - - stmia r2!, {r4-r5} - subs r12, r12, #1 - ;stm r2, {r4-r5} - stmia r2!, {r6-r7} - - add r2, r2, r3 - - pld [r0, #31] ; preload for next 16x16 block - bne copy_mem16x16_8_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - -;copy 16 bytes each time -copy_mem16x16_fast - ;sub r1, r1, #16 - ;sub r3, r3, #16 - - mov r12, #16 - -copy_mem16x16_fast_loop - ldmia r0, {r4-r7} - ;ldm r0, {r4-r7} - add r0, r0, r1 - - subs r12, r12, #1 - stmia r2, {r4-r7} - ;stm r2, {r4-r7} - add r2, r2, r3 - - pld [r0, #31] ; preload for next 16x16 block - bne copy_mem16x16_fast_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - - ENDP ; |vp9_copy_mem16x16_v6| - - END diff --git a/vp9/common/arm/armv6/copymem8x4_v6.asm b/vp9/common/arm/armv6/copymem8x4_v6.asm deleted file mode 100644 index 45b904367..000000000 --- a/vp9/common/arm/armv6/copymem8x4_v6.asm +++ /dev/null @@ -1,128 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. 
All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_copy_mem8x4_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem8x4_v6| PROC - ;push {r4-r5} - stmdb sp!, {r4-r5} - - ;preload - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - ands r4, r0, #7 - beq copy_mem8x4_fast - - ands r4, r0, #3 - beq copy_mem8x4_4 - - ;copy 1 byte each time - ldrb r4, [r0] - ldrb r5, [r0, #1] - - mov r12, #4 - -copy_mem8x4_1_loop - strb r4, [r2] - strb r5, [r2, #1] - - ldrb r4, [r0, #2] - ldrb r5, [r0, #3] - - subs r12, r12, #1 - - strb r4, [r2, #2] - strb r5, [r2, #3] - - ldrb r4, [r0, #4] - ldrb r5, [r0, #5] - - strb r4, [r2, #4] - strb r5, [r2, #5] - - ldrb r4, [r0, #6] - ldrb r5, [r0, #7] - - add r0, r0, r1 - - strb r4, [r2, #6] - strb r5, [r2, #7] - - add r2, r2, r3 - - ldrneb r4, [r0] - ldrneb r5, [r0, #1] - - bne copy_mem8x4_1_loop - - ldmia sp!, {r4 - r5} - ;pop {r4-r5} - mov pc, lr - -;copy 4 bytes each time -copy_mem8x4_4 - ldr r4, [r0] - ldr r5, [r0, #4] - - mov r12, #4 - -copy_mem8x4_4_loop - subs r12, r12, #1 - add r0, r0, r1 - - str r4, [r2] - str r5, [r2, #4] - - add r2, r2, r3 - - ldrne r4, [r0] - ldrne r5, [r0, #4] - - bne copy_mem8x4_4_loop - - ldmia sp!, {r4-r5} - ;pop {r4-r5} - mov pc, lr - -;copy 8 bytes each time -copy_mem8x4_fast - ;sub r1, r1, #8 - ;sub r3, r3, #8 - - mov r12, #4 - -copy_mem8x4_fast_loop - ldmia r0, {r4-r5} - ;ldm r0, {r4-r5} - add r0, r0, r1 - - subs r12, r12, #1 - stmia r2, {r4-r5} - ;stm r2, {r4-r5} - add r2, r2, r3 - - bne copy_mem8x4_fast_loop - - ldmia sp!, {r4-r5} - ;pop {r4-r5} - mov pc, lr - - ENDP ; |vp9_copy_mem8x4_v6| - - END diff --git a/vp9/common/arm/armv6/copymem8x8_v6.asm b/vp9/common/arm/armv6/copymem8x8_v6.asm deleted file mode 100644 index 0dd971bfe..000000000 --- a/vp9/common/arm/armv6/copymem8x8_v6.asm +++ /dev/null @@ -1,128 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp9_copy_mem8x8_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem8x8_v6| PROC - ;push {r4-r5} - stmdb sp!, {r4-r5} - - ;preload - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - ands r4, r0, #7 - beq copy_mem8x8_fast - - ands r4, r0, #3 - beq copy_mem8x8_4 - - ;copy 1 byte each time - ldrb r4, [r0] - ldrb r5, [r0, #1] - - mov r12, #8 - -copy_mem8x8_1_loop - strb r4, [r2] - strb r5, [r2, #1] - - ldrb r4, [r0, #2] - ldrb r5, [r0, #3] - - subs r12, r12, #1 - - strb r4, [r2, #2] - strb r5, [r2, #3] - - ldrb r4, [r0, #4] - ldrb r5, [r0, #5] - - strb r4, [r2, #4] - strb r5, [r2, #5] - - ldrb r4, [r0, #6] - ldrb r5, [r0, #7] - - add r0, r0, r1 - - strb r4, [r2, #6] - strb r5, [r2, #7] - - add r2, r2, r3 - - ldrneb r4, [r0] - ldrneb r5, [r0, #1] - - bne copy_mem8x8_1_loop - - ldmia sp!, {r4 - r5} - ;pop {r4-r5} - mov pc, lr - -;copy 4 bytes each time -copy_mem8x8_4 - ldr r4, [r0] - ldr r5, [r0, #4] - - mov r12, #8 - -copy_mem8x8_4_loop - subs r12, r12, #1 - add r0, r0, r1 - - str r4, [r2] - str r5, [r2, #4] - - add r2, r2, r3 - - ldrne r4, [r0] - ldrne r5, [r0, #4] - - bne copy_mem8x8_4_loop - - ldmia sp!, {r4 - r5} - ;pop {r4-r5} - mov pc, lr - -;copy 8 bytes each time -copy_mem8x8_fast - ;sub r1, r1, #8 - ;sub r3, r3, #8 - - mov r12, #8 - -copy_mem8x8_fast_loop - ldmia r0, {r4-r5} - ;ldm r0, {r4-r5} - add r0, r0, r1 - - subs r12, r12, #1 - stmia r2, {r4-r5} - ;stm r2, {r4-r5} - add r2, r2, r3 - - bne copy_mem8x8_fast_loop - - ldmia sp!, {r4-r5} - ;pop {r4-r5} - mov pc, lr - - ENDP ; |vp9_copy_mem8x8_v6| - - END diff --git a/vp9/common/arm/armv6/dc_only_idct_add_v6.asm b/vp9/common/arm/armv6/dc_only_idct_add_v6.asm deleted file mode 100644 index e0660e9fd..000000000 --- a/vp9/common/arm/armv6/dc_only_idct_add_v6.asm +++ /dev/null @@ -1,67 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
-; - - EXPORT |vp8_dc_only_idct_add_v6| - - AREA |.text|, CODE, READONLY - -;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr, -; unsigned char *dst_ptr, int pitch, int stride) -; r0 input_dc -; r1 pred_ptr -; r2 dest_ptr -; r3 pitch -; sp stride - -|vp8_dc_only_idct_add_v6| PROC - stmdb sp!, {r4 - r7, lr} - - add r0, r0, #4 ; input_dc += 4 - ldr r12, c0x0000FFFF - ldr r4, [r1], r3 - ldr r6, [r1], r3 - and r0, r12, r0, asr #3 ; input_dc >> 3 + mask - ldr lr, [sp, #20] - orr r0, r0, r0, lsl #16 ; a1 | a1 - - uxtab16 r5, r0, r4 ; a1+2 | a1+0 - uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1 - uxtab16 r7, r0, r6 - uxtab16 r6, r0, r6, ror #8 - usat16 r5, #8, r5 - usat16 r4, #8, r4 - usat16 r7, #8, r7 - usat16 r6, #8, r6 - orr r5, r5, r4, lsl #8 - orr r7, r7, r6, lsl #8 - ldr r4, [r1], r3 - ldr r6, [r1] - str r5, [r2], lr - str r7, [r2], lr - - uxtab16 r5, r0, r4 - uxtab16 r4, r0, r4, ror #8 - uxtab16 r7, r0, r6 - uxtab16 r6, r0, r6, ror #8 - usat16 r5, #8, r5 - usat16 r4, #8, r4 - usat16 r7, #8, r7 - usat16 r6, #8, r6 - orr r5, r5, r4, lsl #8 - orr r7, r7, r6, lsl #8 - str r5, [r2], lr - str r7, [r2] - - ldmia sp!, {r4 - r7, pc} - - ENDP ; |vp8_dc_only_idct_add_v6| - -; Constant Pool -c0x0000FFFF DCD 0x0000FFFF - END diff --git a/vp9/common/arm/armv6/filter_v6.asm b/vp9/common/arm/armv6/filter_v6.asm deleted file mode 100644 index 16b321e37..000000000 --- a/vp9/common/arm/armv6/filter_v6.asm +++ /dev/null @@ -1,624 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_filter_block2d_first_pass_armv6| - EXPORT |vp9_filter_block2d_first_pass_16x16_armv6| - EXPORT |vp9_filter_block2d_first_pass_8x8_armv6| - EXPORT |vp9_filter_block2d_second_pass_armv6| - EXPORT |vp9_filter4_block2d_second_pass_armv6| - EXPORT |vp9_filter_block2d_first_pass_only_armv6| - EXPORT |vp9_filter_block2d_second_pass_only_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code -;------------------------------------- -; r0 unsigned char *src_ptr -; r1 short *output_ptr -; r2 unsigned int src_pixels_per_line -; r3 unsigned int output_width -; stack unsigned int output_height -; stack const short *vp9_filter -;------------------------------------- -; vp9_filter the input and put in the output array. Apply the 6 tap FIR filter with -; the output being a 2 byte value and the intput being a 1 byte value. 
-|vp9_filter_block2d_first_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r7, [sp, #36] ; output height - - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts - add r12, r3, #16 ; square off the output - sub sp, sp, #4 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r1, [sp] ; push destination to stack - mov r7, r7, lsl #16 ; height is top part of counter - -; six tap filter -|height_loop_1st_6| - ldrb r8, [r0, #-2] ; load source data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - orr r7, r7, r3, lsr #2 ; construct loop counter - -|width_loop_1st_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - - smuad lr, lr, r4 ; apply the filter - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - sub r7, r7, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r11, r10, r6, r8 - - ands r10, r7, #0xff ; test loop counter - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 - add r11, r11, #0x40 - ldrneb r9, [r0, #-1] - usat r11, #8, r11, asr #7 - - strh lr, [r1], r12 ; result is transposed and stored, which - ; will make second pass filtering easier. - ldrneb r10, [r0], #2 - strh r11, [r1], r12 - - bne width_loop_1st_6 - - ldr r1, [sp] ; load and update dst address - subs r7, r7, #0x10000 - add r0, r0, r2 ; move to next input line - - add r1, r1, #2 ; move over to next column - str r1, [sp] - - bne height_loop_1st_6 - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -; -------------------------- -; 16x16 version -; ----------------------------- -|vp9_filter_block2d_first_pass_16x16_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r7, [sp, #36] ; output height - - add r4, r2, #18 ; preload next low - pld [r0, r4] - - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts - add r12, r3, #16 ; square off the output - sub sp, sp, #4 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r1, [sp] ; push destination to stack - mov r7, r7, lsl #16 ; height is top part of counter - -; six tap filter -|height_loop_1st_16_6| - ldrb r8, [r0, #-2] ; load source data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - orr r7, r7, r3, lsr #2 ; construct loop counter - -|width_loop_1st_16_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - - smuad lr, lr, r4 ; apply the filter - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - sub r7, r7, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r11, r10, r6, r8 - - ands r10, r7, #0xff ; test loop counter - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] 
; load data for next loop - usat lr, #8, lr, asr #7 - add r11, r11, #0x40 - ldrneb r9, [r0, #-1] - usat r11, #8, r11, asr #7 - - strh lr, [r1], r12 ; result is transposed and stored, which - ; will make second pass filtering easier. - ldrneb r10, [r0], #2 - strh r11, [r1], r12 - - bne width_loop_1st_16_6 - - ldr r1, [sp] ; load and update dst address - subs r7, r7, #0x10000 - add r0, r0, r2 ; move to next input line - - add r11, r2, #34 ; adding back block width(=16) - pld [r0, r11] ; preload next low - - add r1, r1, #2 ; move over to next column - str r1, [sp] - - bne height_loop_1st_16_6 - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -; -------------------------- -; 8x8 version -; ----------------------------- -|vp9_filter_block2d_first_pass_8x8_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp9_filter address - ldr r7, [sp, #36] ; output height - - add r4, r2, #10 ; preload next low - pld [r0, r4] - - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts - add r12, r3, #16 ; square off the output - sub sp, sp, #4 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r1, [sp] ; push destination to stack - mov r7, r7, lsl #16 ; height is top part of counter - -; six tap filter -|height_loop_1st_8_6| - ldrb r8, [r0, #-2] ; load source data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - orr r7, r7, r3, lsr #2 ; construct loop counter - -|width_loop_1st_8_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - - smuad lr, lr, r4 ; apply the filter - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - sub r7, r7, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r11, r10, r6, r8 - - ands r10, r7, #0xff ; test loop counter - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 - add r11, r11, #0x40 - ldrneb r9, [r0, #-1] - usat r11, #8, r11, asr #7 - - strh lr, [r1], r12 ; result is transposed and stored, which - ; will make second pass filtering easier. 
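-; ldrneb r10 below completes the conditional reload of the three-byte source window (r8-r10) for the next pair of columns; the NE condition from the ands on the width count skips these loads once the row is done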
- ldrneb r10, [r0], #2 - strh r11, [r1], r12 - - bne width_loop_1st_8_6 - - ldr r1, [sp] ; load and update dst address - subs r7, r7, #0x10000 - add r0, r0, r2 ; move to next input line - - add r11, r2, #18 ; adding back block width(=8) - pld [r0, r11] ; preload next low - - add r1, r1, #2 ; move over to next column - str r1, [sp] - - bne height_loop_1st_8_6 - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -;--------------------------------- -; r0 short *src_ptr, -; r1 unsigned char *output_ptr, -; r2 unsigned int output_pitch, -; r3 unsigned int cnt, -; stack const short *vp9_filter -;--------------------------------- -|vp9_filter_block2d_second_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #36] ; vp9_filter address - sub sp, sp, #4 - mov r7, r3, lsl #16 ; height is top part of counter - str r1, [sp] ; push destination to stack - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - pkhbt r12, r5, r4 ; pack the filter differently - pkhbt r11, r6, r5 - - sub r0, r0, #4 ; offset input buffer - -|height_loop_2nd| - ldr r8, [r0] ; load the data - ldr r9, [r0, #4] - orr r7, r7, r3, lsr #1 ; loop counter - -|width_loop_2nd| - smuad lr, r4, r8 ; apply filter - sub r7, r7, #1 - smulbt r8, r4, r8 - - ldr r10, [r0, #8] - - smlad lr, r5, r9, lr - smladx r8, r12, r9, r8 - - ldrh r9, [r0, #12] - - smlad lr, r6, r10, lr - smladx r8, r11, r10, r8 - - add r0, r0, #4 - smlatb r10, r6, r9, r8 - - add lr, lr, #0x40 ; round_shift_and_clamp - ands r8, r7, #0xff - usat lr, #8, lr, asr #7 - add r10, r10, #0x40 - strb lr, [r1], r2 ; the result is transposed back and stored - usat r10, #8, r10, asr #7 - - ldrne r8, [r0] ; load data for next loop - ldrne r9, [r0, #4] - strb r10, [r1], r2 - - bne width_loop_2nd - - ldr r1, [sp] ; update dst for next loop - subs r7, r7, #0x10000 - add r0, r0, #16 ; updata src for next loop - add r1, r1, #1 - str r1, [sp] - - bne height_loop_2nd - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -;--------------------------------- -; r0 short *src_ptr, -; r1 unsigned char *output_ptr, -; r2 unsigned int output_pitch, -; r3 unsigned int cnt, -; stack const short *vp9_filter -;--------------------------------- -|vp9_filter4_block2d_second_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #36] ; vp9_filter address - mov r7, r3, lsl #16 ; height is top part of counter - - ldr r4, [r11] ; load up packed filter coefficients - add lr, r1, r3 ; save final destination pointer - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - pkhbt r12, r5, r4 ; pack the filter differently - pkhbt r11, r6, r5 - mov r4, #0x40 ; rounding factor (for smlad{x}) - -|height_loop_2nd_4| - ldrd r8, [r0, #-4] ; load the data - orr r7, r7, r3, lsr #1 ; loop counter - -|width_loop_2nd_4| - ldr r10, [r0, #4]! 
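-; r10 now holds the next word of first-pass output; the smlad/smladx pairs below apply the packed 4-tap coefficients in r12 and r11 to two output phases at once, with the 0x40 rounding constant pre-seeded through r4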
- smladx r6, r9, r12, r4 ; apply filter - pkhbt r8, r9, r8 - smlad r5, r8, r12, r4 - pkhbt r8, r10, r9 - smladx r6, r10, r11, r6 - sub r7, r7, #1 - smlad r5, r8, r11, r5 - - mov r8, r9 ; shift the data for the next loop - mov r9, r10 - - usat r6, #8, r6, asr #7 ; shift and clamp - usat r5, #8, r5, asr #7 - - strb r5, [r1], r2 ; the result is transposed back and stored - tst r7, #0xff - strb r6, [r1], r2 - - bne width_loop_2nd_4 - - subs r7, r7, #0x10000 - add r0, r0, #16 ; update src for next loop - sub r1, lr, r7, lsr #16 ; update dst for next loop - - bne height_loop_2nd_4 - - ldmia sp!, {r4 - r11, pc} - - ENDP - -;------------------------------------ -; r0 unsigned char *src_ptr -; r1 unsigned char *output_ptr, -; r2 unsigned int src_pixels_per_line -; r3 unsigned int cnt, -; stack unsigned int output_pitch, -; stack const short *vp9_filter -;------------------------------------ -|vp9_filter_block2d_first_pass_only_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - add r7, r2, r3 ; preload next low - add r7, r7, #2 - pld [r0, r7] - - ldr r4, [sp, #36] ; output pitch - ldr r11, [sp, #40] ; HFilter address - sub sp, sp, #8 - - mov r7, r3 - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - sub r4, r4, r3 - str r4, [sp] ; save modified output pitch - str r2, [sp, #4] - - mov r2, #0x40 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - -; six tap filter -|height_loop_1st_only_6| - ldrb r8, [r0, #-2] ; load data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - - mov r12, r3, lsr #1 ; loop counter - -|width_loop_1st_only_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - -;; smuad lr, lr, r4 - smlad lr, lr, r4, r2 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 -;; smuad r8, r8, r4 - smlad r8, r8, r4, r2 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - subs r12, r12, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r10, r10, r6, r8 - -;; add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 -;; add r10, r10, #0x40 - strb lr, [r1], #1 ; store the result - usat r10, #8, r10, asr #7 - - ldrneb r9, [r0, #-1] - strb r10, [r1], #1 - ldrneb r10, [r0], #2 - - bne width_loop_1st_only_6 - - ldr lr, [sp] ; load back output pitch - ldr r12, [sp, #4] ; load back output pitch - subs r7, r7, #1 - add r0, r0, r12 ; updata src for next loop - - add r11, r12, r3 ; preload next low - add r11, r11, #2 - pld [r0, r11] - - add r1, r1, lr ; update dst for next loop - - bne height_loop_1st_only_6 - - add sp, sp, #8 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_filter_block2d_first_pass_only_armv6| - - -;------------------------------------ -; r0 unsigned char *src_ptr, -; r1 unsigned char *output_ptr, -; r2 unsigned int src_pixels_per_line -; r3 unsigned int cnt, -; stack unsigned int output_pitch, -; stack const short *vp9_filter -;------------------------------------ -|vp9_filter_block2d_second_pass_only_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; VFilter address - ldr r12, [sp, #36] ; output pitch - - mov r7, r3, lsl #16 ; height is top part of counter - sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after - - sub sp, sp, #8 - - ldr r4, [r11] ; load up packed filter 
coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r0, [sp] ; save r0 to stack - str r1, [sp, #4] ; save dst to stack - -; six tap filter -|width_loop_2nd_only_6| - ldrb r8, [r0], r2 ; load data - orr r7, r7, r3 ; loop counter - ldrb r9, [r0], r2 - ldrb r10, [r0], r2 - -|height_loop_2nd_only_6| - ; filter first column in this inner loop, than, move to next colum. - ldrb r11, [r0], r2 - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0], r2 - - smuad lr, lr, r4 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0], r2 - smlad r8, r11, r5, r8 - ldrb r11, [r0] - - sub r7, r7, #2 - sub r0, r0, r2, lsl #2 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r10, r10, r6, r8 - - ands r9, r7, #0xff - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0], r2 ; load data for next loop - usat lr, #8, lr, asr #7 - add r10, r10, #0x40 - strb lr, [r1], r12 ; store the result for the column - usat r10, #8, r10, asr #7 - - ldrneb r9, [r0], r2 - strb r10, [r1], r12 - ldrneb r10, [r0], r2 - - bne height_loop_2nd_only_6 - - ldr r0, [sp] - ldr r1, [sp, #4] - subs r7, r7, #0x10000 - add r0, r0, #1 ; move to filter next column - str r0, [sp] - add r1, r1, #1 - str r1, [sp, #4] - - bne width_loop_2nd_only_6 - - add sp, sp, #8 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_filter_block2d_second_pass_only_armv6| - - END diff --git a/vp9/common/arm/armv6/idct_v6.asm b/vp9/common/arm/armv6/idct_v6.asm deleted file mode 100644 index 27215afcd..000000000 --- a/vp9/common/arm/armv6/idct_v6.asm +++ /dev/null @@ -1,345 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14 - EXPORT |vp8_short_idct4x4llm_1_v6| - EXPORT |vp8_short_idct4x4llm_v6| - EXPORT |vp8_short_idct4x4llm_v6_scott| - EXPORT |vp8_short_idct4x4llm_v6_dual| - - AREA |.text|, CODE, READONLY - -;******************************************************************************** -;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: 3/5 -;******************************************************************************** - -|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit - ; - ldrsh r0, [r0] ; load input[0] 1, r0 un 2 - add r0, r0, #4 ; 1 +4 - stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup - mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3 - pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack - mov r5, r4 ; expand expand - - strd r4, [r1], r2 ; *output = r0, post inc 1 - strd r4, [r1], r2 ; 1 - strd r4, [r1], r2 ; 1 - strd r4, [r1] ; 1 - ; - ldmia sp!, {r4, r5, pc} ; replace vars, return restore - ENDP ; |vp8_short_idct4x4llm_1_v6| -;******************************************************************************** -;******************************************************************************** -;******************************************************************************** - -;******************************************************************************** -;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: -;******************************************************************************** - -|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit - ; - stmdb sp!, {r4-r11, lr} ; backup registers 1 backup - ; - mov r4, #0x00004E00 ; 1 cst - orr r4, r4, #0x0000007B ; cospi8sqrt2minus1 - mov r5, #0x00008A00 ; 1 cst - orr r5, r5, #0x0000008C ; sinpi8sqrt2 - ; - mov r6, #4 ; i=4 1 i -loop1 ; - ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4] - ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12] - ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8] - ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0] - smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1 - smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2 - add r9, r7, r8 ; a1 = [0] + [8] 1 a1 - sub r7, r7, r8 ; b1 = [0] - [8] 1 b1 - add r11, r3, r11 ; temp2 1 - rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1 - smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2 - smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1 - add r8, r7, r11 ; b1 + c1 1 b+c - strh r8, [r1, r2] ; out[pitch] = b1+c1 1 - sub r7, r7, r11 ; b1 - c1 1 b-c - add r10, r12, r10 ; temp1 1 - add r3, r10, r3 ; d1 = temp1 + temp2 1 d1 - add r10, r9, r3 ; a1 + d1 1 a+d - sub r3, r9, r3 ; a1 - d1 1 a-d - add r8, r2, r2 ; pitch * 2 1 p*2 - strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1 - add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3 - strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1 - subs r6, r6, #1 ; i-- 1 -- - strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++ - bne loop1 ; if i>0, continue - ; - sub r1, r1, #8 ; set up out for next loop 1 -4 - ; for this iteration, input=prev output - mov r6, #4 ; i=4 1 i -; b returnfull -loop2 ; - ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1] - ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3] - ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2] - ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0] - smulwb r9, r5, r11 ; ([1] 
* sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1 - smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2 - add r7, r0, r3 ; a1 = [0] + [2] 1 a1 - sub r0, r0, r3 ; b1 = [0] - [2] 1 b1 - add r10, r8, r10 ; temp2 1 - rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1 - smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2 - smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1 - add r3, r0, r9 ; b1+c1 1 b+c - add r3, r3, #4 ; b1+c1+4 1 +4 - add r10, r11, r10 ; temp1 1 - mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3 - strh r3, [r1, #2] ; out[1] = b1+c1 1 - add r10, r10, r8 ; d1 = temp1 + temp2 1 d1 - add r3, r7, r10 ; a1+d1 1 a+d - add r3, r3, #4 ; a1+d1+4 1 +4 - sub r7, r7, r10 ; a1-d1 1 a-d - add r7, r7, #4 ; a1-d1+4 1 +4 - mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3 - mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3 - strh r7, [r1, #6] ; out[3] = a1-d1 1 - sub r0, r0, r9 ; b1-c1 1 b-c - add r0, r0, #4 ; b1-c1+4 1 +4 - subs r6, r6, #1 ; i-- 1 -- - mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3 - strh r0, [r1, #4] ; out[2] = b1-c1 1 - strh r3, [r1], r2 ; out[0] = a1+d1 1 -; add r1, r1, r2 ; out += pitch 1 ++ - bne loop2 ; if i>0, continue -returnfull ; - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore - ENDP - -;******************************************************************************** -;******************************************************************************** -;******************************************************************************** - -;******************************************************************************** -;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: -;******************************************************************************** - -|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit -; mov r0, #0 ; -; ldr r0, [r0] ; - stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup - ; - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - ; - mov r5, #0x2 ; i i - ; -short_idct4x4llm_v6_scott_loop1 ; - ldr r10, [r0, #(4*2)] ; i5 | i4 5,4 - ldr r11, [r0, #(12*2)] ; i13 | i12 13,12 - ; - smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1 - smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2 - ; - smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2 - smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1 - ; - add r6, r6, r7 ; partial c1 lt1-lt2 - add r12, r12, r14 ; partial d1 l2t2+l2t1 - ; - smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1 - smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2 - ; - smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1 - smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2 - ; - add r7, r14, r7 ; partial c1_2 ht1+ht2 - sub r8, r8, r9 ; partial d1_2 h2t1-h2t2 - ; - pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack - pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack - ; - usub16 r6, r6, r10 ; c1_2 | c1_1 c - uadd16 r12, r12, r11 ; d1_2 | d1_1 d - ; - ldr r10, [r0, #0] ; i1 | i0 1,0 - ldr r11, [r0, #(8*2)] ; i9 | i10 9,10 - ; -;;;;;; add r0, r0, #0x4 ; +4 -;;;;;; add r1, r1, #0x4 ; +4 - ; - uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a - usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b - ; - uadd16 r7, r8, r12 ; a1 + d1 pair a+d - usub16 r14, r8, r12 ; a1 - d1 pair a-d - ; - str r7, [r1] ; op[0] = 
a1 + d1 - str r14, [r1, r2] ; op[pitch*3] = a1 - d1 - ; - add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++ - add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++ - ; - subs r5, r5, #0x1 ; -- - bne short_idct4x4llm_v6_scott_loop1 ; - ; - sub r1, r1, #16 ; reset output ptr - mov r5, #0x4 ; - mov r0, r1 ; input = output - ; -short_idct4x4llm_v6_scott_loop2 ; - ; - subs r5, r5, #0x1 ; - bne short_idct4x4llm_v6_scott_loop2 ; - ; - ldmia sp!, {r4 - r11, pc} ; - ENDP ; - ; -;******************************************************************************** -;******************************************************************************** -;******************************************************************************** - -;******************************************************************************** -;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch) -;* r0 INT16 * input -;* r1 INT16 * output -;* r2 INT32 pitch -;* bench: -;******************************************************************************** - -|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit - ; - stmdb sp!, {r4-r11, lr} ; backup registers 1 backup - mov r3, #0x00004E00 ; cos - orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - mov r5, #0x2 ; i=2 i -loop1_dual - ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 - ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 - ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 - - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s - pkhbt r7, r7, r9, lsl #16 ; 5c | 4c - smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c - pkhbt r8, r8, r10, lsl #16 ; 5s | 4s - uadd16 r6, r6, r7 ; 5c+5 | 4c+4 - smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s - smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c - smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s - subs r5, r5, #0x1 ; i-- -- - pkhbt r9, r9, r11, lsl #16 ; 13c | 12c - ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 - pkhbt r10, r10, r7, lsl #16 ; 13s | 12s - uadd16 r7, r12, r9 ; 13c+13 | 12c+12 - usub16 r7, r8, r7 ; c c - uadd16 r6, r6, r10 ; d d - uadd16 r10, r11, r14 ; a a - usub16 r8, r11, r14 ; b b - uadd16 r9, r10, r6 ; a+d a+d - usub16 r10, r10, r6 ; a-d a-d - uadd16 r6, r8, r7 ; b+c b+c - usub16 r7, r8, r7 ; b-c b-c - str r6, [r1, r2] ; o5 | o4 - add r6, r2, r2 ; pitch * 2 p2 - str r7, [r1, r6] ; o9 | o8 - add r6, r6, r2 ; pitch * 3 p3 - str r10, [r1, r6] ; o13 | o12 - str r9, [r1], #0x4 ; o1 | o0 ++ - bne loop1_dual ; - mov r5, #0x2 ; i=2 i - sub r0, r1, #8 ; reset input/output i/o -loop2_dual - ldr r6, [r0, r2] ; i5 | i4 5|4 - ldr r1, [r0] ; i1 | i0 1|0 - ldr r12, [r0, #0x4] ; i3 | i2 3|2 - add r14, r2, #0x4 ; pitch + 2 p+2 - ldr r14, [r0, r14] ; i7 | i6 7|6 - smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c - smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s - smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s - pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 - pkhbt r7, r9, r7, lsl #16 ; 1c | 5c - pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1 - pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 - uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 - pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 - uadd16 r10, r11, r9 ; a a - usub16 r9, r11, r9 ; b b - pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 - subs r5, r5, #0x1 ; i-- -- - smulwt r7, r3, r6 ; (ip[3] * 
cospi8sqrt2minus1) >> 16 3c - smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s - smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c - smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s - - pkhbt r7, r12, r7, lsl #16 ; 3c | 7c - pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 - uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 - usub16 r12, r8, r6 ; c (o1 | o5) c - uadd16 r6, r11, r1 ; d (o3 | o7) d - uadd16 r7, r10, r6 ; a+d a+d - mov r8, #0x4 ; set up 4's 4 - orr r8, r8, #0x40000 ; 4|4 - usub16 r6, r10, r6 ; a-d a-d - uadd16 r6, r6, r8 ; a-d+4 3|7 - uadd16 r7, r7, r8 ; a+d+4 0|4 - uadd16 r10, r9, r12 ; b+c b+c - usub16 r1, r9, r12 ; b-c b-c - uadd16 r10, r10, r8 ; b+c+4 1|5 - uadd16 r1, r1, r8 ; b-c+4 2|6 - mov r8, r10, asr #19 ; o1 >> 3 - strh r8, [r0, #2] ; o1 - mov r8, r1, asr #19 ; o2 >> 3 - strh r8, [r0, #4] ; o2 - mov r8, r6, asr #19 ; o3 >> 3 - strh r8, [r0, #6] ; o3 - mov r8, r7, asr #19 ; o0 >> 3 - strh r8, [r0], r2 ; o0 +p - sxth r10, r10 ; - mov r8, r10, asr #3 ; o5 >> 3 - strh r8, [r0, #2] ; o5 - sxth r1, r1 ; - mov r8, r1, asr #3 ; o6 >> 3 - strh r8, [r0, #4] ; o6 - sxth r6, r6 ; - mov r8, r6, asr #3 ; o7 >> 3 - strh r8, [r0, #6] ; o7 - sxth r7, r7 ; - mov r8, r7, asr #3 ; o4 >> 3 - strh r8, [r0], r2 ; o4 +p -;;;;; subs r5, r5, #0x1 ; i-- -- - bne loop2_dual ; - ; - ldmia sp!, {r4 - r11, pc} ; replace vars, return restore - ENDP - - END diff --git a/vp9/common/arm/armv6/iwalsh_v6.asm b/vp9/common/arm/armv6/iwalsh_v6.asm deleted file mode 100644 index 463bff0f5..000000000 --- a/vp9/common/arm/armv6/iwalsh_v6.asm +++ /dev/null @@ -1,152 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - EXPORT |vp8_short_inv_walsh4x4_v6| - EXPORT |vp8_short_inv_walsh4x4_1_v6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_inv_walsh4x4_v6(short *input, short *output) -|vp8_short_inv_walsh4x4_v6| PROC - - stmdb sp!, {r4 - r11, lr} - - ldr r2, [r0], #4 ; [1 | 0] - ldr r3, [r0], #4 ; [3 | 2] - ldr r4, [r0], #4 ; [5 | 4] - ldr r5, [r0], #4 ; [7 | 6] - ldr r6, [r0], #4 ; [9 | 8] - ldr r7, [r0], #4 ; [11 | 10] - ldr r8, [r0], #4 ; [13 | 12] - ldr r9, [r0] ; [15 | 14] - - qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] - qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] - qsub16 r12, r4, r6 ; c1 [5-9 | 4-8] - qsub16 lr, r2, r8 ; d1 [1-13 | 0-12] - - qadd16 r2, r10, r11 ; a1 + b1 [1 | 0] - qadd16 r4, r12, lr ; c1 + d1 [5 | 4] - qsub16 r6, r10, r11 ; a1 - b1 [9 | 8] - qsub16 r8, lr, r12 ; d1 - c1 [13 | 12] - - qadd16 r10, r3, r9 ; a1 [3+15 | 2+14] - qadd16 r11, r5, r7 ; b1 [7+11 | 6+10] - qsub16 r12, r5, r7 ; c1 [7-11 | 6-10] - qsub16 lr, r3, r9 ; d1 [3-15 | 2-14] - - qadd16 r3, r10, r11 ; a1 + b1 [3 | 2] - qadd16 r5, r12, lr ; c1 + d1 [7 | 6] - qsub16 r7, r10, r11 ; a1 - b1 [11 | 10] - qsub16 r9, lr, r12 ; d1 - c1 [15 | 14] - - ; first transform complete - - qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] - qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] - qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] - qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] - - qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1] - ldr r10, c0x00030003 - qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1] - - qadd16 r2, r2, r10 ; [b2+3|c2+3] - qadd16 r3, r3, r10 ; [a2+3|d2+3] - qadd16 r4, r4, r10 ; [b2+3|c2+3] - qadd16 r5, r5, r10 ; [a2+3|d2+3] - - asr r12, r2, #3 ; [1 | x] - pkhtb r12, r12, r3, asr #19; [1 | 0] - lsl lr, r3, #16 ; [~3 | x] - lsl r2, r2, #16 ; [~2 | x] - asr lr, lr, #3 ; [3 | x] - pkhtb lr, lr, r2, asr #19 ; [3 | 2] - - asr r2, r4, #3 ; [5 | x] - pkhtb r2, r2, r5, asr #19 ; [5 | 4] - lsl r3, r5, #16 ; [~7 | x] - lsl r4, r4, #16 ; [~6 | x] - asr r3, r3, #3 ; [7 | x] - pkhtb r3, r3, r4, asr #19 ; [7 | 6] - - str r12, [r1], #4 - str lr, [r1], #4 - str r2, [r1], #4 - str r3, [r1], #4 - - qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] - qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] - qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15] - qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15] - - qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1] - qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1] - - qadd16 r6, r6, r10 ; [b2+3|c2+3] - qadd16 r7, r7, r10 ; [a2+3|d2+3] - qadd16 r8, r8, r10 ; [b2+3|c2+3] - qadd16 r9, r9, r10 ; [a2+3|d2+3] - - asr r2, r6, #3 ; [9 | x] - pkhtb r2, r2, r7, asr #19 ; [9 | 8] - lsl r3, r7, #16 ; [~11| x] - lsl r4, r6, #16 ; [~10| x] - asr r3, r3, #3 ; [11 | x] - pkhtb r3, r3, r4, asr #19 ; [11 | 10] - - asr r4, r8, #3 ; [13 | x] - pkhtb r4, r4, r9, asr #19 ; [13 | 12] - lsl r5, r9, #16 ; [~15| x] - lsl r6, r8, #16 ; [~14| x] - asr r5, r5, #3 ; [15 | x] - pkhtb r5, r5, r6, asr #19 ; [15 | 14] - - str r2, [r1], #4 - str r3, [r1], #4 - str r4, [r1], #4 - str r5, [r1] - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_short_inv_walsh4x4_v6| - - -;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output) -|vp8_short_inv_walsh4x4_1_v6| PROC - - ldrsh r2, [r0] ; [0] - add r2, r2, #3 ; [0] + 3 - asr r2, r2, #3 ; a1 ([0]+3) >> 3 - lsl r2, r2, #16 ; [a1 | x] - orr r2, r2, r2, lsr #16 ; [a1 | a1] - - str r2, [r1], #4 
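-; seven more word stores follow: all 16 output coefficients receive the same replicated DC pair (a1 | a1)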
- str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1], #4 - str r2, [r1] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_v6| - -; Constant Pool -c0x00030003 DCD 0x00030003 - END diff --git a/vp9/common/arm/armv6/loopfilter_v6.asm b/vp9/common/arm/armv6/loopfilter_v6.asm deleted file mode 100644 index 37b54a39c..000000000 --- a/vp9/common/arm/armv6/loopfilter_v6.asm +++ /dev/null @@ -1,1282 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_loop_filter_horizontal_edge_armv6| - EXPORT |vp9_mbloop_filter_horizontal_edge_armv6| - EXPORT |vp9_loop_filter_vertical_edge_armv6| - EXPORT |vp9_mbloop_filter_vertical_edge_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - - MACRO - TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 - ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 - ; a0: 03 02 01 00 - ; a1: 13 12 11 10 - ; a2: 23 22 21 20 - ; a3: 33 32 31 30 - ; b3 b2 b1 b0 - - uxtb16 $b1, $a1 ; xx 12 xx 10 - uxtb16 $b0, $a0 ; xx 02 xx 00 - uxtb16 $b3, $a3 ; xx 32 xx 30 - uxtb16 $b2, $a2 ; xx 22 xx 20 - orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 - orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 - - uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 - uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 - uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 - uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 - orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 - orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 - - pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 - pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 - - pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 - pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 - MEND - - -src RN r0 -pstep RN r1 -count RN r5 - -;r0 unsigned char *src_ptr, -;r1 int src_pixel_step, -;r2 const char *blimit, -;r3 const char *limit, -;stack const char *thresh, -;stack int count - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp9_loop_filter_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r6, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r9, [src], pstep ; p3 - ldrb r4, [r2] ; blimit - ldr r10, [src], pstep ; p2 - ldrb r2, [r3] ; limit - ldr r11, [src], pstep ; p1 - orr r4, r4, r4, lsl #8 - ldrb r3, [r6] ; thresh - orr r2, r2, r2, lsl #8 - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|Hnext8| - ; vp9_filter_mask() function - ; calculate breakout conditions - ldr r12, [src], pstep ; p0 - - uqsub8 r6, r9, r10 ; p3 - p2 - uqsub8 r7, r10, r9 ; p2 - p3 - uqsub8 r8, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - - orr r6, r6, r7 ; abs (p3-p2) - orr r8, r8, r10 ; abs (p2-p1) - uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp9_filter_mask - uqsub8 r8, r8, r2 ; compare to limit - uqsub8 r6, r11, r12 ; p1 - p0 - orr lr, lr, r8 - uqsub8 r7, r12, r11 ; p0 - p1 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - orr r6, r6, r7 ; abs (p1-p0) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later - orr lr, lr, r7 - - uqsub8 r6, r11, r10 ; p1 - q1 - uqsub8 r7, r10, r11 ; q1 - p1 - uqsub8 r11, r12, r9 ; p0 - q0 - uqsub8 r12, r9, r12 ; q0 - p0 - orr r6, r6, r7 ; abs (p1-q1) - ldr r7, c0x7F7F7F7F - orr r12, r11, r12 ; abs (p0-q0) - ldr r11, [src], pstep ; q2 - uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 - and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r7, r9, r10 ; q0 - q1 - uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r6, r10, r9 ; q1 - q0 - uqsub8 r12, r12, r4 ; compare to flimit - uqsub8 r9, r11, r10 ; q2 - q1 - - orr lr, lr, r12 - - ldr r12, [src], pstep ; q3 - uqsub8 r10, r10, r11 ; q1 - q2 - orr r6, r7, r6 ; abs (q1-q0) - orr r10, r9, r10 ; abs (q2-q1) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r10, r10, r2 ; compare to limit - uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later - orr lr, lr, r7 - orr lr, lr, r10 - - uqsub8 r10, r12, r11 ; q3 - q2 - uqsub8 r9, r11, r12 ; q2 - q3 - - mvn r11, #0 ; r11 == -1 - - orr r10, r10, r9 ; abs (q3-q2) - uqsub8 r10, r10, r2 ; compare to limit - - mov r12, #0 - orr lr, lr, r10 - sub src, src, pstep, lsl #2 - - usub8 lr, r12, lr ; use usub8 instead of ssub8 - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq hskip_filter ; skip filtering - - sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines - - ;vp8_hevmask() function - ;calculate high edge variance - orr r10, r6, r8 ; calculate vp8_hevmask - - ldr r7, [src], pstep ; p1 - - usub8 r10, r12, r10 ; use usub8 instead of ssub8 - sel r6, r12, r11 ; obtain vp8_hevmask: r6 - - ;vp9_filter() function - ldr r8, [src], pstep ; p0 - ldr r12, c0x80808080 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - - eor r7, r7, r12 ; p1 offset to convert to a signed value - eor r8, r8, r12 ; p0 offset to convert to a signed value - eor r9, r9, r12 ; q0 offset to convert to a signed value - eor r10, r10, r12 ; q1 offset to convert to a signed value - - str r9, [sp] ; store qs0 temporarily - str r8, [sp, #4] ; store ps0 temporarily - str r10, [sp, #8] ; store qs1 temporarily - str r7, [sp, #12] ; store ps1 temporarily - - qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) - qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - - and r7, r7, r6 ; vp9_filter (r7) &= hev - - qadd8 r7, r7, r8 - ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 - - qadd8 r7, r7, r8 - ldr r10, c0x04040404 - - qadd8 r7, r7, r8 - and r7, r7, lr ; vp9_filter &= mask; - - ;modify code for vp8 -- Filter1 = vp9_filter (r7) - qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3) - qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4) - - mov r9, #0 - shadd8 r8 , r8 , r9 ; Filter2 >>= 3 - shadd8 r7 , r7 , r9 ; vp9_filter >>= 3 - shadd8 r8 , r8 , r9 - shadd8 r7 , r7 , r9 - shadd8 lr , r8 , r9 ; lr: Filter2 - shadd8 r7 , r7 , r9 ; r7: filter - - ;usub8 lr, r8, r10 ; s = (s==4)*-1 - ;sel lr, r11, r9 - ;usub8 r8, r10, r8 - ;sel r8, r11, r9 - ;and r8, r8, lr ; -1 for each element that equals 4 - - ;calculate output - ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter) - - ldr r8, [sp] ; load qs0 - ldr r9, [sp, #4] ; load ps0 - - ldr r10, c0x01010101 - - qsub8 r8 ,r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter) - 
qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2) - - ;end of modification for vp8 - - mov lr, #0 - sadd8 r7, r7 , r10 ; vp9_filter += 1 - shadd8 r7, r7, lr ; vp9_filter >>= 1 - - ldr r11, [sp, #12] ; load ps1 - ldr r10, [sp, #8] ; load qs1 - - bic r7, r7, r6 ; vp9_filter &= ~hev - sub src, src, pstep, lsl #2 - - qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter) - qsub8 r10, r10,r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter) - - eor r11, r11, r12 ; *op1 = u^0x80 - str r11, [src], pstep ; store op1 - eor r9, r9, r12 ; *op0 = u^0x80 - str r9, [src], pstep ; store op0 result - eor r8, r8, r12 ; *oq0 = u^0x80 - str r8, [src], pstep ; store oq0 result - eor r10, r10, r12 ; *oq1 = u^0x80 - str r10, [src], pstep ; store oq1 - - sub src, src, pstep, lsl #1 - -|hskip_filter| - add src, src, #4 - sub src, src, pstep, lsl #2 - - subs count, count, #1 - - ldrne r9, [src], pstep ; p3 - ldrne r10, [src], pstep ; p2 - ldrne r11, [src], pstep ; p1 - - bne Hnext8 - - add sp, sp, #16 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_loop_filter_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_mbloop_filter_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r6, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r9, [src], pstep ; p3 - ldrb r4, [r2] ; blimit - ldr r10, [src], pstep ; p2 - ldrb r2, [r3] ; limit - ldr r11, [src], pstep ; p1 - orr r4, r4, r4, lsl #8 - ldrb r3, [r6] ; thresh - orr r2, r2, r2, lsl #8 - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|MBHnext8| - - ; vp9_filter_mask() function - ; calculate breakout conditions - ldr r12, [src], pstep ; p0 - - uqsub8 r6, r9, r10 ; p3 - p2 - uqsub8 r7, r10, r9 ; p2 - p3 - uqsub8 r8, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - - orr r6, r6, r7 ; abs (p3-p2) - orr r8, r8, r10 ; abs (p2-p1) - uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp9_filter_mask - uqsub8 r8, r8, r2 ; compare to limit - - uqsub8 r6, r11, r12 ; p1 - p0 - orr lr, lr, r8 - uqsub8 r7, r12, r11 ; p0 - p1 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - orr r6, r6, r7 ; abs (p1-p0) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later - orr lr, lr, r7 - - uqsub8 r6, r11, r10 ; p1 - q1 - uqsub8 r7, r10, r11 ; q1 - p1 - uqsub8 r11, r12, r9 ; p0 - q0 - uqsub8 r12, r9, r12 ; q0 - p0 - orr r6, r6, r7 ; abs (p1-q1) - ldr r7, c0x7F7F7F7F - orr r12, r11, r12 ; abs (p0-q0) - ldr r11, [src], pstep ; q2 - uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 - and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r7, r9, r10 ; q0 - q1 - uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r6, r10, r9 ; q1 - q0 - uqsub8 r12, r12, r4 ; compare to flimit - uqsub8 r9, r11, r10 ; q2 - q1 - - orr lr, lr, r12 - - ldr r12, [src], pstep ; q3 - - uqsub8 r10, r10, r11 ; q1 - q2 - orr r6, r7, r6 ; abs (q1-q0) - orr r10, r9, r10 ; abs (q2-q1) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r10, r10, r2 ; compare to limit - uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later - orr lr, lr, r7 - orr lr, lr, r10 - - uqsub8 r10, r12, r11 ; q3 - q2 - uqsub8 r9, r11, r12 ; q2 - q3 - - mvn r11, #0 ; r11 == -1 - - orr r10, r10, r9 ; abs (q3-q2) - uqsub8 r10, r10, r2 ; compare to limit - - mov r12, #0 - - orr lr, lr, r10 - - usub8 lr, r12, lr ; use usub8 instead of ssub8 - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq mbhskip_filter ; skip filtering - - ;vp8_hevmask() function - ;calculate high edge variance - sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines - sub src, src, pstep, lsl #1 - - orr r10, r6, r8 - ldr r7, [src], pstep ; p1 - - usub8 r10, r12, r10 - sel r6, r12, r11 ; hev mask: r6 - - ;vp8_mbfilter() function - ;p2, q2 are only needed at the end. Don't need to load them in now. 
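-; the code below builds the clamped filter value, then spreads the correction across the edge: u = vp9_signed_char_clamp((63 + Filter2 * 27) >> 7) at p0/q0, Filter2 * 18 at p1/q1, and Filter2 * 9 at p2/q2 -- roughly 3/7, 2/7 and 1/7 of the filter value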
- ldr r8, [src], pstep ; p0 - ldr r12, c0x80808080 - ldr r9, [src], pstep ; q0 - ldr r10, [src] ; q1 - - eor r7, r7, r12 ; ps1 - eor r8, r8, r12 ; ps0 - eor r9, r9, r12 ; qs0 - eor r10, r10, r12 ; qs1 - - qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - str r7, [sp, #12] ; store ps1 temporarily - qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) - str r10, [sp, #8] ; store qs1 temporarily - qadd8 r7, r7, r12 - str r9, [sp] ; store qs0 temporarily - qadd8 r7, r7, r12 - str r8, [sp, #4] ; store ps0 temporarily - qadd8 r7, r7, r12 ; vp9_filter: r7 - - ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 - ldr r9, c0x04040404 - - and r7, r7, lr ; vp9_filter &= mask (lr is free) - - mov r12, r7 ; Filter2: r12 - and r12, r12, r6 ; Filter2 &= hev - - ;modify code for vp8 - ;save bottom 3 bits so that we round one side +4 and the other +3 - qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4) - qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3) - - mov r10, #0 - shadd8 r8 , r8 , r10 ; Filter1 >>= 3 - shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - shadd8 r8 , r8 , r10 - shadd8 r12 , r12 , r10 - shadd8 r8 , r8 , r10 ; r8: Filter1 - shadd8 r12 , r12 , r10 ; r12: Filter2 - - ldr r9, [sp] ; load qs0 - ldr r11, [sp, #4] ; load ps0 - - qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1) - qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2) - - ;save bottom 3 bits so that we round one side +4 and the other +3 - ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) - ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4) - ;mov r10, #0 - ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - ;usub8 lr, r8, r9 ; s = (s==4)*-1 - ;sel lr, r11, r10 - ;shadd8 r12 , r12 , r10 - ;usub8 r8, r9, r8 - ;sel r8, r11, r10 - ;ldr r9, [sp] ; load qs0 - ;ldr r11, [sp, #4] ; load ps0 - ;shadd8 r12 , r12 , r10 - ;and r8, r8, lr ; -1 for each element that equals 4 - ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2) - ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2) - ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u) - - ;end of modification for vp8 - - bic r12, r7, r6 ; vp9_filter &= ~hev ( r6 is free) - ;mov r12, r7 - - ;roughly 3/7th difference across boundary - mov lr, #0x1b ; 27 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r7, r10, lr, r7 - smultb r10, r10, lr - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - add r10, r10, #63 - ssat r7, #8, r7, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r7, r10, lsl #16 - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7) - - qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u) - qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u) - eor r8, r8, lr ; *oq0 = s^0x80 - str r8, [src] ; store *oq0 - sub src, src, pstep - eor r10, r10, lr ; *op0 = s^0x80 - str r10, [src] ; store *op0 - - ;roughly 2/7th difference across boundary - mov lr, #0x12 ; 18 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r9, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r9, #8, r9, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r9, r10, lsl #16 - - ldr r9, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load 
ps1 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7) - - qadd8 r11, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u) - qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u) - eor r11, r11, lr ; *op1 = s^0x80 - str r11, [src], pstep ; store *op1 - eor r8, r8, lr ; *oq1 = s^0x80 - add src, src, pstep, lsl #1 - - mov r7, #0x3f ; 63 - - str r8, [src], pstep ; store *oq1 - - ;roughly 1/7th difference across boundary - mov lr, #0x9 ; 9 - ldr r9, [src] ; load q2 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r12, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r12, #8, r12, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r12, r10, lsl #16 - - sub src, src, pstep - ldr lr, c0x80808080 - - ldr r11, [src] ; load p2 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - eor r9, r9, lr - eor r11, r11, lr - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7) - - qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u) - qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u) - eor r8, r8, lr ; *op2 = s^0x80 - str r8, [src], pstep, lsl #2 ; store *op2 - add src, src, pstep - eor r10, r10, lr ; *oq2 = s^0x80 - str r10, [src], pstep, lsl #1 ; store *oq2 - -|mbhskip_filter| - add src, src, #4 - sub src, src, pstep, lsl #3 - subs count, count, #1 - - ldrne r9, [src], pstep ; p3 - ldrne r10, [src], pstep ; p2 - ldrne r11, [src], pstep ; p1 - - bne MBHnext8 - - add sp, sp, #16 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp9_loop_filter_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, #4 ; move src pointer down by 4 - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r12, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r6, [src], pstep ; load source data - ldrb r4, [r2] ; blimit - ldr r7, [src], pstep - ldrb r2, [r3] ; limit - ldr r8, [src], pstep - orr r4, r4, r4, lsl #8 - ldrb r3, [r12] ; thresh - orr r2, r2, r2, lsl #8 - ldr lr, [src], pstep - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|Vnext8| - - ; vp9_filter_mask() function - ; calculate breakout conditions - ; transpose the source data for 4-in-parallel operation - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - uqsub8 r7, r9, r10 ; p3 - p2 - uqsub8 r8, r10, r9 ; p2 - p3 - uqsub8 r9, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - orr r7, r7, r8 ; abs (p3-p2) - orr r10, r9, r10 ; abs (p2-p1) - uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask - uqsub8 r10, r10, r2 ; compare to limit - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr lr, lr, r10 - - uqsub8 r6, r11, r12 ; p1 - p0 - uqsub8 r7, r12, r11 ; p0 - p1 - add src, src, #4 ; move src pointer up by 4 - orr r6, r6, r7 ; abs (p1-p0) - str r11, [sp, #12] ; save p1 - uqsub8 r10, r6, r2 ; compare to limit - uqsub8 r11, r6, r3 ; compare to thresh - orr lr, lr, r10 - - ; transpose uses 8 regs(r6 - r12 and lr). 
Need to save reg value now - ; transpose the source data for 4-in-parallel operation - ldr r6, [src], pstep ; load source data - str r11, [sp] ; push r11 to stack - ldr r7, [src], pstep - str r12, [sp, #4] ; save current reg before load q0 - q3 data - ldr r8, [src], pstep - str lr, [sp, #8] - ldr lr, [src], pstep - - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - ldr lr, [sp, #8] ; load back (f)limit accumulator - - uqsub8 r6, r12, r11 ; q3 - q2 - uqsub8 r7, r11, r12 ; q2 - q3 - uqsub8 r12, r11, r10 ; q2 - q1 - uqsub8 r11, r10, r11 ; q1 - q2 - orr r6, r6, r7 ; abs (q3-q2) - orr r7, r12, r11 ; abs (q2-q1) - uqsub8 r6, r6, r2 ; compare to limit - uqsub8 r7, r7, r2 ; compare to limit - ldr r11, [sp, #4] ; load back p0 - ldr r12, [sp, #12] ; load back p1 - orr lr, lr, r6 - orr lr, lr, r7 - - uqsub8 r6, r11, r9 ; p0 - q0 - uqsub8 r7, r9, r11 ; q0 - p0 - uqsub8 r8, r12, r10 ; p1 - q1 - uqsub8 r11, r10, r12 ; q1 - p1 - orr r6, r6, r7 ; abs (p0-q0) - ldr r7, c0x7F7F7F7F - orr r8, r8, r11 ; abs (p1-q1) - uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 - and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r11, r10, r9 ; q1 - q0 - uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r12, r9, r10 ; q0 - q1 - uqsub8 r6, r6, r4 ; compare to flimit - - orr r9, r11, r12 ; abs (q1-q0) - uqsub8 r8, r9, r2 ; compare to limit - uqsub8 r10, r9, r3 ; compare to thresh - orr lr, lr, r6 - orr lr, lr, r8 - - mvn r11, #0 ; r11 == -1 - mov r12, #0 - - usub8 lr, r12, lr - ldr r9, [sp] ; load the compared result - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq vskip_filter ; skip filtering - - ;vp8_hevmask() function - ;calculate high edge variance - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r9, r9, r10 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - usub8 r9, r12, r9 - sel r6, r12, r11 ; hev mask: r6 - - ;vp9_filter() function - ; load soure data to r6, r11, r12, lr - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - pkhbt r12, r7, r8, lsl #16 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - pkhbt r11, r9, r10, lsl #16 - - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - ; Transpose needs 8 regs(r6 - r12, and lr). 
Save r6 and lr first - str r6, [sp] - str lr, [sp, #4] - - pkhbt r6, r7, r8, lsl #16 - pkhbt lr, r9, r10, lsl #16 - - ;transpose r12, r11, r6, lr to r7, r8, r9, r10 - TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 - - ;load back hev_mask r6 and filter_mask lr - ldr r12, c0x80808080 - ldr r6, [sp] - ldr lr, [sp, #4] - - eor r7, r7, r12 ; p1 offset to convert to a signed value - eor r8, r8, r12 ; p0 offset to convert to a signed value - eor r9, r9, r12 ; q0 offset to convert to a signed value - eor r10, r10, r12 ; q1 offset to convert to a signed value - - str r9, [sp] ; store qs0 temporarily - str r8, [sp, #4] ; store ps0 temporarily - str r10, [sp, #8] ; store qs1 temporarily - str r7, [sp, #12] ; store ps1 temporarily - - qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) - qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - - and r7, r7, r6 ; vp9_filter (r7) &= hev (r7 : filter) - - qadd8 r7, r7, r8 - ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 - - qadd8 r7, r7, r8 - ldr r10, c0x04040404 - - qadd8 r7, r7, r8 - ;mvn r11, #0 ; r11 == -1 - - and r7, r7, lr ; vp9_filter &= mask - - ;modify code for vp8 -- Filter1 = vp9_filter (r7) - qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3) - qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4) - - mov r9, #0 - shadd8 r8 , r8 , r9 ; Filter2 >>= 3 - shadd8 r7 , r7 , r9 ; vp9_filter >>= 3 - shadd8 r8 , r8 , r9 - shadd8 r7 , r7 , r9 - shadd8 lr , r8 , r9 ; lr: filter2 - shadd8 r7 , r7 , r9 ; r7: filter - - ;usub8 lr, r8, r10 ; s = (s==4)*-1 - ;sel lr, r11, r9 - ;usub8 r8, r10, r8 - ;sel r8, r11, r9 - ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s - - ;calculate output - ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter) - - ldr r8, [sp] ; load qs0 - ldr r9, [sp, #4] ; load ps0 - - ldr r10, c0x01010101 - - qsub8 r8, r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter) - qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2) - ;end of modification for vp8 - - eor r8, r8, r12 - eor r9, r9, r12 - - mov lr, #0 - - sadd8 r7, r7, r10 - shadd8 r7, r7, lr - - ldr r10, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - - bic r7, r7, r6 ; r7: vp9_filter - - qsub8 r10 , r10, r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter) - qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter) - eor r10, r10, r12 - eor r11, r11, r12 - - sub src, src, pstep, lsl #2 - - ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 - ;output is b0, b1, b2, b3 - ;b0: 03 02 01 00 - ;b1: 13 12 11 10 - ;b2: 23 22 21 20 - ;b3: 33 32 31 30 - ; p1 p0 q0 q1 - ; (a3 a2 a1 a0) - TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr - - strh r6, [src, #-2] ; store the result - mov r6, r6, lsr #16 - strh r6, [src], pstep - - strh r7, [src, #-2] - mov r7, r7, lsr #16 - strh r7, [src], pstep - - strh r12, [src, #-2] - mov r12, r12, lsr #16 - strh r12, [src], pstep - - strh lr, [src, #-2] - mov lr, lr, lsr #16 - strh lr, [src], pstep - -|vskip_filter| - sub src, src, #4 - subs count, count, #1 - - ldrne r6, [src], pstep ; load source data - ldrne r7, [src], pstep - ldrne r8, [src], pstep - ldrne lr, [src], pstep - - bne Vnext8 - - add sp, sp, #16 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_loop_filter_vertical_edge_armv6| - - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_mbloop_filter_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, 
src, #4 ; move src pointer down by 4 - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r12, [sp, #36] ; load thresh address - pld [src, #23] ; preload for next block - sub sp, sp, #16 ; create temp buffer - - ldr r6, [src], pstep ; load source data - ldrb r4, [r2] ; blimit - pld [src, #23] - ldr r7, [src], pstep - ldrb r2, [r3] ; limit - pld [src, #23] - ldr r8, [src], pstep - orr r4, r4, r4, lsl #8 - ldrb r3, [r12] ; thresh - orr r2, r2, r2, lsl #8 - pld [src, #23] - ldr lr, [src], pstep - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|MBVnext8| - ; vp9_filter_mask() function - ; calculate breakout conditions - ; transpose the source data for 4-in-parallel operation - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - uqsub8 r7, r9, r10 ; p3 - p2 - uqsub8 r8, r10, r9 ; p2 - p3 - uqsub8 r9, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - orr r7, r7, r8 ; abs (p3-p2) - orr r10, r9, r10 ; abs (p2-p1) - uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask - uqsub8 r10, r10, r2 ; compare to limit - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr lr, lr, r10 - - uqsub8 r6, r11, r12 ; p1 - p0 - uqsub8 r7, r12, r11 ; p0 - p1 - add src, src, #4 ; move src pointer up by 4 - orr r6, r6, r7 ; abs (p1-p0) - str r11, [sp, #12] ; save p1 - uqsub8 r10, r6, r2 ; compare to limit - uqsub8 r11, r6, r3 ; compare to thresh - orr lr, lr, r10 - - ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now - ; transpose the source data for 4-in-parallel operation - ldr r6, [src], pstep ; load source data - str r11, [sp] ; push r11 to stack - ldr r7, [src], pstep - str r12, [sp, #4] ; save current reg before load q0 - q3 data - ldr r8, [src], pstep - str lr, [sp, #8] - ldr lr, [src], pstep - - - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - ldr lr, [sp, #8] ; load back (f)limit accumulator - - uqsub8 r6, r12, r11 ; q3 - q2 - uqsub8 r7, r11, r12 ; q2 - q3 - uqsub8 r12, r11, r10 ; q2 - q1 - uqsub8 r11, r10, r11 ; q1 - q2 - orr r6, r6, r7 ; abs (q3-q2) - orr r7, r12, r11 ; abs (q2-q1) - uqsub8 r6, r6, r2 ; compare to limit - uqsub8 r7, r7, r2 ; compare to limit - ldr r11, [sp, #4] ; load back p0 - ldr r12, [sp, #12] ; load back p1 - orr lr, lr, r6 - orr lr, lr, r7 - - uqsub8 r6, r11, r9 ; p0 - q0 - uqsub8 r7, r9, r11 ; q0 - p0 - uqsub8 r8, r12, r10 ; p1 - q1 - uqsub8 r11, r10, r12 ; q1 - p1 - orr r6, r6, r7 ; abs (p0-q0) - ldr r7, c0x7F7F7F7F - orr r8, r8, r11 ; abs (p1-q1) - uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 - and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r11, r10, r9 ; q1 - q0 - uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r12, r9, r10 ; q0 - q1 - uqsub8 r6, r6, r4 ; compare to flimit - - orr r9, r11, r12 ; abs (q1-q0) - uqsub8 r8, r9, r2 ; compare to limit - uqsub8 r10, r9, r3 ; compare to thresh - orr lr, lr, r6 - orr lr, lr, r8 - - mvn r11, #0 ; r11 == -1 - mov r12, #0 - - usub8 lr, r12, lr - ldr r9, [sp] ; load the compared result - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq mbvskip_filter ; skip filtering - - - - ;vp8_hevmask() function - ;calculate high edge variance - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r9, r9, r10 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - usub8 r9, r12, r9 - sel r6, r12, r11 ; hev mask: r6 - - - ; vp8_mbfilter() function - ; p2, q2 are only needed at the end. Don't need to load them in now. - ; Transpose needs 8 regs(r6 - r12, and lr). 
Save r6 and lr first - ; load soure data to r6, r11, r12, lr - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - pkhbt r12, r7, r8, lsl #16 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - pkhbt r11, r9, r10, lsl #16 - - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - str r6, [sp] ; save r6 - str lr, [sp, #4] ; save lr - - pkhbt r6, r7, r8, lsl #16 - pkhbt lr, r9, r10, lsl #16 - - ;transpose r12, r11, r6, lr to p1, p0, q0, q1 - TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 - - ;load back hev_mask r6 and filter_mask lr - ldr r12, c0x80808080 - ldr r6, [sp] - ldr lr, [sp, #4] - - eor r7, r7, r12 ; ps1 - eor r8, r8, r12 ; ps0 - eor r9, r9, r12 ; qs0 - eor r10, r10, r12 ; qs1 - - qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - str r7, [sp, #12] ; store ps1 temporarily - qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) - str r10, [sp, #8] ; store qs1 temporarily - qadd8 r7, r7, r12 - str r9, [sp] ; store qs0 temporarily - qadd8 r7, r7, r12 - str r8, [sp, #4] ; store ps0 temporarily - qadd8 r7, r7, r12 ; vp9_filter: r7 - - ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 - ldr r9, c0x04040404 - ;mvn r11, #0 ; r11 == -1 - - and r7, r7, lr ; vp9_filter &= mask (lr is free) - - mov r12, r7 ; Filter2: r12 - and r12, r12, r6 ; Filter2 &= hev - - ;modify code for vp8 - ;save bottom 3 bits so that we round one side +4 and the other +3 - qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4) - qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3) - - mov r10, #0 - shadd8 r8 , r8 , r10 ; Filter1 >>= 3 - shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - shadd8 r8 , r8 , r10 - shadd8 r12 , r12 , r10 - shadd8 r8 , r8 , r10 ; r8: Filter1 - shadd8 r12 , r12 , r10 ; r12: Filter2 - - ldr r9, [sp] ; load qs0 - ldr r11, [sp, #4] ; load ps0 - - qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1) - qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2) - - ;save bottom 3 bits so that we round one side +4 and the other +3 - ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) - ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4) - ;mov r10, #0 - ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - ;usub8 lr, r8, r9 ; s = (s==4)*-1 - ;sel lr, r11, r10 - ;shadd8 r12 , r12 , r10 - ;usub8 r8, r9, r8 - ;sel r8, r11, r10 - ;ldr r9, [sp] ; load qs0 - ;ldr r11, [sp, #4] ; load ps0 - ;shadd8 r12 , r12 , r10 - ;and r8, r8, lr ; -1 for each element that equals 4 - ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2) - ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2) - ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u) - - ;end of modification for vp8 - - bic r12, r7, r6 ;vp9_filter &= ~hev ( r6 is free) - ;mov r12, r7 - - ;roughly 3/7th difference across boundary - mov lr, #0x1b ; 27 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r7, r10, lr, r7 - smultb r10, r10, lr - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - add r10, r10, #63 - ssat r7, #8, r7, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r7, r10, lsl #16 - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7) - - qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u) - qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u) - eor r8, r8, lr ; *oq0 = s^0x80 - eor r10, r10, lr ; *op0 = s^0x80 - - strb 
r10, [src, #-1] ; store op0 result - strb r8, [src], pstep ; store oq0 result - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - - ;roughly 2/7th difference across boundary - mov lr, #0x12 ; 18 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r9, r10, lr, r7 - - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r9, #8, r9, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r9, r10, lsl #16 - - ldr r9, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - ldr lr, c0x80808080 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - add src, src, #2 - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7) - - qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u) - qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u) - eor r8, r8, lr ; *oq1 = s^0x80 - eor r10, r10, lr ; *op1 = s^0x80 - - ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary - strb r10, [src, #-4] ; store op1 - strb r8, [src, #-1] ; store oq1 - ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - orr r11, r11, r6, lsl #8 - orr r9, r9, r7, lsl #8 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - orr r11, r11, r6, lsl #16 - orr r9, r9, r7, lsl #16 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - orr r11, r11, r6, lsl #24 - orr r9, r9, r7, lsl #24 - - ;roughly 1/7th difference across boundary - eor r9, r9, lr - eor r11, r11, lr - - mov lr, #0x9 ; 9 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r12, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r12, #8, r12, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r12, r10, lsl #16 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - ldr lr, c0x80808080 - - orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7) - - qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u) - qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u) - eor r8, r8, lr ; *op2 = s^0x80 - eor r10, r10, lr ; *oq2 = s^0x80 - - strb r8, [src, #-5] ; store *op2 - strb r10, [src], pstep ; store *oq2 - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - - ;adjust src pointer for next loop - sub src, src, #2 - -|mbvskip_filter| - sub src, src, #4 - subs count, count, #1 - - pld [src, #23] ; preload for next block - ldrne r6, [src], pstep ; load source data - pld [src, #23] - ldrne r7, [src], pstep - pld [src, #23] - ldrne r8, [src], pstep - pld [src, #23] - ldrne lr, [src], pstep - - bne MBVnext8 - - add sp, sp, #16 - - ldmia sp!, {r4 - 
r11, pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| - -; Constant Pool -c0x80808080 DCD 0x80808080 -c0x03030303 DCD 0x03030303 -c0x04040404 DCD 0x04040404 -c0x01010101 DCD 0x01010101 -c0x7F7F7F7F DCD 0x7F7F7F7F - - END diff --git a/vp9/common/arm/armv6/recon_v6.asm b/vp9/common/arm/armv6/recon_v6.asm deleted file mode 100644 index 99c7bcf2d..000000000 --- a/vp9/common/arm/armv6/recon_v6.asm +++ /dev/null @@ -1,281 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon_b_armv6| - EXPORT |vp8_recon2b_armv6| - EXPORT |vp8_recon4b_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code -prd RN r0 -dif RN r1 -dst RN r2 -stride RN r3 - -;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride) -; R0 char* pred_ptr -; R1 short * dif_ptr -; R2 char * dst_ptr -; R3 int stride - -; Description: -; Loop through the block adding the Pred and Diff together. Clamp and then -; store back into the Dst. - -; Restrictions : -; all buffers are expected to be 4 byte aligned coming in and -; going out. -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; -; -; -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_recon_b_armv6| PROC - stmdb sp!, {r4 - r9, lr} - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - add dif, dif, #32 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 -;; ldr r6, [dif, #8] ; 1 | 0 -;; ldr r7, [dif, #12] ; 3 | 2 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - add dif, dif, #32 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 -;; ldr r6, [dif, #16] ; 1 | 0 -;; ldr r7, [dif, #20] ; 3 | 2 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - add dif, dif, #32 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ;0, 1, 2, 3 - ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 -;; ldr r6, [dif, #24] ; 1 | 0 -;; ldr r7, [dif, #28] ; 3 | 2 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst], stride - - ldmia sp!, {r4 - r9, pc} - - ENDP ; |recon_b| - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; -; -; 
-;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; R0 char *pred_ptr -; R1 short *dif_ptr -; R2 char *dst_ptr -; R3 int stride -|vp8_recon4b_armv6| PROC - stmdb sp!, {r4 - r9, lr} - - mov lr, #4 - -recon4b_loop - ;0, 1, 2, 3 - ldr r4, [prd], #4 ; 3 | 2 | 1 | 0 - ldr r6, [dif, #0] ; 1 | 0 - ldr r7, [dif, #4] ; 3 | 2 - - pkhbt r8, r6, r7, lsl #16 ; 2 | 0 - pkhtb r9, r7, r6, asr #16 ; 3 | 1 - - uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 - uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 - - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst] - - ;4, 5, 6, 7 - ldr r4, [prd], #4 -;; ldr r6, [dif, #32] -;; ldr r7, [dif, #36] - ldr r6, [dif, #8] - ldr r7, [dif, #12] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #4] - - ;8, 9, 10, 11 - ldr r4, [prd], #4 -;; ldr r6, [dif, #64] -;; ldr r7, [dif, #68] - ldr r6, [dif, #16] - ldr r7, [dif, #20] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #8] - - ;12, 13, 14, 15 - ldr r4, [prd], #4 -;; ldr r6, [dif, #96] -;; ldr r7, [dif, #100] - ldr r6, [dif, #24] - ldr r7, [dif, #28] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #12] - - add dst, dst, stride -;; add dif, dif, #8 - add dif, dif, #32 - - subs lr, lr, #1 - bne recon4b_loop - - ldmia sp!, {r4 - r9, pc} - - ENDP ; |Recon4B| - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; -; -; -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -; R0 char *pred_ptr -; R1 short *dif_ptr -; R2 char *dst_ptr -; R3 int stride -|vp8_recon2b_armv6| PROC - stmdb sp!, {r4 - r9, lr} - - mov lr, #4 - -recon2b_loop - ;0, 1, 2, 3 - ldr r4, [prd], #4 - ldr r6, [dif, #0] - ldr r7, [dif, #4] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst] - - ;4, 5, 6, 7 - ldr r4, [prd], #4 -;; ldr r6, [dif, #32] -;; ldr r7, [dif, #36] - ldr r6, [dif, #8] - ldr r7, [dif, #12] - - pkhbt r8, r6, r7, lsl #16 - pkhtb r9, r7, r6, asr #16 - - uxtab16 r8, r8, r4 - uxtab16 r9, r9, r4, ror #8 - usat16 r8, #8, r8 - usat16 r9, #8, r9 - orr r8, r8, r9, lsl #8 - - str r8, [dst, #4] - - add dst, dst, stride -;; add dif, dif, #8 - add dif, dif, #16 - - subs lr, lr, #1 - bne recon2b_loop - - ldmia sp!, {r4 - r9, pc} - - ENDP ; |Recon2B| - - END diff --git a/vp9/common/arm/armv6/simpleloopfilter_v6.asm b/vp9/common/arm/armv6/simpleloopfilter_v6.asm deleted file mode 100644 index 8306912be..000000000 --- a/vp9/common/arm/armv6/simpleloopfilter_v6.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
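For reference, the recon routines above all perform the operation the header comment describes: add the prediction and the residual difference, clamp to 0..255 (the usat16 pairs), and store to the destination. A minimal C sketch of the 4x4 case, with illustrative names and the 16-byte pred pitch and 16-short diff pitch the assembly uses:

    #include <stdint.h>

    static uint8_t clamp255(int v) {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* dst = clamp(pred + diff) over a 4x4 block. */
    static void recon_b_sketch(const uint8_t *pred, const int16_t *diff,
                               uint8_t *dst, int stride) {
        for (int r = 0; r < 4; ++r) {
            for (int c = 0; c < 4; ++c)
                dst[c] = clamp255(pred[c] + diff[c]);
            pred += 16;   /* prediction rows are 16 bytes apart */
            diff += 16;   /* diff rows are 16 shorts apart */
            dst += stride;
        }
    }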
-; - - - EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6| - EXPORT |vp9_loop_filter_simple_vertical_edge_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - - MACRO - TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 - ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 - ; a0: 03 02 01 00 - ; a1: 13 12 11 10 - ; a2: 23 22 21 20 - ; a3: 33 32 31 30 - ; b3 b2 b1 b0 - - uxtb16 $b1, $a1 ; xx 12 xx 10 - uxtb16 $b0, $a0 ; xx 02 xx 00 - uxtb16 $b3, $a3 ; xx 32 xx 30 - uxtb16 $b2, $a2 ; xx 22 xx 20 - orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 - orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 - - uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 - uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 - uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 - uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 - orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 - orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 - - pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 - pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 - - pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 - pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 - MEND - - - -src RN r0 -pstep RN r1 - -;r0 unsigned char *src_ptr, -;r1 int src_pixel_step, -;r2 const char *blimit - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp9_loop_filter_simple_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - ldrb r12, [r2] ; blimit - ldr r3, [src, -pstep, lsl #1] ; p1 - ldr r4, [src, -pstep] ; p0 - ldr r5, [src] ; q0 - ldr r6, [src, pstep] ; q1 - orr r12, r12, r12, lsl #8 ; blimit - ldr r2, c0x80808080 - orr r12, r12, r12, lsl #16 ; blimit - mov r9, #4 ; double the count. we're doing 4 at a time - mov lr, #0 ; need 0 in a couple places - -|simple_hnext8| - ; vp8_simple_filter_mask() - - uqsub8 r7, r3, r6 ; p1 - q1 - uqsub8 r8, r6, r3 ; q1 - p1 - uqsub8 r10, r4, r5 ; p0 - q0 - uqsub8 r11, r5, r4 ; q0 - p0 - orr r8, r8, r7 ; abs(p1 - q1) - orr r10, r10, r11 ; abs(p0 - q0) - uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2 - uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1 - uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 - mvn r8, #0 - usub8 r10, r12, r10 ; compare to flimit. 
usub8 sets GE flags - sel r10, r8, lr ; filter mask: F or 0 - cmp r10, #0 - beq simple_hskip_filter ; skip filtering if all masks are 0x00 - - ;vp8_simple_filter() - - eor r3, r3, r2 ; p1 offset to convert to a signed value - eor r6, r6, r2 ; q1 offset to convert to a signed value - eor r4, r4, r2 ; p0 offset to convert to a signed value - eor r5, r5, r2 ; q0 offset to convert to a signed value - - qsub8 r3, r3, r6 ; vp9_filter = p1 - q1 - qsub8 r6, r5, r4 ; q0 - p0 - qadd8 r3, r3, r6 ; += q0 - p0 - ldr r7, c0x04040404 - qadd8 r3, r3, r6 ; += q0 - p0 - ldr r8, c0x03030303 - qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0)) - ;STALL - and r3, r3, r10 ; vp9_filter &= mask - - qadd8 r7 , r3 , r7 ; Filter1 = vp9_filter + 4 - qadd8 r8 , r3 , r8 ; Filter2 = vp9_filter + 3 - - shadd8 r7 , r7 , lr - shadd8 r8 , r8 , lr - shadd8 r7 , r7 , lr - shadd8 r8 , r8 , lr - shadd8 r7 , r7 , lr ; Filter1 >>= 3 - shadd8 r8 , r8 , lr ; Filter2 >>= 3 - - qsub8 r5 ,r5, r7 ; u = q0 - Filter1 - qadd8 r4, r4, r8 ; u = p0 + Filter2 - eor r5, r5, r2 ; *oq0 = u^0x80 - str r5, [src] ; store oq0 result - eor r4, r4, r2 ; *op0 = u^0x80 - str r4, [src, -pstep] ; store op0 result - -|simple_hskip_filter| - subs r9, r9, #1 - addne src, src, #4 ; next row - - ldrne r3, [src, -pstep, lsl #1] ; p1 - ldrne r4, [src, -pstep] ; p0 - ldrne r5, [src] ; q0 - ldrne r6, [src, pstep] ; q1 - - bne simple_hnext8 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_loop_filter_simple_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp9_loop_filter_simple_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - ldrb r12, [r2] ; r12: blimit - ldr r2, c0x80808080 - orr r12, r12, r12, lsl #8 - - ; load soure data to r7, r8, r9, r10 - ldrh r3, [src, #-2] - pld [src, #23] ; preload for next block - ldrh r4, [src], pstep - orr r12, r12, r12, lsl #16 - - ldrh r5, [src, #-2] - pld [src, #23] - ldrh r6, [src], pstep - - pkhbt r7, r3, r4, lsl #16 - - ldrh r3, [src, #-2] - pld [src, #23] - ldrh r4, [src], pstep - - pkhbt r8, r5, r6, lsl #16 - - ldrh r5, [src, #-2] - pld [src, #23] - ldrh r6, [src], pstep - mov r11, #4 ; double the count. 
we're doing 4 at a time - -|simple_vnext8| - ; vp8_simple_filter_mask() function - pkhbt r9, r3, r4, lsl #16 - pkhbt r10, r5, r6, lsl #16 - - ;transpose r7, r8, r9, r10 to r3, r4, r5, r6 - TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6 - - uqsub8 r7, r3, r6 ; p1 - q1 - uqsub8 r8, r6, r3 ; q1 - p1 - uqsub8 r9, r4, r5 ; p0 - q0 - uqsub8 r10, r5, r4 ; q0 - p0 - orr r7, r7, r8 ; abs(p1 - q1) - orr r9, r9, r10 ; abs(p0 - q0) - mov r8, #0 - uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2 - uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2 - uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 - mvn r10, #0 ; r10 == -1 - - usub8 r7, r12, r7 ; compare to flimit - sel lr, r10, r8 ; filter mask - - cmp lr, #0 - beq simple_vskip_filter ; skip filtering - - ;vp8_simple_filter() function - eor r3, r3, r2 ; p1 offset to convert to a signed value - eor r6, r6, r2 ; q1 offset to convert to a signed value - eor r4, r4, r2 ; p0 offset to convert to a signed value - eor r5, r5, r2 ; q0 offset to convert to a signed value - - qsub8 r3, r3, r6 ; vp9_filter = p1 - q1 - qsub8 r6, r5, r4 ; q0 - p0 - - qadd8 r3, r3, r6 ; vp9_filter += q0 - p0 - ldr r9, c0x03030303 ; r9 = 3 - - qadd8 r3, r3, r6 ; vp9_filter += q0 - p0 - ldr r7, c0x04040404 - - qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0)) - ;STALL - and r3, r3, lr ; vp9_filter &= mask - - qadd8 r9 , r3 , r9 ; Filter2 = vp9_filter + 3 - qadd8 r3 , r3 , r7 ; Filter1 = vp9_filter + 4 - - shadd8 r9 , r9 , r8 - shadd8 r3 , r3 , r8 - shadd8 r9 , r9 , r8 - shadd8 r3 , r3 , r8 - shadd8 r9 , r9 , r8 ; Filter2 >>= 3 - shadd8 r3 , r3 , r8 ; Filter1 >>= 3 - - ;calculate output - sub src, src, pstep, lsl #2 - - qadd8 r4, r4, r9 ; u = p0 + Filter2 - qsub8 r5, r5, r3 ; u = q0 - Filter1 - eor r4, r4, r2 ; *op0 = u^0x80 - eor r5, r5, r2 ; *oq0 = u^0x80 - - strb r4, [src, #-1] ; store the result - mov r4, r4, lsr #8 - strb r5, [src], pstep - mov r5, r5, lsr #8 - - strb r4, [src, #-1] - mov r4, r4, lsr #8 - strb r5, [src], pstep - mov r5, r5, lsr #8 - - strb r4, [src, #-1] - mov r4, r4, lsr #8 - strb r5, [src], pstep - mov r5, r5, lsr #8 - - strb r4, [src, #-1] - strb r5, [src], pstep - -|simple_vskip_filter| - subs r11, r11, #1 - - ; load soure data to r7, r8, r9, r10 - ldrneh r3, [src, #-2] - pld [src, #23] ; preload for next block - ldrneh r4, [src], pstep - - ldrneh r5, [src, #-2] - pld [src, #23] - ldrneh r6, [src], pstep - - pkhbt r7, r3, r4, lsl #16 - - ldrneh r3, [src, #-2] - pld [src, #23] - ldrneh r4, [src], pstep - - pkhbt r8, r5, r6, lsl #16 - - ldrneh r5, [src, #-2] - pld [src, #23] - ldrneh r6, [src], pstep - - bne simple_vnext8 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp9_loop_filter_simple_vertical_edge_armv6| - -; Constant Pool -c0x80808080 DCD 0x80808080 -c0x03030303 DCD 0x03030303 -c0x04040404 DCD 0x04040404 - - END diff --git a/vp9/common/arm/armv6/sixtappredict8x4_v6.asm b/vp9/common/arm/armv6/sixtappredict8x4_v6.asm deleted file mode 100644 index 5bf94e090..000000000 --- a/vp9/common/arm/armv6/sixtappredict8x4_v6.asm +++ /dev/null @@ -1,273 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
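Both simple loop filter edges above compute the same per-pixel math; the SIMD instructions only change how the pixels are gathered. A scalar C sketch of that math (assuming arithmetic right shifts on signed values, which is what the shadd8 sequences perform):

    #include <stdint.h>
    #include <stdlib.h>

    static int8_t clamp_s8(int v) {
        return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    static void simple_filter_sketch(uint8_t *p1, uint8_t *p0,
                                     uint8_t *q0, uint8_t *q1, int blimit) {
        /* filter mask: |p0-q0|*2 + |p1-q1|/2 <= blimit */
        if (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit)
            return;

        /* XOR with 0x80 converts the pixels to signed values */
        int ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
        int qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);

        int f = clamp_s8(ps1 - qs1);     /* vp9_filter = p1 - q1 ...   */
        f = clamp_s8(f + (qs0 - ps0));   /* ... + 3 * (q0 - p0),       */
        f = clamp_s8(f + (qs0 - ps0));   /* saturating at every step   */
        f = clamp_s8(f + (qs0 - ps0));   /* like the qadd8 chain above */

        int filter1 = clamp_s8(f + 4) >> 3;
        int filter2 = clamp_s8(f + 3) >> 3;

        *q0 = (uint8_t)(clamp_s8(qs0 - filter1) ^ 0x80);
        *p0 = (uint8_t)(clamp_s8(ps0 + filter2) ^ 0x80);
    }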
-; - - - EXPORT |vp8_sixtap_predict8x4_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code -;------------------------------------- -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack unsigned char *dst_ptr, -; stack int dst_pitch -;------------------------------------- -;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184. -;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack, -;and the result is stored in transpose. -|vp8_sixtap_predict8x4_armv6| PROC - stmdb sp!, {r4 - r11, lr} - str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - add lr, sp, #4 ;point to temporary buffer - beq skip_firstpass_filter - -;first-pass filter - adr r12, filter8_coeff - sub r0, r0, r1, lsl #1 - - add r3, r1, #10 ; preload next low - pld [r0, r3] - - add r2, r12, r2, lsl #4 ;calculate filter location - add r0, r0, #3 ;adjust src only for loading convinience - - ldr r3, [r2] ; load up packed filter coefficients - ldr r4, [r2, #4] - ldr r5, [r2, #8] - - mov r2, #0x90000 ; height=9 is top part of counter - - sub r1, r1, #8 - -|first_pass_hloop_v6| - ldrb r6, [r0, #-5] ; load source data - ldrb r7, [r0, #-4] - ldrb r8, [r0, #-3] - ldrb r9, [r0, #-2] - ldrb r10, [r0, #-1] - - orr r2, r2, #0x4 ; construct loop counter. width=8=4x2 - - pkhbt r6, r6, r7, lsl #16 ; r7 | r6 - pkhbt r7, r7, r8, lsl #16 ; r8 | r7 - - pkhbt r8, r8, r9, lsl #16 ; r9 | r8 - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - -|first_pass_wloop_v6| - smuad r11, r6, r3 ; vp9_filter[0], vp9_filter[1] - smuad r12, r7, r3 - - ldrb r6, [r0], #1 - - smlad r11, r8, r4, r11 ; vp9_filter[2], vp9_filter[3] - ldrb r7, [r0], #1 - smlad r12, r9, r4, r12 - - pkhbt r10, r10, r6, lsl #16 ; r10 | r9 - pkhbt r6, r6, r7, lsl #16 ; r11 | r10 - smlad r11, r10, r5, r11 ; vp9_filter[4], vp9_filter[5] - smlad r12, r6, r5, r12 - - sub r2, r2, #1 - - add r11, r11, #0x40 ; round_shift_and_clamp - tst r2, #0xff ; test loop counter - usat r11, #8, r11, asr #7 - add r12, r12, #0x40 - strh r11, [lr], #20 ; result is transposed and stored, which - usat r12, #8, r12, asr #7 - - strh r12, [lr], #20 - - movne r11, r6 - movne r12, r7 - - movne r6, r8 - movne r7, r9 - movne r8, r10 - movne r9, r11 - movne r10, r12 - - bne first_pass_wloop_v6 - - ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines - ;;IF ARCHITECTURE=6 - ;pld [src, ppl] - ;;pld [src, r9] - ;;ENDIF - - subs r2, r2, #0x10000 - - sub lr, lr, #158 - - add r0, r0, r1 ; move to next input line - - add r11, r1, #18 ; preload next low. 
adding back block width(=8), which is subtracted earlier - pld [r0, r11] - - bne first_pass_hloop_v6 - -;second pass filter -secondpass_filter - ldr r3, [sp], #4 ; load back yoffset - ldr r0, [sp, #216] ; load dst address from stack 180+36 - ldr r1, [sp, #220] ; load dst stride from stack 180+40 - - cmp r3, #0 - beq skip_secondpass_filter - - adr r12, filter8_coeff - add lr, r12, r3, lsl #4 ;calculate filter location - - mov r2, #0x00080000 - - ldr r3, [lr] ; load up packed filter coefficients - ldr r4, [lr, #4] - ldr r5, [lr, #8] - - pkhbt r12, r4, r3 ; pack the filter differently - pkhbt r11, r5, r4 - -second_pass_hloop_v6 - ldr r6, [sp] ; load the data - ldr r7, [sp, #4] - - orr r2, r2, #2 ; loop counter - -second_pass_wloop_v6 - smuad lr, r3, r6 ; apply filter - smulbt r10, r3, r6 - - ldr r8, [sp, #8] - - smlad lr, r4, r7, lr - smladx r10, r12, r7, r10 - - ldrh r9, [sp, #12] - - smlad lr, r5, r8, lr - smladx r10, r11, r8, r10 - - add sp, sp, #4 - smlatb r10, r5, r9, r10 - - sub r2, r2, #1 - - add lr, lr, #0x40 ; round_shift_and_clamp - tst r2, #0xff - usat lr, #8, lr, asr #7 - add r10, r10, #0x40 - strb lr, [r0], r1 ; the result is transposed back and stored - usat r10, #8, r10, asr #7 - - strb r10, [r0],r1 - - movne r6, r7 - movne r7, r8 - - bne second_pass_wloop_v6 - - subs r2, r2, #0x10000 - add sp, sp, #12 ; updata src for next loop (20-8) - sub r0, r0, r1, lsl #2 - add r0, r0, #1 - - bne second_pass_hloop_v6 - - add sp, sp, #20 - ldmia sp!, {r4 - r11, pc} - -;-------------------- -skip_firstpass_filter - sub r0, r0, r1, lsl #1 - sub r1, r1, #8 - mov r2, #9 - -skip_firstpass_hloop - ldrb r4, [r0], #1 ; load data - subs r2, r2, #1 - ldrb r5, [r0], #1 - strh r4, [lr], #20 ; store it to immediate buffer - ldrb r6, [r0], #1 ; load data - strh r5, [lr], #20 - ldrb r7, [r0], #1 - strh r6, [lr], #20 - ldrb r8, [r0], #1 - strh r7, [lr], #20 - ldrb r9, [r0], #1 - strh r8, [lr], #20 - ldrb r10, [r0], #1 - strh r9, [lr], #20 - ldrb r11, [r0], #1 - strh r10, [lr], #20 - add r0, r0, r1 ; move to next input line - strh r11, [lr], #20 - - sub lr, lr, #158 ; move over to next column - bne skip_firstpass_hloop - - b secondpass_filter - -;-------------------- -skip_secondpass_filter - mov r2, #8 - add sp, sp, #4 ;start from src[0] instead of src[-2] - -skip_secondpass_hloop - ldr r6, [sp], #4 - subs r2, r2, #1 - ldr r8, [sp], #4 - - mov r7, r6, lsr #16 ; unpack - strb r6, [r0], r1 - mov r9, r8, lsr #16 - strb r7, [r0], r1 - add sp, sp, #12 ; 20-8 - strb r8, [r0], r1 - strb r9, [r0], r1 - - sub r0, r0, r1, lsl #2 - add r0, r0, #1 - - bne skip_secondpass_hloop - - add sp, sp, #16 ; 180 - (160 +4) - - ldmia sp!, {r4 - r11, pc} - - ENDP - -;----------------- -;One word each is reserved. Label filter_coeff can be used to access the data. -;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... 
-filter8_coeff - DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 - DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 - DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 - DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 - DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 - DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 - DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 - DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 - - ;DCD 0, 0, 128, 0, 0, 0 - ;DCD 0, -6, 123, 12, -1, 0 - ;DCD 2, -11, 108, 36, -8, 1 - ;DCD 0, -9, 93, 50, -6, 0 - ;DCD 3, -16, 77, 77, -16, 3 - ;DCD 0, -6, 50, 93, -9, 0 - ;DCD 1, -8, 36, 108, -11, 2 - ;DCD 0, -1, 12, 123, -6, 0 - - END diff --git a/vp9/common/arm/armv6/vp9_bilinearfilter_v6.asm b/vp9/common/arm/armv6/vp9_bilinearfilter_v6.asm new file mode 100644 index 000000000..36e391e2b --- /dev/null +++ b/vp9/common/arm/armv6/vp9_bilinearfilter_v6.asm @@ -0,0 +1,237 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_filter_block2d_bil_first_pass_armv6| + EXPORT |vp9_filter_block2d_bil_second_pass_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + +;------------------------------------- +; r0 unsigned char *src_ptr, +; r1 unsigned short *dst_ptr, +; r2 unsigned int src_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vp9_filter +;------------------------------------- +; The output is transposed and stored in the output array to make second pass filtering easier.
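In C terms, the first pass below is a 2-tap horizontal filter with 7-bit rounding whose 16-bit results are written column-major (transposed), so the second pass can stream them linearly; the column pitch is the r3 value computed below. A sketch with illustrative names:

    #include <stdint.h>

    static void bil_first_pass_sketch(const uint8_t *src, uint16_t *dst,
                                      unsigned src_pitch, unsigned height,
                                      unsigned width, const int16_t *filter,
                                      unsigned dst_col_pitch /* in shorts */) {
        for (unsigned r = 0; r < height; ++r) {
            for (unsigned c = 0; c < width; ++c) {
                int v = src[c] * filter[0] + src[c + 1] * filter[1];
                dst[c * dst_col_pitch + r] = (uint16_t)((v + 64) >> 7); /* round */
            }
            src += src_pitch;
        }
    }

The second pass further below is the mirror image: the same two taps applied down each transposed column, with the byte results stored through dst_pitch to undo the transpose.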
+|vp9_filter_block2d_bil_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp9_filter address + ldr r4, [sp, #36] ; width + + mov r12, r3 ; outer-loop counter + + add r7, r2, r4 ; preload next row + pld [r0, r7] + + sub r2, r2, r4 ; src increment for height loop + + ldr r5, [r11] ; load up filter coefficients + + mov r3, r3, lsl #1 ; height*2 + add r3, r3, #2 ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1) + + mov r11, r1 ; save dst_ptr for each row + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_1st_filter + +|bil_height_loop_1st_v6| + ldrb r6, [r0] ; load source data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + mov lr, r4, lsr #2 ; 4-in-parallel loop counter + +|bil_width_loop_1st_v6| + ldrb r9, [r0, #3] + ldrb r10, [r0, #4] + + pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] + pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] + + smuad r6, r6, r5 ; apply the filter + pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] + smuad r7, r7, r5 + pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] + + smuad r8, r8, r5 + smuad r9, r9, r5 + + add r0, r0, #4 + subs lr, lr, #1 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #16, r6, asr #7 + usat r7, #16, r7, asr #7 + + strh r6, [r1], r3 ; result is transposed and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strh r7, [r1], r3 + add r9, r9, #0x40 + usat r8, #16, r8, asr #7 + usat r9, #16, r9, asr #7 + + strh r8, [r1], r3 ; result is transposed and stored + + ldrneb r6, [r0] ; load source data + strh r9, [r1], r3 + + ldrneb r7, [r0, #1] + ldrneb r8, [r0, #2] + + bne bil_width_loop_1st_v6 + + add r0, r0, r2 ; move to next input row + subs r12, r12, #1 + + add r9, r2, r4, lsl #1 ; adding back block width + pld [r0, r9] ; preload next row + + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_1st_v6 + + ldmia sp!, {r4 - r11, pc} + +|bil_null_1st_filter| +|bil_height_loop_null_1st| + mov lr, r4, lsr #2 ; loop counter + +|bil_width_loop_null_1st| + ldrb r6, [r0] ; load data + ldrb r7, [r0, #1] + ldrb r8, [r0, #2] + ldrb r9, [r0, #3] + + strh r6, [r1], r3 ; store it to immediate buffer + add r0, r0, #4 + strh r7, [r1], r3 + subs lr, lr, #1 + strh r8, [r1], r3 + strh r9, [r1], r3 + + bne bil_width_loop_null_1st + + subs r12, r12, #1 + add r0, r0, r2 ; move to next input line + add r11, r11, #2 ; move over to next column + mov r1, r11 + + bne bil_height_loop_null_1st + + ldmia sp!, {r4 - r11, pc} + + ENDP ; |vp9_filter_block2d_bil_first_pass_armv6| + + +;--------------------------------- +; r0 unsigned short *src_ptr, +; r1 unsigned char *dst_ptr, +; r2 int dst_pitch, +; r3 unsigned int height, +; stack unsigned int width, +; stack const short *vp9_filter +;--------------------------------- +|vp9_filter_block2d_bil_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp9_filter address + ldr r4, [sp, #36] ; width + + ldr r5, [r11] ; load up filter coefficients + mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix + mov r11, r1 + + cmp r5, #128 ; if filter coef = 128, then skip the filter + beq bil_null_2nd_filter + +|bil_height_loop_2nd| + ldr r6, [r0] ; load the data + ldr r8, [r0, #4] + ldrh r10, [r0, #8] + mov lr, r3, lsr #2 ; loop counter + +|bil_width_loop_2nd| + pkhtb r7, r6, r8 ; src[1] | src[2] + pkhtb r9, r8, r10 ; src[3] | src[4] + + smuad r6, r6, r5 ; apply filter + smuad r8, r8, r5 ; apply filter + + subs lr, lr, #1 + + smuadx r7, r7, r5 ; apply filter + smuadx r9,
r9, r5 ; apply filter + + add r0, r0, #8 + + add r6, r6, #0x40 ; round_shift_and_clamp + add r7, r7, #0x40 + usat r6, #8, r6, asr #7 + usat r7, #8, r7, asr #7 + strb r6, [r1], r2 ; the result is transposed back and stored + + add r8, r8, #0x40 ; round_shift_and_clamp + strb r7, [r1], r2 + add r9, r9, #0x40 + usat r8, #8, r8, asr #7 + usat r9, #8, r9, asr #7 + strb r8, [r1], r2 ; the result is transposed back and stored + + ldrne r6, [r0] ; load data + strb r9, [r1], r2 + ldrne r8, [r0, #4] + ldrneh r10, [r0, #8] + + bne bil_width_loop_2nd + + subs r12, r12, #1 + add r0, r0, #4 ; update src for next row + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_2nd + ldmia sp!, {r4 - r11, pc} + +|bil_null_2nd_filter| +|bil_height_loop_null_2nd| + mov lr, r3, lsr #2 + +|bil_width_loop_null_2nd| + ldr r6, [r0], #4 ; load data + subs lr, lr, #1 + ldr r8, [r0], #4 + + strb r6, [r1], r2 ; store data + mov r7, r6, lsr #16 + strb r7, [r1], r2 + mov r9, r8, lsr #16 + strb r8, [r1], r2 + strb r9, [r1], r2 + + bne bil_width_loop_null_2nd + + subs r12, r12, #1 + add r0, r0, #4 + add r11, r11, #1 + mov r1, r11 + + bne bil_height_loop_null_2nd + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp9_filter_block2d_bil_second_pass_armv6| + + END diff --git a/vp9/common/arm/armv6/vp9_copymem16x16_v6.asm b/vp9/common/arm/armv6/vp9_copymem16x16_v6.asm new file mode 100644 index 000000000..44c3c492f --- /dev/null +++ b/vp9/common/arm/armv6/vp9_copymem16x16_v6.asm @@ -0,0 +1,186 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree.
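The three copy_mem files that follow (16x16, 8x4, 8x8) are functionally nothing more than strided block copies; all of the assembly effort goes into picking the widest transfer the source alignment allows. The C equivalent is one memcpy per row:

    #include <stdint.h>
    #include <string.h>

    static void copy_mem_sketch(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h) {
        for (int r = 0; r < h; ++r) {
            memcpy(dst, src, (size_t)w);  /* libc picks wide aligned accesses */
            src += src_stride;
            dst += dst_stride;
        }
    }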
+; + + + EXPORT |vp9_copy_mem16x16_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp9_copy_mem16x16_v6| PROC + stmdb sp!, {r4 - r7} + ;push {r4-r7} + + ;preload + pld [r0, #31] ; preload for next 16x16 block + + ands r4, r0, #15 + beq copy_mem16x16_fast + + ands r4, r0, #7 + beq copy_mem16x16_8 + + ands r4, r0, #3 + beq copy_mem16x16_4 + + ;copy one byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + ldrb r6, [r0, #2] + ldrb r7, [r0, #3] + + mov r12, #16 + +copy_mem16x16_1_loop + strb r4, [r2] + strb r5, [r2, #1] + strb r6, [r2, #2] + strb r7, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + ldrb r6, [r0, #6] + ldrb r7, [r0, #7] + + subs r12, r12, #1 + + strb r4, [r2, #4] + strb r5, [r2, #5] + strb r6, [r2, #6] + strb r7, [r2, #7] + + ldrb r4, [r0, #8] + ldrb r5, [r0, #9] + ldrb r6, [r0, #10] + ldrb r7, [r0, #11] + + strb r4, [r2, #8] + strb r5, [r2, #9] + strb r6, [r2, #10] + strb r7, [r2, #11] + + ldrb r4, [r0, #12] + ldrb r5, [r0, #13] + ldrb r6, [r0, #14] + ldrb r7, [r0, #15] + + add r0, r0, r1 + + strb r4, [r2, #12] + strb r5, [r2, #13] + strb r6, [r2, #14] + strb r7, [r2, #15] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + ldrneb r6, [r0, #2] + ldrneb r7, [r0, #3] + + pld [r0, #31] ; preload for next 16x16 block + + bne copy_mem16x16_1_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 4 bytes each time +copy_mem16x16_4 + ldr r4, [r0] + ldr r5, [r0, #4] + ldr r6, [r0, #8] + ldr r7, [r0, #12] + + mov r12, #16 + +copy_mem16x16_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + str r6, [r2, #8] + str r7, [r2, #12] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + ldrne r6, [r0, #8] + ldrne r7, [r0, #12] + + pld [r0, #31] ; preload for next 16x16 block + + bne copy_mem16x16_4_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 8 bytes each time +copy_mem16x16_8 + sub r1, r1, #16 + sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_8_loop + ldmia r0!, {r4-r5} + ;ldm r0, {r4-r5} + ldmia r0!, {r6-r7} + + add r0, r0, r1 + + stmia r2!, {r4-r5} + subs r12, r12, #1 + ;stm r2, {r4-r5} + stmia r2!, {r6-r7} + + add r2, r2, r3 + + pld [r0, #31] ; preload for next 16x16 block + bne copy_mem16x16_8_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + +;copy 16 bytes each time +copy_mem16x16_fast + ;sub r1, r1, #16 + ;sub r3, r3, #16 + + mov r12, #16 + +copy_mem16x16_fast_loop + ldmia r0, {r4-r7} + ;ldm r0, {r4-r7} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r7} + ;stm r2, {r4-r7} + add r2, r2, r3 + + pld [r0, #31] ; preload for next 16x16 block + bne copy_mem16x16_fast_loop + + ldmia sp!, {r4 - r7} + ;pop {r4-r7} + mov pc, lr + + ENDP ; |vp9_copy_mem16x16_v6| + + END diff --git a/vp9/common/arm/armv6/vp9_copymem8x4_v6.asm b/vp9/common/arm/armv6/vp9_copymem8x4_v6.asm new file mode 100644 index 000000000..45b904367 --- /dev/null +++ b/vp9/common/arm/armv6/vp9_copymem8x4_v6.asm @@ -0,0 +1,128 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. 
All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_copy_mem8x4_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void vp9_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp9_copy_mem8x4_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x4_fast + + ands r4, r0, #3 + beq copy_mem8x4_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #4 + +copy_mem8x4_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x4_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x4_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #4 + +copy_mem8x4_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x4_4_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x4_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #4 + +copy_mem8x4_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x4_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp9_copy_mem8x4_v6| + + END diff --git a/vp9/common/arm/armv6/vp9_copymem8x8_v6.asm b/vp9/common/arm/armv6/vp9_copymem8x8_v6.asm new file mode 100644 index 000000000..0dd971bfe --- /dev/null +++ b/vp9/common/arm/armv6/vp9_copymem8x8_v6.asm @@ -0,0 +1,128 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
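The ANDS ladder at the top of each copy routine is an alignment dispatch. Rendered as C, the idea is roughly the following (a sketch; the 16x16 variant additionally tests for 16-byte alignment ahead of these cases):

    #include <stdint.h>

    typedef enum { COPY_BYTES, COPY_WORDS, COPY_MULTIPLE } copy_kind;

    static copy_kind pick_copy_kind(const uint8_t *src) {
        if (((uintptr_t)src & 7) == 0) return COPY_MULTIPLE; /* ldm/stm pairs */
        if (((uintptr_t)src & 3) == 0) return COPY_WORDS;    /* ldr/str */
        return COPY_BYTES;                                   /* ldrb/strb */
    }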
+; + + + EXPORT |vp9_copy_mem8x8_v6| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp9_copy_mem8x8_v6| PROC + ;push {r4-r5} + stmdb sp!, {r4-r5} + + ;preload + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + ands r4, r0, #7 + beq copy_mem8x8_fast + + ands r4, r0, #3 + beq copy_mem8x8_4 + + ;copy 1 byte each time + ldrb r4, [r0] + ldrb r5, [r0, #1] + + mov r12, #8 + +copy_mem8x8_1_loop + strb r4, [r2] + strb r5, [r2, #1] + + ldrb r4, [r0, #2] + ldrb r5, [r0, #3] + + subs r12, r12, #1 + + strb r4, [r2, #2] + strb r5, [r2, #3] + + ldrb r4, [r0, #4] + ldrb r5, [r0, #5] + + strb r4, [r2, #4] + strb r5, [r2, #5] + + ldrb r4, [r0, #6] + ldrb r5, [r0, #7] + + add r0, r0, r1 + + strb r4, [r2, #6] + strb r5, [r2, #7] + + add r2, r2, r3 + + ldrneb r4, [r0] + ldrneb r5, [r0, #1] + + bne copy_mem8x8_1_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 4 bytes each time +copy_mem8x8_4 + ldr r4, [r0] + ldr r5, [r0, #4] + + mov r12, #8 + +copy_mem8x8_4_loop + subs r12, r12, #1 + add r0, r0, r1 + + str r4, [r2] + str r5, [r2, #4] + + add r2, r2, r3 + + ldrne r4, [r0] + ldrne r5, [r0, #4] + + bne copy_mem8x8_4_loop + + ldmia sp!, {r4 - r5} + ;pop {r4-r5} + mov pc, lr + +;copy 8 bytes each time +copy_mem8x8_fast + ;sub r1, r1, #8 + ;sub r3, r3, #8 + + mov r12, #8 + +copy_mem8x8_fast_loop + ldmia r0, {r4-r5} + ;ldm r0, {r4-r5} + add r0, r0, r1 + + subs r12, r12, #1 + stmia r2, {r4-r5} + ;stm r2, {r4-r5} + add r2, r2, r3 + + bne copy_mem8x8_fast_loop + + ldmia sp!, {r4-r5} + ;pop {r4-r5} + mov pc, lr + + ENDP ; |vp9_copy_mem8x8_v6| + + END diff --git a/vp9/common/arm/armv6/vp9_dc_only_idct_add_v6.asm b/vp9/common/arm/armv6/vp9_dc_only_idct_add_v6.asm new file mode 100644 index 000000000..e0660e9fd --- /dev/null +++ b/vp9/common/arm/armv6/vp9_dc_only_idct_add_v6.asm @@ -0,0 +1,67 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. 
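When only the DC coefficient is nonzero, the 4x4 inverse transform collapses to adding one rounded constant to every predicted pixel, which is what the routine below vectorizes with uxtab16/usat16. A scalar C sketch with illustrative names:

    #include <stdint.h>

    static void dc_only_idct_add_sketch(int16_t input_dc, const uint8_t *pred,
                                        uint8_t *dst, int pitch, int stride) {
        int a1 = (input_dc + 4) >> 3;     /* rounded DC term */
        for (int r = 0; r < 4; ++r) {
            for (int c = 0; c < 4; ++c) {
                int v = pred[c] + a1;
                dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            pred += pitch;
            dst += stride;
        }
    }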
+; + + EXPORT |vp8_dc_only_idct_add_v6| + + AREA |.text|, CODE, READONLY + +;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr, +; unsigned char *dst_ptr, int pitch, int stride) +; r0 input_dc +; r1 pred_ptr +; r2 dst_ptr +; r3 pitch +; sp stride + +|vp8_dc_only_idct_add_v6| PROC + stmdb sp!, {r4 - r7, lr} + + add r0, r0, #4 ; input_dc += 4 + ldr r12, c0x0000FFFF + ldr r4, [r1], r3 + ldr r6, [r1], r3 + and r0, r12, r0, asr #3 ; input_dc >> 3 + mask + ldr lr, [sp, #20] + orr r0, r0, r0, lsl #16 ; a1 | a1 + + uxtab16 r5, r0, r4 ; a1+2 | a1+0 + uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1 + uxtab16 r7, r0, r6 + uxtab16 r6, r0, r6, ror #8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 + orr r7, r7, r6, lsl #8 + ldr r4, [r1], r3 + ldr r6, [r1] + str r5, [r2], lr + str r7, [r2], lr + + uxtab16 r5, r0, r4 + uxtab16 r4, r0, r4, ror #8 + uxtab16 r7, r0, r6 + uxtab16 r6, r0, r6, ror #8 + usat16 r5, #8, r5 + usat16 r4, #8, r4 + usat16 r7, #8, r7 + usat16 r6, #8, r6 + orr r5, r5, r4, lsl #8 + orr r7, r7, r6, lsl #8 + str r5, [r2], lr + str r7, [r2] + + ldmia sp!, {r4 - r7, pc} + + ENDP ; |vp8_dc_only_idct_add_v6| + +; Constant Pool +c0x0000FFFF DCD 0x0000FFFF + END diff --git a/vp9/common/arm/armv6/vp9_filter_v6.asm b/vp9/common/arm/armv6/vp9_filter_v6.asm new file mode 100644 index 000000000..16b321e37 --- /dev/null +++ b/vp9/common/arm/armv6/vp9_filter_v6.asm @@ -0,0 +1,624 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_filter_block2d_first_pass_armv6| + EXPORT |vp9_filter_block2d_first_pass_16x16_armv6| + EXPORT |vp9_filter_block2d_first_pass_8x8_armv6| + EXPORT |vp9_filter_block2d_second_pass_armv6| + EXPORT |vp9_filter4_block2d_second_pass_armv6| + EXPORT |vp9_filter_block2d_first_pass_only_armv6| + EXPORT |vp9_filter_block2d_second_pass_only_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +;------------------------------------- +; r0 unsigned char *src_ptr +; r1 short *output_ptr +; r2 unsigned int src_pixels_per_line +; r3 unsigned int output_width +; stack unsigned int output_height +; stack const short *vp9_filter +;------------------------------------- +; vp9_filter the input and put it in the output array. Apply the 6 tap FIR filter with +; the output being a 2 byte value and the input being a 1 byte value.
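The header comment above describes the generic first pass: a 6-tap FIR across each row, 1-byte input, 2-byte output, stored transposed for the second pass. Its scalar equivalent reads like this (a sketch; the tap span and 7-bit rounding follow the code below):

    #include <stdint.h>

    static void sixtap_first_pass_sketch(const uint8_t *src, int16_t *dst,
                                         unsigned src_pitch,
                                         unsigned dst_col_pitch /* shorts */,
                                         unsigned height, unsigned width,
                                         const int16_t filter[6]) {
        for (unsigned r = 0; r < height; ++r) {
            for (unsigned c = 0; c < width; ++c) {
                int v = 0;
                for (int t = 0; t < 6; ++t)
                    v += src[c + t - 2] * filter[t]; /* taps over src[c-2..c+3] */
                v = (v + 64) >> 7;                   /* round_shift_and_clamp */
                dst[c * dst_col_pitch + r] =
                    (int16_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
            }
            src += src_pitch;
        }
    }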
+|vp9_filter_block2d_first_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp9_filter address + ldr r7, [sp, #36] ; output height + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. + ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_6 + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +; -------------------------- +; 16x16 version +; ----------------------------- +|vp9_filter_block2d_first_pass_16x16_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp9_filter address + ldr r7, [sp, #36] ; output height + + add r4, r2, #18 ; preload next low + pld [r0, r4] + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_16_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_16_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] 
; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. + ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_16_6 + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + + add r11, r2, #34 ; adding back block width(=16) + pld [r0, r11] ; preload next low + + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_16_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +; -------------------------- +; 8x8 version +; ----------------------------- +|vp9_filter_block2d_first_pass_8x8_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; vp9_filter address + ldr r7, [sp, #36] ; output height + + add r4, r2, #10 ; preload next low + pld [r0, r4] + + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + mov r3, r3, lsl #1 ; multiply width by 2 because using shorts + add r12, r3, #16 ; square off the output + sub sp, sp, #4 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + str r1, [sp] ; push destination to stack + mov r7, r7, lsl #16 ; height is top part of counter + +; six tap filter +|height_loop_1st_8_6| + ldrb r8, [r0, #-2] ; load source data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + orr r7, r7, r3, lsr #2 ; construct loop counter + +|width_loop_1st_8_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + + smuad lr, lr, r4 ; apply the filter + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + smuad r8, r8, r4 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + sub r7, r7, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r11, r10, r6, r8 + + ands r10, r7, #0xff ; test loop counter + + add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 + add r11, r11, #0x40 + ldrneb r9, [r0, #-1] + usat r11, #8, r11, asr #7 + + strh lr, [r1], r12 ; result is transposed and stored, which + ; will make second pass filtering easier. 
+ ldrneb r10, [r0], #2 + strh r11, [r1], r12 + + bne width_loop_1st_8_6 + + ldr r1, [sp] ; load and update dst address + subs r7, r7, #0x10000 + add r0, r0, r2 ; move to next input line + + add r11, r2, #18 ; adding back block width(=8) + pld [r0, r11] ; preload next low + + add r1, r1, #2 ; move over to next column + str r1, [sp] + + bne height_loop_1st_8_6 + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;--------------------------------- +; r0 short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int output_pitch, +; r3 unsigned int cnt, +; stack const short *vp9_filter +;--------------------------------- +|vp9_filter_block2d_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #36] ; vp9_filter address + sub sp, sp, #4 + mov r7, r3, lsl #16 ; height is top part of counter + str r1, [sp] ; push destination to stack + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + pkhbt r12, r5, r4 ; pack the filter differently + pkhbt r11, r6, r5 + + sub r0, r0, #4 ; offset input buffer + +|height_loop_2nd| + ldr r8, [r0] ; load the data + ldr r9, [r0, #4] + orr r7, r7, r3, lsr #1 ; loop counter + +|width_loop_2nd| + smuad lr, r4, r8 ; apply filter + sub r7, r7, #1 + smulbt r8, r4, r8 + + ldr r10, [r0, #8] + + smlad lr, r5, r9, lr + smladx r8, r12, r9, r8 + + ldrh r9, [r0, #12] + + smlad lr, r6, r10, lr + smladx r8, r11, r10, r8 + + add r0, r0, #4 + smlatb r10, r6, r9, r8 + + add lr, lr, #0x40 ; round_shift_and_clamp + ands r8, r7, #0xff + usat lr, #8, lr, asr #7 + add r10, r10, #0x40 + strb lr, [r1], r2 ; the result is transposed back and stored + usat r10, #8, r10, asr #7 + + ldrne r8, [r0] ; load data for next loop + ldrne r9, [r0, #4] + strb r10, [r1], r2 + + bne width_loop_2nd + + ldr r1, [sp] ; update dst for next loop + subs r7, r7, #0x10000 + add r0, r0, #16 ; updata src for next loop + add r1, r1, #1 + str r1, [sp] + + bne height_loop_2nd + + add sp, sp, #4 + ldmia sp!, {r4 - r11, pc} + + ENDP + +;--------------------------------- +; r0 short *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int output_pitch, +; r3 unsigned int cnt, +; stack const short *vp9_filter +;--------------------------------- +|vp9_filter4_block2d_second_pass_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #36] ; vp9_filter address + mov r7, r3, lsl #16 ; height is top part of counter + + ldr r4, [r11] ; load up packed filter coefficients + add lr, r1, r3 ; save final destination pointer + ldr r5, [r11, #4] + ldr r6, [r11, #8] + + pkhbt r12, r5, r4 ; pack the filter differently + pkhbt r11, r6, r5 + mov r4, #0x40 ; rounding factor (for smlad{x}) + +|height_loop_2nd_4| + ldrd r8, [r0, #-4] ; load the data + orr r7, r7, r3, lsr #1 ; loop counter + +|width_loop_2nd_4| + ldr r10, [r0, #4]! 
+ smladx r6, r9, r12, r4 ; apply filter + pkhbt r8, r9, r8 + smlad r5, r8, r12, r4 + pkhbt r8, r10, r9 + smladx r6, r10, r11, r6 + sub r7, r7, #1 + smlad r5, r8, r11, r5 + + mov r8, r9 ; shift the data for the next loop + mov r9, r10 + + usat r6, #8, r6, asr #7 ; shift and clamp + usat r5, #8, r5, asr #7 + + strb r5, [r1], r2 ; the result is transposed back and stored + tst r7, #0xff + strb r6, [r1], r2 + + bne width_loop_2nd_4 + + subs r7, r7, #0x10000 + add r0, r0, #16 ; update src for next loop + sub r1, lr, r7, lsr #16 ; update dst for next loop + + bne height_loop_2nd_4 + + ldmia sp!, {r4 - r11, pc} + + ENDP + +;------------------------------------ +; r0 unsigned char *src_ptr +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp9_filter +;------------------------------------ +|vp9_filter_block2d_first_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + add r7, r2, r3 ; preload next row + add r7, r7, #2 + pld [r0, r7] + + ldr r4, [sp, #36] ; output pitch + ldr r11, [sp, #40] ; HFilter address + sub sp, sp, #8 + + mov r7, r3 + sub r2, r2, r3 ; inside loop increments input array, + ; so the height loop only needs to add + ; r2 - width to the input pointer + + sub r4, r4, r3 + str r4, [sp] ; save modified output pitch + str r2, [sp, #4] + + mov r2, #0x40 + + ldr r4, [r11] ; load up packed filter coefficients + ldr r5, [r11, #4] + ldr r6, [r11, #8] + +; six tap filter +|height_loop_1st_only_6| + ldrb r8, [r0, #-2] ; load data + ldrb r9, [r0, #-1] + ldrb r10, [r0], #2 + + mov r12, r3, lsr #1 ; loop counter + +|width_loop_1st_only_6| + ldrb r11, [r0, #-1] + + pkhbt lr, r8, r9, lsl #16 ; r9 | r8 + pkhbt r8, r9, r10, lsl #16 ; r10 | r9 + + ldrb r9, [r0] + +;; smuad lr, lr, r4 + smlad lr, lr, r4, r2 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 +;; smuad r8, r8, r4 + smlad r8, r8, r4, r2 + pkhbt r11, r11, r9, lsl #16 ; r9 | r11 + + smlad lr, r10, r5, lr + ldrb r10, [r0, #1] + smlad r8, r11, r5, r8 + ldrb r11, [r0, #2] + + subs r12, r12, #1 + + pkhbt r9, r9, r10, lsl #16 ; r10 | r9 + pkhbt r10, r10, r11, lsl #16 ; r11 | r10 + + smlad lr, r9, r6, lr + smlad r10, r10, r6, r8 + +;; add lr, lr, #0x40 ; round_shift_and_clamp + ldrneb r8, [r0, #-2] ; load data for next loop + usat lr, #8, lr, asr #7 +;; add r10, r10, #0x40 + strb lr, [r1], #1 ; store the result + usat r10, #8, r10, asr #7 + + ldrneb r9, [r0, #-1] + strb r10, [r1], #1 + ldrneb r10, [r0], #2 + + bne width_loop_1st_only_6 + + ldr lr, [sp] ; load back output pitch + ldr r12, [sp, #4] ; load back src increment + subs r7, r7, #1 + add r0, r0, r12 ; update src for next loop + + add r11, r12, r3 ; preload next row + add r11, r11, #2 + pld [r0, r11] + + add r1, r1, lr ; update dst for next loop + + bne height_loop_1st_only_6 + + add sp, sp, #8 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp9_filter_block2d_first_pass_only_armv6| + + +;------------------------------------ +; r0 unsigned char *src_ptr, +; r1 unsigned char *output_ptr, +; r2 unsigned int src_pixels_per_line +; r3 unsigned int cnt, +; stack unsigned int output_pitch, +; stack const short *vp9_filter +;------------------------------------ +|vp9_filter_block2d_second_pass_only_armv6| PROC + stmdb sp!, {r4 - r11, lr} + + ldr r11, [sp, #40] ; VFilter address + ldr r12, [sp, #36] ; output pitch + + mov r7, r3, lsl #16 ; height is top part of counter + sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after + + sub sp, sp, #8 + + ldr r4, [r11] ; load up packed filter
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    str     r0, [sp]                    ; save r0 to stack
+    str     r1, [sp, #4]                ; save dst to stack
+
+; six tap filter
+|width_loop_2nd_only_6|
+    ldrb    r8, [r0], r2                ; load data
+    orr     r7, r7, r3                  ; loop counter
+    ldrb    r9, [r0], r2
+    ldrb    r10, [r0], r2
+
+|height_loop_2nd_only_6|
+    ; filter the first column in this inner loop, then move to the next column
+    ldrb    r11, [r0], r2
+
+    pkhbt   lr, r8, r9, lsl #16         ; r9 | r8
+    pkhbt   r8, r9, r10, lsl #16        ; r10 | r9
+
+    ldrb    r9, [r0], r2
+
+    smuad   lr, lr, r4
+    pkhbt   r10, r10, r11, lsl #16      ; r11 | r10
+    smuad   r8, r8, r4
+    pkhbt   r11, r11, r9, lsl #16       ; r9 | r11
+
+    smlad   lr, r10, r5, lr
+    ldrb    r10, [r0], r2
+    smlad   r8, r11, r5, r8
+    ldrb    r11, [r0]
+
+    sub     r7, r7, #2
+    sub     r0, r0, r2, lsl #2
+
+    pkhbt   r9, r9, r10, lsl #16        ; r10 | r9
+    pkhbt   r10, r10, r11, lsl #16      ; r11 | r10
+
+    smlad   lr, r9, r6, lr
+    smlad   r10, r10, r6, r8
+
+    ands    r9, r7, #0xff
+
+    add     lr, lr, #0x40               ; round_shift_and_clamp
+    ldrneb  r8, [r0], r2                ; load data for next loop
+    usat    lr, #8, lr, asr #7
+    add     r10, r10, #0x40
+    strb    lr, [r1], r12               ; store the result for the column
+    usat    r10, #8, r10, asr #7
+
+    ldrneb  r9, [r0], r2
+    strb    r10, [r1], r12
+    ldrneb  r10, [r0], r2
+
+    bne     height_loop_2nd_only_6
+
+    ldr     r0, [sp]
+    ldr     r1, [sp, #4]
+    subs    r7, r7, #0x10000
+    add     r0, r0, #1                  ; move to filter next column
+    str     r0, [sp]
+    add     r1, r1, #1
+    str     r1, [sp, #4]
+
+    bne     width_loop_2nd_only_6
+
+    add     sp, sp, #8
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vp9_filter_block2d_second_pass_only_armv6|
+
+    END
diff --git a/vp9/common/arm/armv6/vp9_idct_v6.asm b/vp9/common/arm/armv6/vp9_idct_v6.asm
new file mode 100644
index 000000000..27215afcd
--- /dev/null
+++ b/vp9/common/arm/armv6/vp9_idct_v6.asm
@@ -0,0 +1,345 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+; + + +; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14 + EXPORT |vp8_short_idct4x4llm_1_v6| + EXPORT |vp8_short_idct4x4llm_v6| + EXPORT |vp8_short_idct4x4llm_v6_scott| + EXPORT |vp8_short_idct4x4llm_v6_dual| + + AREA |.text|, CODE, READONLY + +;******************************************************************************** +;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: 3/5 +;******************************************************************************** + +|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit + ; + ldrsh r0, [r0] ; load input[0] 1, r0 un 2 + add r0, r0, #4 ; 1 +4 + stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup + mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3 + pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack + mov r5, r4 ; expand expand + + strd r4, [r1], r2 ; *output = r0, post inc 1 + strd r4, [r1], r2 ; 1 + strd r4, [r1], r2 ; 1 + strd r4, [r1] ; 1 + ; + ldmia sp!, {r4, r5, pc} ; replace vars, return restore + ENDP ; |vp8_short_idct4x4llm_1_v6| +;******************************************************************************** +;******************************************************************************** +;******************************************************************************** + +;******************************************************************************** +;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: +;******************************************************************************** + +|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit + ; + stmdb sp!, {r4-r11, lr} ; backup registers 1 backup + ; + mov r4, #0x00004E00 ; 1 cst + orr r4, r4, #0x0000007B ; cospi8sqrt2minus1 + mov r5, #0x00008A00 ; 1 cst + orr r5, r5, #0x0000008C ; sinpi8sqrt2 + ; + mov r6, #4 ; i=4 1 i +loop1 ; + ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4] + ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12] + ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8] + ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0] + smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1 + smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2 + add r9, r7, r8 ; a1 = [0] + [8] 1 a1 + sub r7, r7, r8 ; b1 = [0] - [8] 1 b1 + add r11, r3, r11 ; temp2 1 + rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1 + smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2 + smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1 + add r8, r7, r11 ; b1 + c1 1 b+c + strh r8, [r1, r2] ; out[pitch] = b1+c1 1 + sub r7, r7, r11 ; b1 - c1 1 b-c + add r10, r12, r10 ; temp1 1 + add r3, r10, r3 ; d1 = temp1 + temp2 1 d1 + add r10, r9, r3 ; a1 + d1 1 a+d + sub r3, r9, r3 ; a1 - d1 1 a-d + add r8, r2, r2 ; pitch * 2 1 p*2 + strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1 + add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3 + strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1 + subs r6, r6, #1 ; i-- 1 -- + strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++ + bne loop1 ; if i>0, continue + ; + sub r1, r1, #8 ; set up out for next loop 1 -4 + ; for this iteration, input=prev output + mov r6, #4 ; i=4 1 i +; b returnfull +loop2 ; + ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1] + ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3] + ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2] + ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0] + smulwb r9, r5, r11 ; ([1] 
* sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1 + smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2 + add r7, r0, r3 ; a1 = [0] + [2] 1 a1 + sub r0, r0, r3 ; b1 = [0] - [2] 1 b1 + add r10, r8, r10 ; temp2 1 + rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1 + smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2 + smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1 + add r3, r0, r9 ; b1+c1 1 b+c + add r3, r3, #4 ; b1+c1+4 1 +4 + add r10, r11, r10 ; temp1 1 + mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3 + strh r3, [r1, #2] ; out[1] = b1+c1 1 + add r10, r10, r8 ; d1 = temp1 + temp2 1 d1 + add r3, r7, r10 ; a1+d1 1 a+d + add r3, r3, #4 ; a1+d1+4 1 +4 + sub r7, r7, r10 ; a1-d1 1 a-d + add r7, r7, #4 ; a1-d1+4 1 +4 + mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3 + mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3 + strh r7, [r1, #6] ; out[3] = a1-d1 1 + sub r0, r0, r9 ; b1-c1 1 b-c + add r0, r0, #4 ; b1-c1+4 1 +4 + subs r6, r6, #1 ; i-- 1 -- + mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3 + strh r0, [r1, #4] ; out[2] = b1-c1 1 + strh r3, [r1], r2 ; out[0] = a1+d1 1 +; add r1, r1, r2 ; out += pitch 1 ++ + bne loop2 ; if i>0, continue +returnfull ; + ldmia sp!, {r4 - r11, pc} ; replace vars, return restore + ENDP + +;******************************************************************************** +;******************************************************************************** +;******************************************************************************** + +;******************************************************************************** +;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: +;******************************************************************************** + +|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit +; mov r0, #0 ; +; ldr r0, [r0] ; + stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup + ; + mov r3, #0x00004E00 ; cos + orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + ; + mov r5, #0x2 ; i i + ; +short_idct4x4llm_v6_scott_loop1 ; + ldr r10, [r0, #(4*2)] ; i5 | i4 5,4 + ldr r11, [r0, #(12*2)] ; i13 | i12 13,12 + ; + smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1 + smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2 + ; + smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2 + smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1 + ; + add r6, r6, r7 ; partial c1 lt1-lt2 + add r12, r12, r14 ; partial d1 l2t2+l2t1 + ; + smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1 + smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2 + ; + smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1 + smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2 + ; + add r7, r14, r7 ; partial c1_2 ht1+ht2 + sub r8, r8, r9 ; partial d1_2 h2t1-h2t2 + ; + pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack + pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack + ; + usub16 r6, r6, r10 ; c1_2 | c1_1 c + uadd16 r12, r12, r11 ; d1_2 | d1_1 d + ; + ldr r10, [r0, #0] ; i1 | i0 1,0 + ldr r11, [r0, #(8*2)] ; i9 | i10 9,10 + ; +;;;;;; add r0, r0, #0x4 ; +4 +;;;;;; add r1, r1, #0x4 ; +4 + ; + uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a + usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b + ; + uadd16 r7, r8, r12 ; a1 + d1 pair a+d + usub16 r14, r8, r12 ; a1 - d1 pair a-d + ; + str r7, [r1] ; op[0] = 
a1 + d1 + str r14, [r1, r2] ; op[pitch*3] = a1 - d1 + ; + add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++ + add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++ + ; + subs r5, r5, #0x1 ; -- + bne short_idct4x4llm_v6_scott_loop1 ; + ; + sub r1, r1, #16 ; reset output ptr + mov r5, #0x4 ; + mov r0, r1 ; input = output + ; +short_idct4x4llm_v6_scott_loop2 ; + ; + subs r5, r5, #0x1 ; + bne short_idct4x4llm_v6_scott_loop2 ; + ; + ldmia sp!, {r4 - r11, pc} ; + ENDP ; + ; +;******************************************************************************** +;******************************************************************************** +;******************************************************************************** + +;******************************************************************************** +;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch) +;* r0 INT16 * input +;* r1 INT16 * output +;* r2 INT32 pitch +;* bench: +;******************************************************************************** + +|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit + ; + stmdb sp!, {r4-r11, lr} ; backup registers 1 backup + mov r3, #0x00004E00 ; cos + orr r3, r3, #0x0000007B ; cospi8sqrt2minus1 + mov r4, #0x00008A00 ; sin + orr r4, r4, #0x0000008C ; sinpi8sqrt2 + mov r5, #0x2 ; i=2 i +loop1_dual + ldr r6, [r0, #(4*2)] ; i5 | i4 5|4 + ldr r12, [r0, #(12*2)] ; i13 | i12 13|12 + ldr r14, [r0, #(8*2)] ; i9 | i8 9|8 + + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s + pkhbt r7, r7, r9, lsl #16 ; 5c | 4c + smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c + pkhbt r8, r8, r10, lsl #16 ; 5s | 4s + uadd16 r6, r6, r7 ; 5c+5 | 4c+4 + smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s + smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c + smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s + subs r5, r5, #0x1 ; i-- -- + pkhbt r9, r9, r11, lsl #16 ; 13c | 12c + ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0 + pkhbt r10, r10, r7, lsl #16 ; 13s | 12s + uadd16 r7, r12, r9 ; 13c+13 | 12c+12 + usub16 r7, r8, r7 ; c c + uadd16 r6, r6, r10 ; d d + uadd16 r10, r11, r14 ; a a + usub16 r8, r11, r14 ; b b + uadd16 r9, r10, r6 ; a+d a+d + usub16 r10, r10, r6 ; a-d a-d + uadd16 r6, r8, r7 ; b+c b+c + usub16 r7, r8, r7 ; b-c b-c + str r6, [r1, r2] ; o5 | o4 + add r6, r2, r2 ; pitch * 2 p2 + str r7, [r1, r6] ; o9 | o8 + add r6, r6, r2 ; pitch * 3 p3 + str r10, [r1, r6] ; o13 | o12 + str r9, [r1], #0x4 ; o1 | o0 ++ + bne loop1_dual ; + mov r5, #0x2 ; i=2 i + sub r0, r1, #8 ; reset input/output i/o +loop2_dual + ldr r6, [r0, r2] ; i5 | i4 5|4 + ldr r1, [r0] ; i1 | i0 1|0 + ldr r12, [r0, #0x4] ; i3 | i2 3|2 + add r14, r2, #0x4 ; pitch + 2 p+2 + ldr r14, [r0, r14] ; i7 | i6 7|6 + smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c + smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c + smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s + smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s + pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4 + pkhbt r7, r9, r7, lsl #16 ; 1c | 5c + pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1 + pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5 + uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2 + pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6 + uadd16 r10, r11, r9 ; a a + usub16 r9, r11, r9 ; b b + pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7 + subs r5, r5, #0x1 ; i-- -- + smulwt r7, r3, r6 ; (ip[3] * 
cospi8sqrt2minus1) >> 16 3c + smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s + smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c + smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s + + pkhbt r7, r12, r7, lsl #16 ; 3c | 7c + pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1 + uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2 + usub16 r12, r8, r6 ; c (o1 | o5) c + uadd16 r6, r11, r1 ; d (o3 | o7) d + uadd16 r7, r10, r6 ; a+d a+d + mov r8, #0x4 ; set up 4's 4 + orr r8, r8, #0x40000 ; 4|4 + usub16 r6, r10, r6 ; a-d a-d + uadd16 r6, r6, r8 ; a-d+4 3|7 + uadd16 r7, r7, r8 ; a+d+4 0|4 + uadd16 r10, r9, r12 ; b+c b+c + usub16 r1, r9, r12 ; b-c b-c + uadd16 r10, r10, r8 ; b+c+4 1|5 + uadd16 r1, r1, r8 ; b-c+4 2|6 + mov r8, r10, asr #19 ; o1 >> 3 + strh r8, [r0, #2] ; o1 + mov r8, r1, asr #19 ; o2 >> 3 + strh r8, [r0, #4] ; o2 + mov r8, r6, asr #19 ; o3 >> 3 + strh r8, [r0, #6] ; o3 + mov r8, r7, asr #19 ; o0 >> 3 + strh r8, [r0], r2 ; o0 +p + sxth r10, r10 ; + mov r8, r10, asr #3 ; o5 >> 3 + strh r8, [r0, #2] ; o5 + sxth r1, r1 ; + mov r8, r1, asr #3 ; o6 >> 3 + strh r8, [r0, #4] ; o6 + sxth r6, r6 ; + mov r8, r6, asr #3 ; o7 >> 3 + strh r8, [r0, #6] ; o7 + sxth r7, r7 ; + mov r8, r7, asr #3 ; o4 >> 3 + strh r8, [r0], r2 ; o4 +p +;;;;; subs r5, r5, #0x1 ; i-- -- + bne loop2_dual ; + ; + ldmia sp!, {r4 - r11, pc} ; replace vars, return restore + ENDP + + END diff --git a/vp9/common/arm/armv6/vp9_iwalsh_v6.asm b/vp9/common/arm/armv6/vp9_iwalsh_v6.asm new file mode 100644 index 000000000..463bff0f5 --- /dev/null +++ b/vp9/common/arm/armv6/vp9_iwalsh_v6.asm @@ -0,0 +1,152 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + EXPORT |vp8_short_inv_walsh4x4_v6| + EXPORT |vp8_short_inv_walsh4x4_1_v6| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_v6(short *input, short *output) +|vp8_short_inv_walsh4x4_v6| PROC + + stmdb sp!, {r4 - r11, lr} + + ldr r2, [r0], #4 ; [1 | 0] + ldr r3, [r0], #4 ; [3 | 2] + ldr r4, [r0], #4 ; [5 | 4] + ldr r5, [r0], #4 ; [7 | 6] + ldr r6, [r0], #4 ; [9 | 8] + ldr r7, [r0], #4 ; [11 | 10] + ldr r8, [r0], #4 ; [13 | 12] + ldr r9, [r0] ; [15 | 14] + + qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] + qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] + qsub16 r12, r4, r6 ; c1 [5-9 | 4-8] + qsub16 lr, r2, r8 ; d1 [1-13 | 0-12] + + qadd16 r2, r10, r11 ; a1 + b1 [1 | 0] + qadd16 r4, r12, lr ; c1 + d1 [5 | 4] + qsub16 r6, r10, r11 ; a1 - b1 [9 | 8] + qsub16 r8, lr, r12 ; d1 - c1 [13 | 12] + + qadd16 r10, r3, r9 ; a1 [3+15 | 2+14] + qadd16 r11, r5, r7 ; b1 [7+11 | 6+10] + qsub16 r12, r5, r7 ; c1 [7-11 | 6-10] + qsub16 lr, r3, r9 ; d1 [3-15 | 2-14] + + qadd16 r3, r10, r11 ; a1 + b1 [3 | 2] + qadd16 r5, r12, lr ; c1 + d1 [7 | 6] + qsub16 r7, r10, r11 ; a1 - b1 [11 | 10] + qsub16 r9, lr, r12 ; d1 - c1 [15 | 14] + + ; first transform complete + + qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] + qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] + qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] + qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] + + qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1] + ldr r10, c0x00030003 + qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r2, r2, r10 ; [b2+3|c2+3] + qadd16 r3, r3, r10 ; [a2+3|d2+3] + qadd16 r4, r4, r10 ; [b2+3|c2+3] + qadd16 r5, r5, r10 ; [a2+3|d2+3] + + asr r12, r2, #3 ; [1 | x] + pkhtb r12, r12, r3, asr #19; [1 | 0] + lsl lr, r3, #16 ; [~3 | x] + lsl r2, r2, #16 ; [~2 | x] + asr lr, lr, #3 ; [3 | x] + pkhtb lr, lr, r2, asr #19 ; [3 | 2] + + asr r2, r4, #3 ; [5 | x] + pkhtb r2, r2, r5, asr #19 ; [5 | 4] + lsl r3, r5, #16 ; [~7 | x] + lsl r4, r4, #16 ; [~6 | x] + asr r3, r3, #3 ; [7 | x] + pkhtb r3, r3, r4, asr #19 ; [7 | 6] + + str r12, [r1], #4 + str lr, [r1], #4 + str r2, [r1], #4 + str r3, [r1], #4 + + qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] + qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] + qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15] + qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15] + + qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1] + qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1] + qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1] + + qadd16 r6, r6, r10 ; [b2+3|c2+3] + qadd16 r7, r7, r10 ; [a2+3|d2+3] + qadd16 r8, r8, r10 ; [b2+3|c2+3] + qadd16 r9, r9, r10 ; [a2+3|d2+3] + + asr r2, r6, #3 ; [9 | x] + pkhtb r2, r2, r7, asr #19 ; [9 | 8] + lsl r3, r7, #16 ; [~11| x] + lsl r4, r6, #16 ; [~10| x] + asr r3, r3, #3 ; [11 | x] + pkhtb r3, r3, r4, asr #19 ; [11 | 10] + + asr r4, r8, #3 ; [13 | x] + pkhtb r4, r4, r9, asr #19 ; [13 | 12] + lsl r5, r9, #16 ; [~15| x] + lsl r6, r8, #16 ; [~14| x] + asr r5, r5, #3 ; [15 | x] + pkhtb r5, r5, r6, asr #19 ; [15 | 14] + + str r2, [r1], #4 + str r3, [r1], #4 + str r4, [r1], #4 + str r5, [r1] + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_short_inv_walsh4x4_v6| + + +;short vp8_short_inv_walsh4x4_1_v6(short *input, short *output) +|vp8_short_inv_walsh4x4_1_v6| PROC + + ldrsh r2, [r0] ; [0] + add r2, r2, #3 ; [0] + 3 + asr r2, r2, #3 ; a1 ([0]+3) >> 3 + lsl r2, r2, #16 ; [a1 | x] + orr r2, r2, r2, lsr #16 ; [a1 | a1] + + str r2, [r1], #4 
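+    ; DC-only inverse Walsh: all 16 outputs equal (input[0] + 3) >> 3, so
+    ; the packed halfword pair in r2 is simply stored seven more times
+    ; below (two outputs per word store).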
+ str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1], #4 + str r2, [r1] + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_1_v6| + +; Constant Pool +c0x00030003 DCD 0x00030003 + END diff --git a/vp9/common/arm/armv6/vp9_loopfilter_v6.asm b/vp9/common/arm/armv6/vp9_loopfilter_v6.asm new file mode 100644 index 000000000..37b54a39c --- /dev/null +++ b/vp9/common/arm/armv6/vp9_loopfilter_v6.asm @@ -0,0 +1,1282 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_loop_filter_horizontal_edge_armv6| + EXPORT |vp9_mbloop_filter_horizontal_edge_armv6| + EXPORT |vp9_loop_filter_vertical_edge_armv6| + EXPORT |vp9_mbloop_filter_vertical_edge_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code + + MACRO + TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 + ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 + ; a0: 03 02 01 00 + ; a1: 13 12 11 10 + ; a2: 23 22 21 20 + ; a3: 33 32 31 30 + ; b3 b2 b1 b0 + + uxtb16 $b1, $a1 ; xx 12 xx 10 + uxtb16 $b0, $a0 ; xx 02 xx 00 + uxtb16 $b3, $a3 ; xx 32 xx 30 + uxtb16 $b2, $a2 ; xx 22 xx 20 + orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 + orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 + + uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 + uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 + uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 + uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 + orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 + orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 + + pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 + pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 + + pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 + pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 + MEND + + +src RN r0 +pstep RN r1 +count RN r5 + +;r0 unsigned char *src_ptr, +;r1 int src_pixel_step, +;r2 const char *blimit, +;r3 const char *limit, +;stack const char *thresh, +;stack int count + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp9_loop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldrb r4, [r2] ; blimit + ldr r10, [src], pstep ; p2 + ldrb r2, [r3] ; limit + ldr r11, [src], pstep ; p1 + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|Hnext8| + ; vp9_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp9_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + orr lr, lr, r10 + sub src, src, pstep, lsl #2 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq hskip_filter ; skip filtering + + sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines + + ;vp8_hevmask() function + ;calculate high edge variance + orr r10, r6, r8 ; calculate vp8_hevmask + + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 ; use usub8 instead of ssub8 + sel r6, r12, r11 ; obtain vp8_hevmask: r6 + + ;vp9_filter() function + ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp9_filter (r7) &= hev + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + and r7, r7, lr ; vp9_filter &= mask; + + ;modify code for vp8 -- Filter1 = vp9_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3) + qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp9_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: Filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8 ,r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter) + 
qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2) + + ;end of modification for vp8 + + mov lr, #0 + sadd8 r7, r7 , r10 ; vp9_filter += 1 + shadd8 r7, r7, lr ; vp9_filter >>= 1 + + ldr r11, [sp, #12] ; load ps1 + ldr r10, [sp, #8] ; load qs1 + + bic r7, r7, r6 ; vp9_filter &= ~hev + sub src, src, pstep, lsl #2 + + qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter) + qsub8 r10, r10,r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter) + + eor r11, r11, r12 ; *op1 = u^0x80 + str r11, [src], pstep ; store op1 + eor r9, r9, r12 ; *op0 = u^0x80 + str r9, [src], pstep ; store op0 result + eor r8, r8, r12 ; *oq0 = u^0x80 + str r8, [src], pstep ; store oq0 result + eor r10, r10, r12 ; *oq1 = u^0x80 + str r10, [src], pstep ; store oq1 + + sub src, src, pstep, lsl #1 + +|hskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #2 + + subs count, count, #1 + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne Hnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp9_loop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_horizontal_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r6, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r9, [src], pstep ; p3 + ldrb r4, [r2] ; blimit + ldr r10, [src], pstep ; p2 + ldrb r2, [r3] ; limit + ldr r11, [src], pstep ; p1 + orr r4, r4, r4, lsl #8 + ldrb r3, [r6] ; thresh + orr r2, r2, r2, lsl #8 + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|MBHnext8| + + ; vp9_filter_mask() function + ; calculate breakout conditions + ldr r12, [src], pstep ; p0 + + uqsub8 r6, r9, r10 ; p3 - p2 + uqsub8 r7, r10, r9 ; p2 - p3 + uqsub8 r8, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + + orr r6, r6, r7 ; abs (p3-p2) + orr r8, r8, r10 ; abs (p2-p1) + uqsub8 lr, r6, r2 ; compare to limit. 
lr: vp9_filter_mask + uqsub8 r8, r8, r2 ; compare to limit + + uqsub8 r6, r11, r12 ; p1 - p0 + orr lr, lr, r8 + uqsub8 r7, r12, r11 ; p0 - p1 + ldr r9, [src], pstep ; q0 + ldr r10, [src], pstep ; q1 + orr r6, r6, r7 ; abs (p1-p0) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later + orr lr, lr, r7 + + uqsub8 r6, r11, r10 ; p1 - q1 + uqsub8 r7, r10, r11 ; q1 - p1 + uqsub8 r11, r12, r9 ; p0 - q0 + uqsub8 r12, r9, r12 ; q0 - p0 + orr r6, r6, r7 ; abs (p1-q1) + ldr r7, c0x7F7F7F7F + orr r12, r11, r12 ; abs (p0-q0) + ldr r11, [src], pstep ; q2 + uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 + and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r7, r9, r10 ; q0 - q1 + uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r6, r10, r9 ; q1 - q0 + uqsub8 r12, r12, r4 ; compare to flimit + uqsub8 r9, r11, r10 ; q2 - q1 + + orr lr, lr, r12 + + ldr r12, [src], pstep ; q3 + + uqsub8 r10, r10, r11 ; q1 - q2 + orr r6, r7, r6 ; abs (q1-q0) + orr r10, r9, r10 ; abs (q2-q1) + uqsub8 r7, r6, r2 ; compare to limit + uqsub8 r10, r10, r2 ; compare to limit + uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later + orr lr, lr, r7 + orr lr, lr, r10 + + uqsub8 r10, r12, r11 ; q3 - q2 + uqsub8 r9, r11, r12 ; q2 - q3 + + mvn r11, #0 ; r11 == -1 + + orr r10, r10, r9 ; abs (q3-q2) + uqsub8 r10, r10, r2 ; compare to limit + + mov r12, #0 + + orr lr, lr, r10 + + usub8 lr, r12, lr ; use usub8 instead of ssub8 + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbhskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines + sub src, src, pstep, lsl #1 + + orr r10, r6, r8 + ldr r7, [src], pstep ; p1 + + usub8 r10, r12, r10 + sel r6, r12, r11 ; hev mask: r6 + + ;vp8_mbfilter() function + ;p2, q2 are only needed at the end. Don't need to load them in now. 
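+    ; The base filter value computed below is
+    ;     F = clamp(clamp(ps1 - qs1) + 3 * (qs0 - ps0)) & mask
+    ; Where hev is set, F gets the usual clamp(F+4)>>3 / clamp(F+3)>>3
+    ; adjustments to q0/p0; the remainder (F & ~hev) is then tapered
+    ; across three pixel pairs by the 27/18/9 sections further down.
+    ; Per pixel (C-style sketch, illustrative only):
+    ;     u = clamp8s((63 + F * w) >> 7)    ; w = 27, 18, 9
+    ; with op = clamp8s(p + u) and oq = clamp8s(q - u).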
+ ldr r8, [src], pstep ; p0 + ldr r12, c0x80808080 + ldr r9, [src], pstep ; q0 + ldr r10, [src] ; q1 + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp9_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + + and r7, r7, lr ; vp9_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ; vp9_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + str r8, [src] ; store *oq0 + sub src, src, pstep + eor r10, r10, lr ; *op0 = s^0x80 + str r10, [src] ; store *op0 + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load 
ps1 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep + + orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7) + + qadd8 r11, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u) + qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u) + eor r11, r11, lr ; *op1 = s^0x80 + str r11, [src], pstep ; store *op1 + eor r8, r8, lr ; *oq1 = s^0x80 + add src, src, pstep, lsl #1 + + mov r7, #0x3f ; 63 + + str r8, [src], pstep ; store *oq1 + + ;roughly 1/7th difference across boundary + mov lr, #0x9 ; 9 + ldr r9, [src] ; load q2 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + sub src, src, pstep + ldr lr, c0x80808080 + + ldr r11, [src] ; load p2 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + eor r9, r9, lr + eor r11, r11, lr + + orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + str r8, [src], pstep, lsl #2 ; store *op2 + add src, src, pstep + eor r10, r10, lr ; *oq2 = s^0x80 + str r10, [src], pstep, lsl #1 ; store *oq2 + +|mbhskip_filter| + add src, src, #4 + sub src, src, pstep, lsl #3 + subs count, count, #1 + + ldrne r9, [src], pstep ; p3 + ldrne r10, [src], pstep ; p2 + ldrne r11, [src], pstep ; p1 + + bne MBHnext8 + + add sp, sp, #16 + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp9_loop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldrb r4, [r2] ; blimit + ldr r7, [src], pstep + ldrb r2, [r3] ; limit + ldr r8, [src], pstep + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|Vnext8| + + ; vp9_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). 
Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq vskip_filter ; skip filtering + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + ;vp9_filter() function + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + ; Transpose needs 8 regs(r6 - r12, and lr). 
Save r6 and lr first + str r6, [sp] + str lr, [sp, #4] + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to r7, r8, r9, r10 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; p1 offset to convert to a signed value + eor r8, r8, r12 ; p0 offset to convert to a signed value + eor r9, r9, r12 ; q0 offset to convert to a signed value + eor r10, r10, r12 ; q1 offset to convert to a signed value + + str r9, [sp] ; store qs0 temporarily + str r8, [sp, #4] ; store ps0 temporarily + str r10, [sp, #8] ; store qs1 temporarily + str r7, [sp, #12] ; store ps1 temporarily + + qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) + qsub8 r8, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) + + and r7, r7, r6 ; vp9_filter (r7) &= hev (r7 : filter) + + qadd8 r7, r7, r8 + ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 + + qadd8 r7, r7, r8 + ldr r10, c0x04040404 + + qadd8 r7, r7, r8 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp9_filter &= mask + + ;modify code for vp8 -- Filter1 = vp9_filter (r7) + qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp9_signed_char_clamp(vp9_filter+3) + qadd8 r7 , r7 , r10 ; vp9_filter = vp9_signed_char_clamp(vp9_filter+4) + + mov r9, #0 + shadd8 r8 , r8 , r9 ; Filter2 >>= 3 + shadd8 r7 , r7 , r9 ; vp9_filter >>= 3 + shadd8 r8 , r8 , r9 + shadd8 r7 , r7 , r9 + shadd8 lr , r8 , r9 ; lr: filter2 + shadd8 r7 , r7 , r9 ; r7: filter + + ;usub8 lr, r8, r10 ; s = (s==4)*-1 + ;sel lr, r11, r9 + ;usub8 r8, r10, r8 + ;sel r8, r11, r9 + ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s + + ;calculate output + ;qadd8 lr, r8, r7 ; u = vp9_signed_char_clamp(s + vp9_filter) + + ldr r8, [sp] ; load qs0 + ldr r9, [sp, #4] ; load ps0 + + ldr r10, c0x01010101 + + qsub8 r8, r8, r7 ; u = vp9_signed_char_clamp(qs0 - vp9_filter) + qadd8 r9, r9, lr ; u = vp9_signed_char_clamp(ps0 + Filter2) + ;end of modification for vp8 + + eor r8, r8, r12 + eor r9, r9, r12 + + mov lr, #0 + + sadd8 r7, r7, r10 + shadd8 r7, r7, lr + + ldr r10, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + + bic r7, r7, r6 ; r7: vp9_filter + + qsub8 r10 , r10, r7 ; u = vp9_signed_char_clamp(qs1 - vp9_filter) + qadd8 r11, r11, r7 ; u = vp9_signed_char_clamp(ps1 + vp9_filter) + eor r10, r10, r12 + eor r11, r11, r12 + + sub src, src, pstep, lsl #2 + + ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 + ;output is b0, b1, b2, b3 + ;b0: 03 02 01 00 + ;b1: 13 12 11 10 + ;b2: 23 22 21 20 + ;b3: 33 32 31 30 + ; p1 p0 q0 q1 + ; (a3 a2 a1 a0) + TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr + + strh r6, [src, #-2] ; store the result + mov r6, r6, lsr #16 + strh r6, [src], pstep + + strh r7, [src, #-2] + mov r7, r7, lsr #16 + strh r7, [src], pstep + + strh r12, [src, #-2] + mov r12, r12, lsr #16 + strh r12, [src], pstep + + strh lr, [src, #-2] + mov lr, lr, lsr #16 + strh lr, [src], pstep + +|vskip_filter| + sub src, src, #4 + subs count, count, #1 + + ldrne r6, [src], pstep ; load source data + ldrne r7, [src], pstep + ldrne r8, [src], pstep + ldrne lr, [src], pstep + + bne Vnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - r11, pc} + ENDP ; |vp9_loop_filter_vertical_edge_armv6| + + + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +|vp8_mbloop_filter_vertical_edge_armv6| PROC +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- + stmdb sp!, {r4 - r11, lr} + + sub src, 
src, #4 ; move src pointer down by 4 + ldr count, [sp, #40] ; count for 8-in-parallel + ldr r12, [sp, #36] ; load thresh address + pld [src, #23] ; preload for next block + sub sp, sp, #16 ; create temp buffer + + ldr r6, [src], pstep ; load source data + ldrb r4, [r2] ; blimit + pld [src, #23] + ldr r7, [src], pstep + ldrb r2, [r3] ; limit + pld [src, #23] + ldr r8, [src], pstep + orr r4, r4, r4, lsl #8 + ldrb r3, [r12] ; thresh + orr r2, r2, r2, lsl #8 + pld [src, #23] + ldr lr, [src], pstep + mov count, count, lsl #1 ; 4-in-parallel + orr r4, r4, r4, lsl #16 + orr r3, r3, r3, lsl #8 + orr r2, r2, r2, lsl #16 + orr r3, r3, r3, lsl #16 + +|MBVnext8| + ; vp9_filter_mask() function + ; calculate breakout conditions + ; transpose the source data for 4-in-parallel operation + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + uqsub8 r7, r9, r10 ; p3 - p2 + uqsub8 r8, r10, r9 ; p2 - p3 + uqsub8 r9, r10, r11 ; p2 - p1 + uqsub8 r10, r11, r10 ; p1 - p2 + orr r7, r7, r8 ; abs (p3-p2) + orr r10, r9, r10 ; abs (p2-p1) + uqsub8 lr, r7, r2 ; compare to limit. lr: vp9_filter_mask + uqsub8 r10, r10, r2 ; compare to limit + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr lr, lr, r10 + + uqsub8 r6, r11, r12 ; p1 - p0 + uqsub8 r7, r12, r11 ; p0 - p1 + add src, src, #4 ; move src pointer up by 4 + orr r6, r6, r7 ; abs (p1-p0) + str r11, [sp, #12] ; save p1 + uqsub8 r10, r6, r2 ; compare to limit + uqsub8 r11, r6, r3 ; compare to thresh + orr lr, lr, r10 + + ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now + ; transpose the source data for 4-in-parallel operation + ldr r6, [src], pstep ; load source data + str r11, [sp] ; push r11 to stack + ldr r7, [src], pstep + str r12, [sp, #4] ; save current reg before load q0 - q3 data + ldr r8, [src], pstep + str lr, [sp, #8] + ldr lr, [src], pstep + + + TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 + + ldr lr, [sp, #8] ; load back (f)limit accumulator + + uqsub8 r6, r12, r11 ; q3 - q2 + uqsub8 r7, r11, r12 ; q2 - q3 + uqsub8 r12, r11, r10 ; q2 - q1 + uqsub8 r11, r10, r11 ; q1 - q2 + orr r6, r6, r7 ; abs (q3-q2) + orr r7, r12, r11 ; abs (q2-q1) + uqsub8 r6, r6, r2 ; compare to limit + uqsub8 r7, r7, r2 ; compare to limit + ldr r11, [sp, #4] ; load back p0 + ldr r12, [sp, #12] ; load back p1 + orr lr, lr, r6 + orr lr, lr, r7 + + uqsub8 r6, r11, r9 ; p0 - q0 + uqsub8 r7, r9, r11 ; q0 - p0 + uqsub8 r8, r12, r10 ; p1 - q1 + uqsub8 r11, r10, r12 ; q1 - p1 + orr r6, r6, r7 ; abs (p0-q0) + ldr r7, c0x7F7F7F7F + orr r8, r8, r11 ; abs (p1-q1) + uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 + and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 + uqsub8 r11, r10, r9 ; q1 - q0 + uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 + uqsub8 r12, r9, r10 ; q0 - q1 + uqsub8 r6, r6, r4 ; compare to flimit + + orr r9, r11, r12 ; abs (q1-q0) + uqsub8 r8, r9, r2 ; compare to limit + uqsub8 r10, r9, r3 ; compare to thresh + orr lr, lr, r6 + orr lr, lr, r8 + + mvn r11, #0 ; r11 == -1 + mov r12, #0 + + usub8 lr, r12, lr + ldr r9, [sp] ; load the compared result + sel lr, r11, r12 ; filter mask: lr + + cmp lr, #0 + beq mbvskip_filter ; skip filtering + + + + ;vp8_hevmask() function + ;calculate high edge variance + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r9, r9, r10 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + usub8 r9, r12, r9 + sel r6, r12, r11 ; hev mask: r6 + + + ; vp8_mbfilter() function + ; p2, q2 are only needed at the end. Don't need to load them in now. + ; Transpose needs 8 regs(r6 - r12, and lr). 
Save r6 and lr first + ; load soure data to r6, r11, r12, lr + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + pkhbt r12, r7, r8, lsl #16 + + ldrh r7, [src, #-2] + ldrh r8, [src], pstep + + pkhbt r11, r9, r10, lsl #16 + + ldrh r9, [src, #-2] + ldrh r10, [src], pstep + + str r6, [sp] ; save r6 + str lr, [sp, #4] ; save lr + + pkhbt r6, r7, r8, lsl #16 + pkhbt lr, r9, r10, lsl #16 + + ;transpose r12, r11, r6, lr to p1, p0, q0, q1 + TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 + + ;load back hev_mask r6 and filter_mask lr + ldr r12, c0x80808080 + ldr r6, [sp] + ldr lr, [sp, #4] + + eor r7, r7, r12 ; ps1 + eor r8, r8, r12 ; ps0 + eor r9, r9, r12 ; qs0 + eor r10, r10, r12 ; qs1 + + qsub8 r12, r9, r8 ; vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) + str r7, [sp, #12] ; store ps1 temporarily + qsub8 r7, r7, r10 ; vp9_signed_char_clamp(ps1-qs1) + str r10, [sp, #8] ; store qs1 temporarily + qadd8 r7, r7, r12 + str r9, [sp] ; store qs0 temporarily + qadd8 r7, r7, r12 + str r8, [sp, #4] ; store ps0 temporarily + qadd8 r7, r7, r12 ; vp9_filter: r7 + + ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 + ldr r9, c0x04040404 + ;mvn r11, #0 ; r11 == -1 + + and r7, r7, lr ; vp9_filter &= mask (lr is free) + + mov r12, r7 ; Filter2: r12 + and r12, r12, r6 ; Filter2 &= hev + + ;modify code for vp8 + ;save bottom 3 bits so that we round one side +4 and the other +3 + qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp9_signed_char_clamp(Filter2+4) + qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp9_signed_char_clamp(Filter2+3) + + mov r10, #0 + shadd8 r8 , r8 , r10 ; Filter1 >>= 3 + shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + shadd8 r8 , r8 , r10 + shadd8 r12 , r12 , r10 + shadd8 r8 , r8 , r10 ; r8: Filter1 + shadd8 r12 , r12 , r10 ; r12: Filter2 + + ldr r9, [sp] ; load qs0 + ldr r11, [sp, #4] ; load ps0 + + qsub8 r9 , r9, r8 ; qs0 = vp9_signed_char_clamp(qs0 - Filter1) + qadd8 r11, r11, r12 ; ps0 = vp9_signed_char_clamp(ps0 + Filter2) + + ;save bottom 3 bits so that we round one side +4 and the other +3 + ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) + ;qadd8 r12 , r12 , r9 ; Filter2 = vp9_signed_char_clamp(Filter2+4) + ;mov r10, #0 + ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 + ;usub8 lr, r8, r9 ; s = (s==4)*-1 + ;sel lr, r11, r10 + ;shadd8 r12 , r12 , r10 + ;usub8 r8, r9, r8 + ;sel r8, r11, r10 + ;ldr r9, [sp] ; load qs0 + ;ldr r11, [sp, #4] ; load ps0 + ;shadd8 r12 , r12 , r10 + ;and r8, r8, lr ; -1 for each element that equals 4 + ;qadd8 r10, r8, r12 ; u = vp9_signed_char_clamp(s + Filter2) + ;qsub8 r9 , r9, r12 ; qs0 = vp9_signed_char_clamp(qs0 - Filter2) + ;qadd8 r11, r11, r10 ; ps0 = vp9_signed_char_clamp(ps0 + u) + + ;end of modification for vp8 + + bic r12, r7, r6 ;vp9_filter &= ~hev ( r6 is free) + ;mov r12, r7 + + ;roughly 3/7th difference across boundary + mov lr, #0x1b ; 27 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r7, r10, lr, r7 + smultb r10, r10, lr + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + add r10, r10, #63 + ssat r7, #8, r7, asr #7 + ssat r10, #8, r10, asr #7 + + ldr lr, c0x80808080 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r7, r10, lsl #16 + uxtb16 r6, r6 + uxtb16 r10, r10 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 27)>>7) + + qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs0 - u) + qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps0 + u) + eor r8, r8, lr ; *oq0 = s^0x80 + eor r10, r10, lr ; *op0 = s^0x80 + + strb 
r10, [src, #-1] ; store op0 result + strb r8, [src], pstep ; store oq0 result + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + strb r10, [src, #-1] + strb r8, [src], pstep + + ;roughly 2/7th difference across boundary + mov lr, #0x12 ; 18 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r9, r10, lr, r7 + + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r9, #8, r9, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r9, r10, lsl #16 + + ldr r9, [sp, #8] ; load qs1 + ldr r11, [sp, #12] ; load ps1 + ldr lr, c0x80808080 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + add src, src, #2 + + orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 18)>>7) + + qsub8 r8, r9, r10 ; s = vp9_signed_char_clamp(qs1 - u) + qadd8 r10, r11, r10 ; s = vp9_signed_char_clamp(ps1 + u) + eor r8, r8, lr ; *oq1 = s^0x80 + eor r10, r10, lr ; *op1 = s^0x80 + + ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary + strb r10, [src, #-4] ; store op1 + strb r8, [src, #-1] ; store oq1 + ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #8 + orr r9, r9, r7, lsl #8 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + + mov r10, r10, lsr #8 + mov r8, r8, lsr #8 + orr r11, r11, r6, lsl #16 + orr r9, r9, r7, lsl #16 + + ldrb r6, [src, #-5] + strb r10, [src, #-4] + strb r8, [src, #-1] + ldrb r7, [src], pstep + orr r11, r11, r6, lsl #24 + orr r9, r9, r7, lsl #24 + + ;roughly 1/7th difference across boundary + eor r9, r9, lr + eor r11, r11, lr + + mov lr, #0x9 ; 9 + mov r7, #0x3f ; 63 + + sxtb16 r6, r12 + sxtb16 r10, r12, ror #8 + smlabb r8, r6, lr, r7 + smlatb r6, r6, lr, r7 + smlabb r12, r10, lr, r7 + smlatb r10, r10, lr, r7 + ssat r8, #8, r8, asr #7 + ssat r6, #8, r6, asr #7 + ssat r12, #8, r12, asr #7 + ssat r10, #8, r10, asr #7 + + sub src, src, pstep, lsl #2 + + pkhbt r6, r8, r6, lsl #16 + pkhbt r10, r12, r10, lsl #16 + + uxtb16 r6, r6 + uxtb16 r10, r10 + + ldr lr, c0x80808080 + + orr r10, r6, r10, lsl #8 ; u = vp9_signed_char_clamp((63 + Filter2 * 9)>>7) + + qadd8 r8, r11, r10 ; s = vp9_signed_char_clamp(ps2 + u) + qsub8 r10, r9, r10 ; s = vp9_signed_char_clamp(qs2 - u) + eor r8, r8, lr ; *op2 = s^0x80 + eor r10, r10, lr ; *oq2 = s^0x80 + + strb r8, [src, #-5] ; store *op2 + strb r10, [src], pstep ; store *oq2 + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + mov r8, r8, lsr #8 + mov r10, r10, lsr #8 + strb r8, [src, #-5] + strb r10, [src], pstep + + ;adjust src pointer for next loop + sub src, src, #2 + +|mbvskip_filter| + sub src, src, #4 + subs count, count, #1 + + pld [src, #23] ; preload for next block + ldrne r6, [src], pstep ; load source data + pld [src, #23] + ldrne r7, [src], pstep + pld [src, #23] + ldrne r8, [src], pstep + pld [src, #23] + ldrne lr, [src], pstep + + bne MBVnext8 + + add sp, sp, #16 + + ldmia sp!, {r4 - 
r11, pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| + +; Constant Pool +c0x80808080 DCD 0x80808080 +c0x03030303 DCD 0x03030303 +c0x04040404 DCD 0x04040404 +c0x01010101 DCD 0x01010101 +c0x7F7F7F7F DCD 0x7F7F7F7F + + END diff --git a/vp9/common/arm/armv6/vp9_recon_v6.asm b/vp9/common/arm/armv6/vp9_recon_v6.asm new file mode 100644 index 000000000..99c7bcf2d --- /dev/null +++ b/vp9/common/arm/armv6/vp9_recon_v6.asm @@ -0,0 +1,281 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_recon_b_armv6| + EXPORT |vp8_recon2b_armv6| + EXPORT |vp8_recon4b_armv6| + + AREA |.text|, CODE, READONLY ; name this block of code +prd RN r0 +dif RN r1 +dst RN r2 +stride RN r3 + +;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride) +; R0 char* pred_ptr +; R1 short * dif_ptr +; R2 char * dst_ptr +; R3 int stride + +; Description: +; Loop through the block adding the Pred and Diff together. Clamp and then +; store back into the Dst. + +; Restrictions : +; all buffers are expected to be 4 byte aligned coming in and +; going out. +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; +; +; +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp8_recon_b_armv6| PROC + stmdb sp!, {r4 - r9, lr} + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + add dif, dif, #32 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 +;; ldr r6, [dif, #8] ; 1 | 0 +;; ldr r7, [dif, #12] ; 3 | 2 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + add dif, dif, #32 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 +;; ldr r6, [dif, #16] ; 1 | 0 +;; ldr r7, [dif, #20] ; 3 | 2 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + add dif, dif, #32 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ;0, 1, 2, 3 + ldr r4, [prd], #16 ; 3 | 2 | 1 | 0 +;; ldr r6, [dif, #24] ; 1 | 0 +;; ldr r7, [dif, #28] ; 3 | 2 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst], stride + + ldmia sp!, {r4 - r9, pc} + + ENDP ; |recon_b| + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; +; +; 
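+; Description:
+; Same add-prediction-and-clamp operation as recon_b above, but over a
+; 16-byte-wide block, four rows high: each uxtab16/usat16 pair below
+; computes dst = clamp(pred + diff) into [0, 255] for two pixels at a
+; time, handling even and odd bytes in parallel before the final orr
+; re-interleaves them.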
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; R0 char *pred_ptr +; R1 short *dif_ptr +; R2 char *dst_ptr +; R3 int stride +|vp8_recon4b_armv6| PROC + stmdb sp!, {r4 - r9, lr} + + mov lr, #4 + +recon4b_loop + ;0, 1, 2, 3 + ldr r4, [prd], #4 ; 3 | 2 | 1 | 0 + ldr r6, [dif, #0] ; 1 | 0 + ldr r7, [dif, #4] ; 3 | 2 + + pkhbt r8, r6, r7, lsl #16 ; 2 | 0 + pkhtb r9, r7, r6, asr #16 ; 3 | 1 + + uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0 + uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1 + + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst] + + ;4, 5, 6, 7 + ldr r4, [prd], #4 +;; ldr r6, [dif, #32] +;; ldr r7, [dif, #36] + ldr r6, [dif, #8] + ldr r7, [dif, #12] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #4] + + ;8, 9, 10, 11 + ldr r4, [prd], #4 +;; ldr r6, [dif, #64] +;; ldr r7, [dif, #68] + ldr r6, [dif, #16] + ldr r7, [dif, #20] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #8] + + ;12, 13, 14, 15 + ldr r4, [prd], #4 +;; ldr r6, [dif, #96] +;; ldr r7, [dif, #100] + ldr r6, [dif, #24] + ldr r7, [dif, #28] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #12] + + add dst, dst, stride +;; add dif, dif, #8 + add dif, dif, #32 + + subs lr, lr, #1 + bne recon4b_loop + + ldmia sp!, {r4 - r9, pc} + + ENDP ; |Recon4B| + +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; +; +; +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +; R0 char *pred_ptr +; R1 short *dif_ptr +; R2 char *dst_ptr +; R3 int stride +|vp8_recon2b_armv6| PROC + stmdb sp!, {r4 - r9, lr} + + mov lr, #4 + +recon2b_loop + ;0, 1, 2, 3 + ldr r4, [prd], #4 + ldr r6, [dif, #0] + ldr r7, [dif, #4] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst] + + ;4, 5, 6, 7 + ldr r4, [prd], #4 +;; ldr r6, [dif, #32] +;; ldr r7, [dif, #36] + ldr r6, [dif, #8] + ldr r7, [dif, #12] + + pkhbt r8, r6, r7, lsl #16 + pkhtb r9, r7, r6, asr #16 + + uxtab16 r8, r8, r4 + uxtab16 r9, r9, r4, ror #8 + usat16 r8, #8, r8 + usat16 r9, #8, r9 + orr r8, r8, r9, lsl #8 + + str r8, [dst, #4] + + add dst, dst, stride +;; add dif, dif, #8 + add dif, dif, #16 + + subs lr, lr, #1 + bne recon2b_loop + + ldmia sp!, {r4 - r9, pc} + + ENDP ; |Recon2B| + + END diff --git a/vp9/common/arm/armv6/vp9_simpleloopfilter_v6.asm b/vp9/common/arm/armv6/vp9_simpleloopfilter_v6.asm new file mode 100644 index 000000000..8306912be --- /dev/null +++ b/vp9/common/arm/armv6/vp9_simpleloopfilter_v6.asm @@ -0,0 +1,286 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+;
+
+
+    EXPORT |vp9_loop_filter_simple_horizontal_edge_armv6|
+    EXPORT |vp9_loop_filter_simple_vertical_edge_armv6|
+
+    AREA |.text|, CODE, READONLY ; name this block of code
+
+    MACRO
+    TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
+    ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
+    ; a0: 03 02 01 00
+    ; a1: 13 12 11 10
+    ; a2: 23 22 21 20
+    ; a3: 33 32 31 30
+    ;     b3 b2 b1 b0
+
+    uxtb16 $b1, $a1 ; xx 12 xx 10
+    uxtb16 $b0, $a0 ; xx 02 xx 00
+    uxtb16 $b3, $a3 ; xx 32 xx 30
+    uxtb16 $b2, $a2 ; xx 22 xx 20
+    orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
+    orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
+
+    uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
+    uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
+    uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
+    uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
+    orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
+    orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
+
+    pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
+    pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
+
+    pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
+    pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
+    MEND
+
+
+
+src RN r0
+pstep RN r1
+
+;r0 unsigned char *src_ptr,
+;r1 int src_pixel_step,
+;r2 const char *blimit
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_simple_horizontal_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb sp!, {r4 - r11, lr}
+
+    ldrb r12, [r2] ; blimit
+    ldr r3, [src, -pstep, lsl #1] ; p1
+    ldr r4, [src, -pstep] ; p0
+    ldr r5, [src] ; q0
+    ldr r6, [src, pstep] ; q1
+    orr r12, r12, r12, lsl #8 ; blimit
+    ldr r2, c0x80808080
+    orr r12, r12, r12, lsl #16 ; blimit
+    mov r9, #4 ; double the count. we're doing 4 at a time
+    mov lr, #0 ; need 0 in a couple places
+
+|simple_hnext8|
+    ; vp8_simple_filter_mask()
+
+    uqsub8 r7, r3, r6 ; p1 - q1
+    uqsub8 r8, r6, r3 ; q1 - p1
+    uqsub8 r10, r4, r5 ; p0 - q0
+    uqsub8 r11, r5, r4 ; q0 - p0
+    orr r8, r8, r7 ; abs(p1 - q1)
+    orr r10, r10, r11 ; abs(p0 - q0)
+    uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
+    uhadd8 r8, r8, lr ; abs(p1 - q1) >> 1
+    uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn r8, #0
+    usub8 r10, r12, r10 ; compare to blimit. usub8 sets GE flags
+    sel r10, r8, lr ; filter mask: F or 0
+    cmp r10, #0
+    beq simple_hskip_filter ; skip filtering if all masks are 0x00
+
+    ;vp8_simple_filter()
+
+    eor r3, r3, r2 ; p1 offset to convert to a signed value
+    eor r6, r6, r2 ; q1 offset to convert to a signed value
+    eor r4, r4, r2 ; p0 offset to convert to a signed value
+    eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+    qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
+    qsub8 r6, r5, r4 ; q0 - p0
+    qadd8 r3, r3, r6 ; += q0 - p0
+    ldr r7, c0x04040404
+    qadd8 r3, r3, r6 ; += q0 - p0
+    ldr r8, c0x03030303
+    qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0)
+    ;STALL
+    and r3, r3, r10 ; vp9_filter &= mask
+
+    qadd8 r7, r3, r7 ; Filter1 = vp9_filter + 4
+    qadd8 r8, r3, r8 ; Filter2 = vp9_filter + 3
+
+    shadd8 r7, r7, lr
+    shadd8 r8, r8, lr
+    shadd8 r7, r7, lr
+    shadd8 r8, r8, lr
+    shadd8 r7, r7, lr ; Filter1 >>= 3
+    shadd8 r8, r8, lr ; Filter2 >>= 3
+
+    qsub8 r5, r5, r7 ; u = q0 - Filter1
+    qadd8 r4, r4, r8 ; u = p0 + Filter2
+    eor r5, r5, r2 ; *oq0 = u^0x80
+    str r5, [src] ; store oq0 result
+    eor r4, r4, r2 ; *op0 = u^0x80
+    str r4, [src, -pstep] ; store op0 result
+
+|simple_hskip_filter|
+    subs r9, r9, #1
+    addne src, src, #4 ; next row
+
+    ldrne r3, [src, -pstep, lsl #1] ; p1
+    ldrne r4, [src, -pstep] ; p0
+    ldrne r5, [src] ; q0
+    ldrne r6, [src, pstep] ; q1
+
+    bne simple_hnext8
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ; |vp9_loop_filter_simple_horizontal_edge_armv6|
+
+
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+|vp9_loop_filter_simple_vertical_edge_armv6| PROC
+;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+    stmdb sp!, {r4 - r11, lr}
+
+    ldrb r12, [r2] ; r12: blimit
+    ldr r2, c0x80808080
+    orr r12, r12, r12, lsl #8
+
+    ; load source data to r7, r8, r9, r10
+    ldrh r3, [src, #-2]
+    pld [src, #23] ; preload for next block
+    ldrh r4, [src], pstep
+    orr r12, r12, r12, lsl #16
+
+    ldrh r5, [src, #-2]
+    pld [src, #23]
+    ldrh r6, [src], pstep
+
+    pkhbt r7, r3, r4, lsl #16
+
+    ldrh r3, [src, #-2]
+    pld [src, #23]
+    ldrh r4, [src], pstep
+
+    pkhbt r8, r5, r6, lsl #16
+
+    ldrh r5, [src, #-2]
+    pld [src, #23]
+    ldrh r6, [src], pstep
+    mov r11, #4 ; double the count. we're doing 4 at a time
+
+|simple_vnext8|
+    ; vp8_simple_filter_mask() function
+    pkhbt r9, r3, r4, lsl #16
+    pkhbt r10, r5, r6, lsl #16
+
+    ;transpose r7, r8, r9, r10 to r3, r4, r5, r6
+    TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
+
+    uqsub8 r7, r3, r6 ; p1 - q1
+    uqsub8 r8, r6, r3 ; q1 - p1
+    uqsub8 r9, r4, r5 ; p0 - q0
+    uqsub8 r10, r5, r4 ; q0 - p0
+    orr r7, r7, r8 ; abs(p1 - q1)
+    orr r9, r9, r10 ; abs(p0 - q0)
+    mov r8, #0
+    uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
+    uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
+    uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
+    mvn r10, #0 ; r10 == -1
+
+    usub8 r7, r12, r7 ; compare to blimit
+    sel lr, r10, r8 ; filter mask
+
+    cmp lr, #0
+    beq simple_vskip_filter ; skip filtering
+
+    ;vp8_simple_filter() function
+    eor r3, r3, r2 ; p1 offset to convert to a signed value
+    eor r6, r6, r2 ; q1 offset to convert to a signed value
+    eor r4, r4, r2 ; p0 offset to convert to a signed value
+    eor r5, r5, r2 ; q0 offset to convert to a signed value
+
+    qsub8 r3, r3, r6 ; vp9_filter = p1 - q1
+    qsub8 r6, r5, r4 ; q0 - p0
+
+    qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
+    ldr r9, c0x03030303 ; r9 = 3
+
+    qadd8 r3, r3, r6 ; vp9_filter += q0 - p0
+    ldr r7, c0x04040404
+
+    qadd8 r3, r3, r6 ; vp9_filter = p1-q1 + 3*(q0-p0)
+    ;STALL
+    and r3, r3, lr ; vp9_filter &= mask
+
+    qadd8 r9, r3, r9 ; Filter2 = vp9_filter + 3
+    qadd8 r3, r3, r7 ; Filter1 = vp9_filter + 4
+
+    shadd8 r9, r9, r8
+    shadd8 r3, r3, r8
+    shadd8 r9, r9, r8
+    shadd8 r3, r3, r8
+    shadd8 r9, r9, r8 ; Filter2 >>= 3
+    shadd8 r3, r3, r8 ; Filter1 >>= 3
+
+    ;calculate output
+    sub src, src, pstep, lsl #2
+
+    qadd8 r4, r4, r9 ; u = p0 + Filter2
+    qsub8 r5, r5, r3 ; u = q0 - Filter1
+    eor r4, r4, r2 ; *op0 = u^0x80
+    eor r5, r5, r2 ; *oq0 = u^0x80
+
+    strb r4, [src, #-1] ; store the result
+    mov r4, r4, lsr #8
+    strb r5, [src], pstep
+    mov r5, r5, lsr #8
+
+    strb r4, [src, #-1]
+    mov r4, r4, lsr #8
+    strb r5, [src], pstep
+    mov r5, r5, lsr #8
+
+    strb r4, [src, #-1]
+    mov r4, r4, lsr #8
+    strb r5, [src], pstep
+    mov r5, r5, lsr #8
+
+    strb r4, [src, #-1]
+    strb r5, [src], pstep
+
+|simple_vskip_filter|
+    subs r11, r11, #1
+
+    ; load source data to r7, r8, r9, r10
+    ldrneh r3, [src, #-2]
+    pld [src, #23] ; preload for next block
+    ldrneh r4, [src], pstep
+
+    ldrneh r5, [src, #-2]
+    pld [src, #23]
+    ldrneh r6, [src], pstep
+
+    pkhbt r7, r3, r4, lsl #16
+
+    ldrneh r3, [src, #-2]
+    pld [src, #23]
+    ldrneh r4, [src], pstep
+
+    pkhbt r8, r5, r6, lsl #16
+
+    ldrneh r5, [src, #-2]
+    pld [src, #23]
+    ldrneh r6, [src], pstep
+
+    bne simple_vnext8
+
+    ldmia sp!, {r4 - r11, pc}
+    ENDP ; |vp9_loop_filter_simple_vertical_edge_armv6|
+
+; Constant Pool
+c0x80808080 DCD 0x80808080
+c0x03030303 DCD 0x03030303
+c0x04040404 DCD 0x04040404
+
+    END
diff --git a/vp9/common/arm/armv6/vp9_sixtappredict8x4_v6.asm b/vp9/common/arm/armv6/vp9_sixtappredict8x4_v6.asm
new file mode 100644
index 000000000..5bf94e090
--- /dev/null
+++ b/vp9/common/arm/armv6/vp9_sixtappredict8x4_v6.asm
@@ -0,0 +1,273 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
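vp9_sixtappredict8x4_v6.asm, whose header begins above, implements the 8x4 six-tap sub-pixel predictor. Its layout trick is described in the note below: the horizontal pass stores its 9-row intermediate block transposed, with a 20-byte row pitch, so the vertical pass can also run as sequential row-wise multiply-accumulates. For orientation, a scalar sketch of one six-tap pass, assuming libvpx's usual 128-scaled coefficients and (x + 64) >> 7 rounding, which is what the "add #0x40" / "usat ... asr #7" pairs implement; sixtap_pass_ref is our name.

    /* One 6-tap filter pass; taps cover src[c-2] .. src[c+3]. */
    static void sixtap_pass_ref(const unsigned char *src, int src_pitch,
                                unsigned char *dst, int dst_pitch,
                                int width, int height, const short *filter) {
      int r, c, t;
      for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++) {
          int sum = 0;
          for (t = 0; t < 6; t++)
            sum += src[c + t - 2] * filter[t];
          sum = (sum + 64) >> 7;                  /* round_shift_and_clamp */
          dst[c] = (unsigned char)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
        }
        src += src_pitch;
        dst += dst_pitch;
      }
    }

The commented-out DCD table at the end of the file lists the same eight coefficient sets in unpacked form, which is the easiest way to sanity-check the packed filter8_coeff words.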
+;
+
+
+    EXPORT |vp8_sixtap_predict8x4_armv6|
+
+    AREA |.text|, CODE, READONLY ; name this block of code
+;-------------------------------------
+; r0 unsigned char *src_ptr,
+; r1 int src_pixels_per_line,
+; r2 int xoffset,
+; r3 int yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pitch
+;-------------------------------------
+;note: In the first pass, the result is stored transposed (8 lines x 9 columns) on the stack. Temporary stack size is 184.
+;Line width is 20 bytes, i.e. 9 shorts of data plus 2 bytes to keep rows 4-byte aligned. In the second pass, data is
+;loaded from the stack and the result is stored transposed back.
+|vp8_sixtap_predict8x4_armv6| PROC
+    stmdb sp!, {r4 - r11, lr}
+    str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
+
+    cmp r2, #0 ;skip first_pass filter if xoffset=0
+    add lr, sp, #4 ;point to temporary buffer
+    beq skip_firstpass_filter
+
+;first-pass filter
+    adr r12, filter8_coeff
+    sub r0, r0, r1, lsl #1
+
+    add r3, r1, #10 ; preload next row
+    pld [r0, r3]
+
+    add r2, r12, r2, lsl #4 ;calculate filter location
+    add r0, r0, #3 ;adjust src only for loading convenience
+
+    ldr r3, [r2] ; load up packed filter coefficients
+    ldr r4, [r2, #4]
+    ldr r5, [r2, #8]
+
+    mov r2, #0x90000 ; height=9 is top part of counter
+
+    sub r1, r1, #8
+
+|first_pass_hloop_v6|
+    ldrb r6, [r0, #-5] ; load source data
+    ldrb r7, [r0, #-4]
+    ldrb r8, [r0, #-3]
+    ldrb r9, [r0, #-2]
+    ldrb r10, [r0, #-1]
+
+    orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
+
+    pkhbt r6, r6, r7, lsl #16 ; r7 | r6
+    pkhbt r7, r7, r8, lsl #16 ; r8 | r7
+
+    pkhbt r8, r8, r9, lsl #16 ; r9 | r8
+    pkhbt r9, r9, r10, lsl #16 ; r10 | r9
+
+|first_pass_wloop_v6|
+    smuad r11, r6, r3 ; vp9_filter[0], vp9_filter[1]
+    smuad r12, r7, r3
+
+    ldrb r6, [r0], #1
+
+    smlad r11, r8, r4, r11 ; vp9_filter[2], vp9_filter[3]
+    ldrb r7, [r0], #1
+    smlad r12, r9, r4, r12
+
+    pkhbt r10, r10, r6, lsl #16 ; r10 | r9
+    pkhbt r6, r6, r7, lsl #16 ; r11 | r10
+    smlad r11, r10, r5, r11 ; vp9_filter[4], vp9_filter[5]
+    smlad r12, r6, r5, r12
+
+    sub r2, r2, #1
+
+    add r11, r11, #0x40 ; round_shift_and_clamp
+    tst r2, #0xff ; test loop counter
+    usat r11, #8, r11, asr #7
+    add r12, r12, #0x40
+    strh r11, [lr], #20 ; result is transposed and stored, which
+    usat r12, #8, r12, asr #7
+
+    strh r12, [lr], #20
+
+    movne r11, r6
+    movne r12, r7
+
+    movne r6, r8
+    movne r7, r9
+    movne r8, r10
+    movne r9, r11
+    movne r10, r12
+
+    bne first_pass_wloop_v6
+
+    ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
+    ;;IF ARCHITECTURE=6
+    ;pld [src, ppl]
+    ;;pld [src, r9]
+    ;;ENDIF
+
+    subs r2, r2, #0x10000
+
+    sub lr, lr, #158
+
+    add r0, r0, r1 ; move to next input line
+
+    add r11, r1, #18 ; preload next row. adding back block width(=8), which is subtracted earlier
+    pld [r0, r11]
+
+    bne first_pass_hloop_v6
+
+;second pass filter
+secondpass_filter
+    ldr r3, [sp], #4 ; load back yoffset
+    ldr r0, [sp, #216] ; load dst address from stack 180+36
+    ldr r1, [sp, #220] ; load dst stride from stack 180+40
+
+    cmp r3, #0
+    beq skip_secondpass_filter
+
+    adr r12, filter8_coeff
+    add lr, r12, r3, lsl #4 ;calculate filter location
+
+    mov r2, #0x00080000
+
+    ldr r3, [lr] ; load up packed filter coefficients
+    ldr r4, [lr, #4]
+    ldr r5, [lr, #8]
+
+    pkhbt r12, r4, r3 ; pack the filter differently
+    pkhbt r11, r5, r4
+
+second_pass_hloop_v6
+    ldr r6, [sp] ; load the data
+    ldr r7, [sp, #4]
+
+    orr r2, r2, #2 ; loop counter
+
+second_pass_wloop_v6
+    smuad lr, r3, r6 ; apply filter
+    smulbt r10, r3, r6
+
+    ldr r8, [sp, #8]
+
+    smlad lr, r4, r7, lr
+    smladx r10, r12, r7, r10
+
+    ldrh r9, [sp, #12]
+
+    smlad lr, r5, r8, lr
+    smladx r10, r11, r8, r10
+
+    add sp, sp, #4
+    smlatb r10, r5, r9, r10
+
+    sub r2, r2, #1
+
+    add lr, lr, #0x40 ; round_shift_and_clamp
+    tst r2, #0xff
+    usat lr, #8, lr, asr #7
+    add r10, r10, #0x40
+    strb lr, [r0], r1 ; the result is transposed back and stored
+    usat r10, #8, r10, asr #7
+
+    strb r10, [r0], r1
+
+    movne r6, r7
+    movne r7, r8
+
+    bne second_pass_wloop_v6
+
+    subs r2, r2, #0x10000
+    add sp, sp, #12 ; update src for next loop (20-8)
+    sub r0, r0, r1, lsl #2
+    add r0, r0, #1
+
+    bne second_pass_hloop_v6
+
+    add sp, sp, #20
+    ldmia sp!, {r4 - r11, pc}
+
+;--------------------
+skip_firstpass_filter
+    sub r0, r0, r1, lsl #1
+    sub r1, r1, #8
+    mov r2, #9
+
+skip_firstpass_hloop
+    ldrb r4, [r0], #1 ; load data
+    subs r2, r2, #1
+    ldrb r5, [r0], #1
+    strh r4, [lr], #20 ; store it to intermediate buffer
+    ldrb r6, [r0], #1 ; load data
+    strh r5, [lr], #20
+    ldrb r7, [r0], #1
+    strh r6, [lr], #20
+    ldrb r8, [r0], #1
+    strh r7, [lr], #20
+    ldrb r9, [r0], #1
+    strh r8, [lr], #20
+    ldrb r10, [r0], #1
+    strh r9, [lr], #20
+    ldrb r11, [r0], #1
+    strh r10, [lr], #20
+    add r0, r0, r1 ; move to next input line
+    strh r11, [lr], #20
+
+    sub lr, lr, #158 ; move over to next column
+    bne skip_firstpass_hloop
+
+    b secondpass_filter
+
+;--------------------
+skip_secondpass_filter
+    mov r2, #8
+    add sp, sp, #4 ;start from src[0] instead of src[-2]
+
+skip_secondpass_hloop
+    ldr r6, [sp], #4
+    subs r2, r2, #1
+    ldr r8, [sp], #4
+
+    mov r7, r6, lsr #16 ; unpack
+    strb r6, [r0], r1
+    mov r9, r8, lsr #16
+    strb r7, [r0], r1
+    add sp, sp, #12 ; 20-8
+    strb r8, [r0], r1
+    strb r9, [r0], r1
+
+    sub r0, r0, r1, lsl #2
+    add r0, r0, #1
+
+    bne skip_secondpass_hloop
+
+    add sp, sp, #16 ; 180 - (160 +4)
+
+    ldmia sp!, {r4 - r11, pc}
+
+    ENDP
+
+;-----------------
+;One word each is reserved. Label filter8_coeff can be used to access the data.
+;Data address: filter8_coeff, filter8_coeff+4, filter8_coeff+8 ...
+filter8_coeff + DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 + DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 + DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 + DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 + DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 + DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 + DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 + DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 + + ;DCD 0, 0, 128, 0, 0, 0 + ;DCD 0, -6, 123, 12, -1, 0 + ;DCD 2, -11, 108, 36, -8, 1 + ;DCD 0, -9, 93, 50, -6, 0 + ;DCD 3, -16, 77, 77, -16, 3 + ;DCD 0, -6, 50, 93, -9, 0 + ;DCD 1, -8, 36, 108, -11, 2 + ;DCD 0, -1, 12, 123, -6, 0 + + END diff --git a/vp9/common/arm/bilinearfilter_arm.c b/vp9/common/arm/bilinearfilter_arm.c deleted file mode 100644 index 287fdf04d..000000000 --- a/vp9/common/arm/bilinearfilter_arm.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include -#include "vp9/common/filter.h" -#include "vp9/common/subpixel.h" -#include "bilinearfilter_arm.h" - -void vp9_filter_block2d_bil_armv6 -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - unsigned int src_pitch, - unsigned int dst_pitch, - const short *HFilter, - const short *VFilter, - int Width, - int Height -) { - unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); - - /* then 1-D vertically... 
*/ - vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter); -} - - -void vp9_bilinear_predict4x4_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); -} - -void vp9_bilinear_predict8x8_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); -} - -void vp9_bilinear_predict8x4_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); -} - -void vp9_bilinear_predict16x16_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); -} diff --git a/vp9/common/arm/bilinearfilter_arm.h b/vp9/common/arm/bilinearfilter_arm.h deleted file mode 100644 index b6d9cfc2d..000000000 --- a/vp9/common/arm/bilinearfilter_arm.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef BILINEARFILTER_ARM_H -#define BILINEARFILTER_ARM_H - -extern void vp9_filter_block2d_bil_first_pass_armv6 -( - const unsigned char *src_ptr, - unsigned short *dst_ptr, - unsigned int src_pitch, - unsigned int height, - unsigned int width, - const short *vp9_filter -); - -extern void vp9_filter_block2d_bil_second_pass_armv6 -( - const unsigned short *src_ptr, - unsigned char *dst_ptr, - int dst_pitch, - unsigned int height, - unsigned int width, - const short *vp9_filter -); - -#endif /* BILINEARFILTER_ARM_H */ diff --git a/vp9/common/arm/filter_arm.c b/vp9/common/arm/filter_arm.c deleted file mode 100644 index ccb34e236..000000000 --- a/vp9/common/arm/filter_arm.c +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vpx_ports/config.h" -#include -#include "vp9/common/filter.h" -#include "vp9/common/subpixel.h" -#include "vpx_ports/mem.h" - -extern void vp9_filter_block2d_first_pass_armv6 -( - unsigned char *src_ptr, - short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_width, - unsigned int output_height, - const short *vp9_filter -); - -// 8x8 -extern void vp9_filter_block2d_first_pass_8x8_armv6 -( - unsigned char *src_ptr, - short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_width, - unsigned int output_height, - const short *vp9_filter -); - -// 16x16 -extern void vp9_filter_block2d_first_pass_16x16_armv6 -( - unsigned char *src_ptr, - short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_width, - unsigned int output_height, - const short *vp9_filter -); - -extern void vp9_filter_block2d_second_pass_armv6 -( - short *src_ptr, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int cnt, - const short *vp9_filter -); - -extern void vp9_filter4_block2d_second_pass_armv6 -( - short *src_ptr, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int cnt, - const short *vp9_filter -); - -extern void vp9_filter_block2d_first_pass_only_armv6 -( - unsigned char *src_ptr, - unsigned char *output_ptr, - unsigned int src_pixels_per_line, - unsigned int cnt, - unsigned int output_pitch, - const short *vp9_filter -); - - -extern void vp9_filter_block2d_second_pass_only_armv6 -( - unsigned char *src_ptr, - unsigned char *output_ptr, - unsigned int src_pixels_per_line, - unsigned int cnt, - unsigned int output_pitch, - const short *vp9_filter -); - -#if HAVE_ARMV6 -void vp9_sixtap_predict_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */ - - - HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ - VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ - - /* Vfilter is null. First pass only */ - if (xoffset && !yoffset) { - /*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter ); - vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/ - - vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter); - } - /* Hfilter is null. 
Second pass only */ - else if (!xoffset && yoffset) { - vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter); - } else { - /* Vfilter is a 4 tap filter */ - if (yoffset & 0x1) { - vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter); - vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); - } - /* Vfilter is 6 tap filter */ - else { - vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter); - vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); - } - } -} - -void vp9_sixtap_predict8x8_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */ - - HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ - VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ - - if (xoffset && !yoffset) { - vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter); - } - /* Hfilter is null. Second pass only */ - else if (!xoffset && yoffset) { - vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter); - } else { - if (yoffset & 0x1) { - vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter); - vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); - } else { - vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter); - vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); - } - } -} - - -void vp9_sixtap_predict16x16_armv6 -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16); /* Temp data buffer used in filtering */ - - HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ - VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ - - if (xoffset && !yoffset) { - vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter); - } - /* Hfilter is null. Second pass only */ - else if (!xoffset && yoffset) { - vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter); - } else { - if (yoffset & 0x1) { - vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter); - vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); - } else { - vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter); - vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); - } - } - -} -#endif diff --git a/vp9/common/arm/idct_arm.h b/vp9/common/arm/idct_arm.h deleted file mode 100644 index 2fc4cf7fc..000000000 --- a/vp9/common/arm/idct_arm.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef IDCT_ARM_H -#define IDCT_ARM_H - -#if HAVE_ARMV6 -extern prototype_idct(vp9_short_idct4x4llm_1_v6); -extern prototype_idct(vp9_short_idct4x4llm_v6_dual); -extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6); -extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6); -extern prototype_second_order(vp9_short_inv_walsh4x4_v6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_idct_idct1 -#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6 - -#undef vp9_idct_idct16 -#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual - -#undef vp9_idct_idct1_scalar_add -#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6 - -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6 - -#undef vp8_idct_iwalsh16 -#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6 -#endif -#endif - -#if HAVE_ARMV7 -extern prototype_idct(vp9_short_idct4x4llm_1_neon); -extern prototype_idct(vp9_short_idct4x4llm_neon); -extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon); -extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon); -extern prototype_second_order(vp9_short_inv_walsh4x4_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_idct_idct1 -#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon - -#undef vp9_idct_idct16 -#define vp9_idct_idct16 vp9_short_idct4x4llm_neon - -#undef vp9_idct_idct1_scalar_add -#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon - -#undef vp8_idct_iwalsh1 -#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon - -#undef vp8_idct_iwalsh16 -#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon -#endif -#endif - -#endif diff --git a/vp9/common/arm/loopfilter_arm.c b/vp9/common/arm/loopfilter_arm.c deleted file mode 100644 index abcb4c7cc..000000000 --- a/vp9/common/arm/loopfilter_arm.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
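The idct_arm.h header removed above illustrates the dispatch convention used throughout these platform files: generic names such as vp9_idct_idct16 are rebound at compile time to the best available implementation whenever CONFIG_RUNTIME_CPU_DETECT is off. A condensed illustration of the same pattern follows; the explicit prototypes are our abbreviation of what the tree's prototype_idct macro expands to, so treat the exact signatures as an assumption.

    /* Illustrative only: compile-time binding of a generic entry point. */
    void vp9_short_idct4x4llm_c(short *input, short *output, int pitch);
    void vp9_short_idct4x4llm_v6_dual(short *input, short *output, int pitch);

    #if !CONFIG_RUNTIME_CPU_DETECT
    #undef  vp9_idct_idct16
    #define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual  /* ARMv6 build */
    #endif

With runtime detection enabled, the same names are instead resolved through function pointers set up by the arm_systemdependent.c files.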
- */ - - -#include "vpx_config.h" -#include "vp9/common/loopfilter.h" -#include "vp9/common/onyxc_int.h" - -#if HAVE_ARMV6 -extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6); -extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6); -extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6); -extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6); -#endif - -#if HAVE_ARMV7 -typedef void loopfilter_y_neon(unsigned char *src, int pitch, - unsigned char blimit, unsigned char limit, unsigned char thresh); -typedef void loopfilter_uv_neon(unsigned char *u, int pitch, - unsigned char blimit, unsigned char limit, unsigned char thresh, - unsigned char *v); - -extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon; -extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon; - -extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon; -extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon; -#endif - -#if HAVE_ARMV6 -/*ARMV6 loopfilter functions*/ -/* Horizontal MB filtering */ -void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); -} - -/* Vertical MB Filtering */ -void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); -} - -/* Horizontal B Filtering */ -void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - -void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); -} - -/* Vertical B Filtering */ -void vp9_loop_filter_bv_armv6(unsigned 
char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - -void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); -} -#endif - -#if HAVE_ARMV7 -/* NEON loopfilter functions */ -/* Horizontal MB filtering */ -void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - unsigned char mblim = *lfi->mblim; - unsigned char lim = *lfi->lim; - unsigned char hev_thr = *lfi->hev_thr; - vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); - - if (u_ptr) - vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); -} - -/* Vertical MB Filtering */ -void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - unsigned char mblim = *lfi->mblim; - unsigned char lim = *lfi->lim; - unsigned char hev_thr = *lfi->hev_thr; - - vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); - - if (u_ptr) - vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); -} - -/* Horizontal B Filtering */ -void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - unsigned char blim = *lfi->blim; - unsigned char lim = *lfi->lim; - unsigned char hev_thr = *lfi->hev_thr; - - vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); - vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); - vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); - - if (u_ptr) - vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); -} - -/* Vertical B Filtering */ -void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - unsigned char blim = *lfi->blim; - unsigned char lim = *lfi->lim; - unsigned char hev_thr = *lfi->hev_thr; - - vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); - vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); - vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); - - if (u_ptr) - vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); -} -#endif diff --git a/vp9/common/arm/loopfilter_arm.h b/vp9/common/arm/loopfilter_arm.h deleted file mode 100644 index de6b7ffbc..000000000 --- a/vp9/common/arm/loopfilter_arm.h +++ 
/dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef LOOPFILTER_ARM_H -#define LOOPFILTER_ARM_H - -#include "vpx_config.h" - -#if HAVE_ARMV6 -extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6); -extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6); -extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6); - -#endif /* HAVE_ARMV6 */ - -#if HAVE_ARMV7 -extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon); -extern prototype_loopfilter_block(vp9_loop_filter_bv_neon); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon); -extern prototype_loopfilter_block(vp9_loop_filter_bh_neon); -extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon); -extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon); - -#endif /* HAVE_ARMV7 */ - -#endif /* LOOPFILTER_ARM_H */ diff --git a/vp9/common/arm/neon/bilinearpredict16x16_neon.asm b/vp9/common/arm/neon/bilinearpredict16x16_neon.asm deleted file mode 100644 index 2528be7c3..000000000 --- a/vp9/common/arm/neon/bilinearpredict16x16_neon.asm +++ /dev/null @@ -1,357 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
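The NEON bilinear predictors deleted below follow the same two-pass structure as the C wrapper removed above (bilinearfilter_arm.c): a horizontal two-tap pass into a temporary buffer of height + 1 rows, then a vertical two-tap pass, with either pass skipped when its offset is zero. A scalar sketch of one pass, relying on the fact that each coefficient pair in the bifilter16_coeff table sums to 128, so results are rounded back with (x + 64) >> 7, which is what the vqrshrn.u16 #7 instructions compute; bil_pass_ref is our name.

    /* One bilinear pass; "step" is 1 for horizontal, src_pitch for vertical. */
    static void bil_pass_ref(const unsigned char *src, int src_pitch, int step,
                             unsigned char *dst, int dst_pitch,
                             int width, int height, const short *filter) {
      int r, c;
      for (r = 0; r < height; r++) {
        for (c = 0; c < width; c++)
          dst[c] = (unsigned char)((src[c] * filter[0] +
                                    src[c + step] * filter[1] + 64) >> 7);
        src += src_pitch;
        dst += dst_pitch;
      }
    }

The first pass runs over height + 1 rows because the vertical pass needs one extra row of filtered samples below the block, which is why the C wrapper above calls the first-pass routine with Height + 1.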
-; - - - EXPORT |vp8_bilinear_predict16x16_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_bilinear_predict16x16_neon| PROC - push {r4-r5, lr} - - adr r12, bifilter16_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_bfilter16x16_only - - add r2, r12, r2, lsl #3 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {d31}, [r2] ;load first_pass filter - - beq firstpass_bfilter16x16_only - - sub sp, sp, #272 ;reserve space on stack for temporary storage - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - mov lr, sp - vld1.u8 {d5, d6, d7}, [r0], r1 - - mov r2, #3 ;loop counter - vld1.u8 {d8, d9, d10}, [r0], r1 - - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {d11, d12, d13}, [r0], r1 - - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (17x16) -filt_blk2d_fp16x16_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vqrshrn.u16 d21, q14, #7 - vld1.u8 {d5, d6, d7}, [r0], r1 - - vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result - vld1.u8 {d8, d9, d10}, [r0], r1 - vst1.u8 {d18, d19, d20, d21}, [lr]! 
- vld1.u8 {d11, d12, d13}, [r0], r1 - - bne filt_blk2d_fp16x16_loop_neon - -;First-pass filtering for rest 5 lines - vld1.u8 {d14, d15, d16}, [r0], r1 - - vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q10, d3, d0 - vmull.u8 q11, d5, d0 - vmull.u8 q12, d6, d0 - vmull.u8 q13, d8, d0 - vmull.u8 q14, d9, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - - vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q11, d5, d1 - vmlal.u8 q13, d8, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - - vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q12, d6, d1 - vmlal.u8 q14, d9, d1 - - vmull.u8 q1, d11, d0 - vmull.u8 q2, d12, d0 - vmull.u8 q3, d14, d0 - vmull.u8 q4, d15, d0 - - vext.8 d11, d11, d12, #1 ;construct src_ptr[1] - vext.8 d14, d14, d15, #1 - - vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q3, d14, d1 - - vext.8 d12, d12, d13, #1 - vext.8 d15, d15, d16, #1 - - vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q4, d15, d1 - - vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d11, q10, #7 - vqrshrn.u16 d12, q11, #7 - vqrshrn.u16 d13, q12, #7 - vqrshrn.u16 d14, q13, #7 - vqrshrn.u16 d15, q14, #7 - vqrshrn.u16 d16, q1, #7 - vqrshrn.u16 d17, q2, #7 - vqrshrn.u16 d18, q3, #7 - vqrshrn.u16 d19, q4, #7 - - vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result - vst1.u8 {d14, d15, d16, d17}, [lr]! - vst1.u8 {d18, d19}, [lr]! - -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - sub lr, lr, #272 - - vld1.u32 {d31}, [r3] ;load second_pass filter - - vld1.u8 {d22, d23}, [lr]! ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - mov r12, #4 ;loop counter - -filt_blk2d_sp16x16_loop_neon - vld1.u8 {d24, d25}, [lr]! - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) - vld1.u8 {d26, d27}, [lr]! - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [lr]! - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [lr]! 
- - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - subs r12, r12, #1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - vst1.u8 {d4, d5}, [r4], r5 - vst1.u8 {d6, d7}, [r4], r5 - vmov q11, q15 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_sp16x16_loop_neon - - add sp, sp, #272 - - pop {r4-r5,pc} - -;-------------------- -firstpass_bfilter16x16_only - mov r2, #4 ;loop counter - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vdup.8 d1, d31[4] - -;First Pass: output_height lines x output_width columns (16x16) -filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data - vld1.u8 {d5, d6, d7}, [r0], r1 - vld1.u8 {d8, d9, d10}, [r0], r1 - vld1.u8 {d11, d12, d13}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q8, d3, d0 - vmull.u8 q9, d5, d0 - vmull.u8 q10, d6, d0 - vmull.u8 q11, d8, d0 - vmull.u8 q12, d9, d0 - vmull.u8 q13, d11, d0 - vmull.u8 q14, d12, d0 - - vext.8 d2, d2, d3, #1 ;construct src_ptr[1] - vext.8 d5, d5, d6, #1 - vext.8 d8, d8, d9, #1 - vext.8 d11, d11, d12, #1 - - vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q9, d5, d1 - vmlal.u8 q11, d8, d1 - vmlal.u8 q13, d11, d1 - - vext.8 d3, d3, d4, #1 - vext.8 d6, d6, d7, #1 - vext.8 d9, d9, d10, #1 - vext.8 d12, d12, d13, #1 - - vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1]) - vmlal.u8 q10, d6, d1 - vmlal.u8 q12, d9, d1 - vmlal.u8 q14, d12, d1 - - subs r2, r2, #1 - - vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d15, q8, #7 - vqrshrn.u16 d16, q9, #7 - vqrshrn.u16 d17, q10, #7 - vqrshrn.u16 d18, q11, #7 - vqrshrn.u16 d19, q12, #7 - vqrshrn.u16 d20, q13, #7 - vst1.u8 {d14, d15}, [r4], r5 ;store result - vqrshrn.u16 d21, q14, #7 - - vst1.u8 {d16, d17}, [r4], r5 - vst1.u8 {d18, d19}, [r4], r5 - vst1.u8 {d20, d21}, [r4], r5 - - bne filt_blk2d_fpo16x16_loop_neon - pop {r4-r5,pc} - -;--------------------- -secondpass_bfilter16x16_only -;Second pass: 16x16 -;secondpass_filter - add r3, r12, r3, lsl #3 - mov r12, #4 ;loop counter - vld1.u32 {d31}, [r3] ;load second_pass filter - vld1.u8 {d22, d23}, [r0], r1 ;load src data - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - -filt_blk2d_spo16x16_loop_neon - vld1.u8 {d24, d25}, [r0], r1 - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) - vld1.u8 {d26, d27}, [r0], r1 - vmull.u8 q2, d23, d0 - vld1.u8 {d28, d29}, [r0], r1 - vmull.u8 q3, d24, d0 - vld1.u8 {d30, d31}, [r0], r1 - - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 - vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) - vmlal.u8 q2, d25, d1 - vmlal.u8 q3, d26, d1 - vmlal.u8 q4, d27, d1 - vmlal.u8 q5, d28, d1 - vmlal.u8 q6, d29, d1 - vmlal.u8 q7, d30, d1 - vmlal.u8 q8, d31, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 
d9, q8, #7 - - vst1.u8 {d2, d3}, [r4], r5 ;store result - subs r12, r12, #1 - vst1.u8 {d4, d5}, [r4], r5 - vmov q11, q15 - vst1.u8 {d6, d7}, [r4], r5 - vst1.u8 {d8, d9}, [r4], r5 - - bne filt_blk2d_spo16x16_loop_neon - pop {r4-r5,pc} - - ENDP - -;----------------- - -bifilter16_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/common/arm/neon/bilinearpredict4x4_neon.asm b/vp9/common/arm/neon/bilinearpredict4x4_neon.asm deleted file mode 100644 index 01eedf8e9..000000000 --- a/vp9/common/arm/neon/bilinearpredict4x4_neon.asm +++ /dev/null @@ -1,130 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict4x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict4x4_neon| PROC - push {r4, lr} - - adr r12, bifilter4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x4) - vld1.u8 {d2}, [r0], r1 ;load src data - add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes) - - vld1.u8 {d3}, [r0], r1 - vld1.u32 {d31}, [r2] ;first_pass filter - - vld1.u8 {d4}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0-d1) - vld1.u8 {d5}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {d6}, [r0], r1 - - vshr.u64 q4, q1, #8 ;construct src_ptr[1] - vshr.u64 q5, q2, #8 - vshr.u64 d12, d6, #8 - - vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d4, d5 - vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - - vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q8, d4, d0 - vmull.u8 q9, d6, d0 - - vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp9_filter[1]) - vmlal.u8 q8, d10, d1 - vmlal.u8 q9, d12, d1 - - vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d29, q8, #7 - vqrshrn.u16 d30, q9, #7 - -;Second pass: 4x4 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 ;calculate Vfilter location - vld1.u32 {d31}, [r3] ;load second_pass filter - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d31[4] - - vmull.u8 q1, d28, d0 - vmull.u8 q2, d29, d0 - - vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step] - vext.8 d27, d29, d30, #4 - - vmlal.u8 q1, d26, d1 - vmlal.u8 q2, d27, d1 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - - vst1.32 {d2[0]}, [r4] ;store result - vst1.32 {d2[1]}, [r0] - vst1.32 {d3[0]}, [r1] - vst1.32 {d3[1]}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - - vld1.32 {d28[0]}, [r0], r1 ;load src data - vld1.32 {d28[1]}, [r0], r1 - vld1.32 {d29[0]}, [r0], r1 - vld1.32 {d29[1]}, [r0], r1 - vld1.32 {d30[0]}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.32 {d28[0]}, 
[r4], lr ;store result - vst1.32 {d28[1]}, [r4], lr - vst1.32 {d29[0]}, [r4], lr - vst1.32 {d29[1]}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/common/arm/neon/bilinearpredict8x4_neon.asm b/vp9/common/arm/neon/bilinearpredict8x4_neon.asm deleted file mode 100644 index 8f49345ff..000000000 --- a/vp9/common/arm/neon/bilinearpredict8x4_neon.asm +++ /dev/null @@ -1,135 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict8x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x4_neon| PROC - push {r4, lr} - - adr r12, bifilter8x4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (5x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vld1.u8 {q5}, [r0], r1 - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d23, q7, #7 - vqrshrn.u16 d24, q8, #7 - vqrshrn.u16 d25, q9, #7 - vqrshrn.u16 d26, q10, #7 - -;Second pass: 4x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - - add r2, r1, lr - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1] - vst1.u8 {d5}, [r2] - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store 
result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter8x4_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/common/arm/neon/bilinearpredict8x8_neon.asm b/vp9/common/arm/neon/bilinearpredict8x8_neon.asm deleted file mode 100644 index 6967f1950..000000000 --- a/vp9/common/arm/neon/bilinearpredict8x8_neon.asm +++ /dev/null @@ -1,183 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_bilinear_predict8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_bilinear_predict8x8_neon| PROC - push {r4, lr} - - adr r12, bifilter8_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq skip_firstpass_filter - -;First pass: output_height lines x output_width columns (9x8) - add r2, r12, r2, lsl #3 ;calculate filter location - - vld1.u8 {q1}, [r0], r1 ;load src data - vld1.u32 {d31}, [r2] ;load first_pass filter - vld1.u8 {q2}, [r0], r1 - vdup.8 d0, d31[0] ;first_pass filter (d0 d1) - vld1.u8 {q3}, [r0], r1 - vdup.8 d1, d31[4] - vld1.u8 {q4}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - - vld1.u8 {q1}, [r0], r1 ;load src data - vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 - vld1.u8 {q2}, [r0], r1 - vqrshrn.u16 d23, q7, #7 - vld1.u8 {q3}, [r0], r1 - vqrshrn.u16 d24, q8, #7 - vld1.u8 {q4}, [r0], r1 - vqrshrn.u16 d25, q9, #7 - - ;first_pass filtering on the rest 5-line data - vld1.u8 {q5}, [r0], r1 - - vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q7, d4, d0 - vmull.u8 q8, d6, d0 - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - - vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vext.8 d9, d8, d9, #1 - vext.8 d11, d10, d11, #1 - - vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1]) - vmlal.u8 q7, d5, d1 - vmlal.u8 q8, d7, d1 - vmlal.u8 q9, d9, d1 - vmlal.u8 q10, d11, d1 - - vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d27, q7, #7 - vqrshrn.u16 d28, q8, #7 - vqrshrn.u16 d29, q9, #7 - vqrshrn.u16 d30, q10, #7 - -;Second pass: 8x8 -secondpass_filter - cmp r3, #0 ;skip second_pass filter if yoffset=0 - beq skip_secondpass_filter - - add r3, r12, r3, lsl #3 - add r0, r4, lr - - vld1.u32 {d31}, [r3] ;load second_pass filter - add r1, r0, lr - - vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) - vdup.8 d1, d31[4] - - vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) - vmull.u8 q2, d23, d0 - vmull.u8 q3, d24, d0 - vmull.u8 q4, d25, d0 - vmull.u8 q5, d26, d0 - vmull.u8 q6, d27, d0 - vmull.u8 q7, d28, d0 
- vmull.u8 q8, d29, d0 - - vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) - vmlal.u8 q2, d24, d1 - vmlal.u8 q3, d25, d1 - vmlal.u8 q4, d26, d1 - vmlal.u8 q5, d27, d1 - vmlal.u8 q6, d28, d1 - vmlal.u8 q7, d29, d1 - vmlal.u8 q8, d30, d1 - - vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 - vqrshrn.u16 d3, q2, #7 - vqrshrn.u16 d4, q3, #7 - vqrshrn.u16 d5, q4, #7 - vqrshrn.u16 d6, q5, #7 - vqrshrn.u16 d7, q6, #7 - vqrshrn.u16 d8, q7, #7 - vqrshrn.u16 d9, q8, #7 - - vst1.u8 {d2}, [r4] ;store result - vst1.u8 {d3}, [r0] - vst1.u8 {d4}, [r1], lr - vst1.u8 {d5}, [r1], lr - vst1.u8 {d6}, [r1], lr - vst1.u8 {d7}, [r1], lr - vst1.u8 {d8}, [r1], lr - vst1.u8 {d9}, [r1], lr - - pop {r4, pc} - -;-------------------- -skip_firstpass_filter - vld1.u8 {d22}, [r0], r1 ;load src data - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - vld1.u8 {d27}, [r0], r1 - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - - b secondpass_filter - -;--------------------- -skip_secondpass_filter - vst1.u8 {d22}, [r4], lr ;store result - vst1.u8 {d23}, [r4], lr - vst1.u8 {d24}, [r4], lr - vst1.u8 {d25}, [r4], lr - vst1.u8 {d26}, [r4], lr - vst1.u8 {d27}, [r4], lr - vst1.u8 {d28}, [r4], lr - vst1.u8 {d29}, [r4], lr - - pop {r4, pc} - - ENDP - -;----------------- - -bifilter8_coeff - DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 - - END diff --git a/vp9/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp9/common/arm/neon/buildintrapredictorsmby_neon.asm deleted file mode 100644 index e3ea91fe6..000000000 --- a/vp9/common/arm/neon/buildintrapredictorsmby_neon.asm +++ /dev/null @@ -1,584 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
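buildintrapredictorsmby_neon.asm, removed next, fills the 16x16 luma prediction for one of four modes selected by the mode argument (0 = DC, 1 = vertical, 2 = horizontal, 3 = TrueMotion, matching the case_* branches). A scalar summary of the four cases follows; the above/left/top_left names and the 0/1 convention for the Up and Left availability flags are our reading of the code.

    /* Scalar model of the 16x16 luma intra predictors (DC/V/H/TM). */
    static void intra_mby_ref(unsigned char pred[16][16],
                              const unsigned char *above,
                              const unsigned char *left,
                              int top_left, int mode,
                              int have_above, int have_left) {
      int r, c;
      if (mode == 0) {                              /* DC */
        int sum = 0, shift = 3, dc = 128;           /* 128 if no context */
        if (have_above) { for (c = 0; c < 16; c++) sum += above[c]; shift++; }
        if (have_left)  { for (r = 0; r < 16; r++) sum += left[r];  shift++; }
        if (have_above || have_left)
          dc = (sum + (1 << (shift - 1))) >> shift; /* rounded average */
        for (r = 0; r < 16; r++)
          for (c = 0; c < 16; c++)
            pred[r][c] = (unsigned char)dc;
      } else if (mode == 1) {                       /* V: repeat above row */
        for (r = 0; r < 16; r++)
          for (c = 0; c < 16; c++)
            pred[r][c] = above[c];
      } else if (mode == 2) {                       /* H: repeat left column */
        for (r = 0; r < 16; r++)
          for (c = 0; c < 16; c++)
            pred[r][c] = left[r];
      } else {                                      /* TM (TrueMotion) */
        for (r = 0; r < 16; r++)
          for (c = 0; c < 16; c++) {
            int v = left[r] + above[c] - top_left;  /* vqadd/vqshrun clamp */
            pred[r][c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
          }
      }
    }

The _s variant of the function writes straight into the reconstruction buffer (stride y_stride) instead of the contiguous predictor buffer, which is the main difference between the two PROCs.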
-; - - - EXPORT |vp8_build_intra_predictors_mby_neon_func| - EXPORT |vp8_build_intra_predictors_mby_s_neon_func| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_neon_func| PROC - push {r4-r8, lr} - - cmp r3, #0 - beq case_dc_pred - cmp r3, #1 - beq case_v_pred - cmp r3, #2 - beq case_h_pred - cmp r3, #3 - beq case_tm_pred - -case_dc_pred - ldr r4, [sp, #24] ; Up - ldr r5, [sp, #28] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up - - cmp r5, #0 - beq skip_dc_pred_left - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - -skip_dc_pred_up_left - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - - pop {r4-r8,pc} -case_v_pred - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - vst1.u8 {q0}, [r1]! - pop {r4-r8,pc} - -case_h_pred - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! 
- - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - pop {r4-r8,pc} - -case_tm_pred - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1]! - vst1.u8 {q1}, [r1]! - vst1.u8 {q2}, [r1]! - vst1.u8 {q3}, [r1]! - - subs r12, r12, #1 - bne case_tm_pred_loop - - pop {r4-r8,pc} - - ENDP - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; r0 unsigned char *y_buffer -; r1 unsigned char *ypred_ptr -; r2 int y_stride -; r3 int mode -; stack int Up -; stack int Left - -|vp8_build_intra_predictors_mby_s_neon_func| PROC - push {r4-r8, lr} - - mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; - - cmp r3, #0 - beq case_dc_pred_s - cmp r3, #1 - beq case_v_pred_s - cmp r3, #2 - beq case_h_pred_s - cmp r3, #3 - beq case_tm_pred_s - -case_dc_pred_s - ldr r4, [sp, #24] ; Up - ldr r5, [sp, #28] ; Left - - ; Default the DC average to 128 - mov r12, #128 - vdup.u8 q0, r12 - - ; Zero out running sum - mov r12, #0 - - ; compute shift and jump - adds r7, r4, r5 - beq skip_dc_pred_up_left_s - - ; Load above row, if it exists - cmp r4, #0 - beq skip_dc_pred_up_s - - sub r6, r0, r2 - vld1.8 {q1}, [r6] - vpaddl.u8 q2, q1 - vpaddl.u16 q3, q2 - vpaddl.u32 q4, q3 - - vmov.32 r4, d8[0] - vmov.32 r6, d9[0] - - add r12, r4, r6 - - ; Move back to interger registers - -skip_dc_pred_up_s - - cmp r5, #0 - beq skip_dc_pred_left_s - - sub r0, r0, #1 - - ; Load left row, if it exists - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0] - - add r12, r12, r3 - add r12, r12, r4 - add r12, r12, r5 - add r12, r12, r6 - -skip_dc_pred_left_s - add r7, r7, #3 ; Shift - sub r4, r7, #1 - mov r5, #1 - add r12, r12, r5, lsl r4 - mov r5, r12, lsr r7 ; expected_dc - - vdup.u8 q0, r5 - -skip_dc_pred_up_left_s - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 
- vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - - pop {r4-r8,pc} -case_v_pred_s - ; Copy down above row - sub r6, r0, r2 - vld1.8 {q0}, [r6] - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q0}, [r1], r2 - pop {r4-r8,pc} - -case_h_pred_s - ; Load 4x yleft_col - sub r0, r0, #1 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u8 q0, r3 - vdup.u8 q1, r4 - vdup.u8 q2, r5 - vdup.u8 q3, r6 - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - pop {r4-r8,pc} - -case_tm_pred_s - ; Load yabove_row - sub r3, r0, r2 - vld1.8 {q8}, [r3] - - ; Load ytop_left - sub r3, r3, #1 - ldrb r7, [r3] - - vdup.u16 q7, r7 - - ; Compute yabove_row - ytop_left - mov r3, #1 - vdup.u8 q0, r3 - - vmull.u8 q4, d16, d0 - vmull.u8 q5, d17, d0 - - vsub.s16 q4, q4, q7 - vsub.s16 q5, q5, q7 - - ; Load 4x yleft_col - sub r0, r0, #1 - mov r12, #4 - -case_tm_pred_loop_s - ldrb r3, [r0], r2 - ldrb r4, [r0], r2 - ldrb r5, [r0], r2 - ldrb r6, [r0], r2 - vdup.u16 q0, r3 - vdup.u16 q1, r4 - vdup.u16 q2, r5 - vdup.u16 q3, r6 - - vqadd.s16 q8, q0, q4 - vqadd.s16 q9, q0, q5 - - vqadd.s16 q10, q1, q4 - vqadd.s16 q11, q1, q5 - - vqadd.s16 q12, q2, q4 - vqadd.s16 q13, q2, q5 - - vqadd.s16 q14, q3, q4 - vqadd.s16 q15, q3, q5 - - vqshrun.s16 d0, q8, #0 - vqshrun.s16 d1, q9, #0 - - vqshrun.s16 d2, q10, #0 - vqshrun.s16 d3, q11, #0 - - vqshrun.s16 d4, q12, #0 - vqshrun.s16 d5, q13, #0 - - vqshrun.s16 d6, q14, #0 - vqshrun.s16 d7, q15, #0 - - vst1.u8 {q0}, [r1], r2 - vst1.u8 {q1}, [r1], r2 - vst1.u8 {q2}, [r1], r2 - vst1.u8 {q3}, [r1], r2 - - subs r12, r12, #1 - bne case_tm_pred_loop_s - - pop {r4-r8,pc} - - ENDP - - - END diff --git a/vp9/common/arm/neon/copymem16x16_neon.asm b/vp9/common/arm/neon/copymem16x16_neon.asm deleted file mode 100644 index bff8156d9..000000000 --- a/vp9/common/arm/neon/copymem16x16_neon.asm +++ /dev/null @@ -1,59 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp9_copy_mem16x16_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem16x16_neon| PROC - - vld1.u8 {q0}, [r0], r1 - vld1.u8 {q1}, [r0], r1 - vld1.u8 {q2}, [r0], r1 - vst1.u8 {q0}, [r2], r3 - vld1.u8 {q3}, [r0], r1 - vst1.u8 {q1}, [r2], r3 - vld1.u8 {q4}, [r0], r1 - vst1.u8 {q2}, [r2], r3 - vld1.u8 {q5}, [r0], r1 - vst1.u8 {q3}, [r2], r3 - vld1.u8 {q6}, [r0], r1 - vst1.u8 {q4}, [r2], r3 - vld1.u8 {q7}, [r0], r1 - vst1.u8 {q5}, [r2], r3 - vld1.u8 {q8}, [r0], r1 - vst1.u8 {q6}, [r2], r3 - vld1.u8 {q9}, [r0], r1 - vst1.u8 {q7}, [r2], r3 - vld1.u8 {q10}, [r0], r1 - vst1.u8 {q8}, [r2], r3 - vld1.u8 {q11}, [r0], r1 - vst1.u8 {q9}, [r2], r3 - vld1.u8 {q12}, [r0], r1 - vst1.u8 {q10}, [r2], r3 - vld1.u8 {q13}, [r0], r1 - vst1.u8 {q11}, [r2], r3 - vld1.u8 {q14}, [r0], r1 - vst1.u8 {q12}, [r2], r3 - vld1.u8 {q15}, [r0], r1 - vst1.u8 {q13}, [r2], r3 - vst1.u8 {q14}, [r2], r3 - vst1.u8 {q15}, [r2], r3 - - mov pc, lr - - ENDP ; |vp9_copy_mem16x16_neon| - - END diff --git a/vp9/common/arm/neon/copymem8x4_neon.asm b/vp9/common/arm/neon/copymem8x4_neon.asm deleted file mode 100644 index ffd2df8e1..000000000 --- a/vp9/common/arm/neon/copymem8x4_neon.asm +++ /dev/null @@ -1,34 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_copy_mem8x4_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem8x4_neon| PROC - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vst1.u8 {d3}, [r2], r3 - - mov pc, lr - - ENDP ; |vp9_copy_mem8x4_neon| - - END diff --git a/vp9/common/arm/neon/copymem8x8_neon.asm b/vp9/common/arm/neon/copymem8x8_neon.asm deleted file mode 100644 index 2d394c043..000000000 --- a/vp9/common/arm/neon/copymem8x8_neon.asm +++ /dev/null @@ -1,43 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp9_copy_mem8x8_neon| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp9_copy_mem8x8_neon| PROC - - vld1.u8 {d0}, [r0], r1 - vld1.u8 {d1}, [r0], r1 - vst1.u8 {d0}, [r2], r3 - vld1.u8 {d2}, [r0], r1 - vst1.u8 {d1}, [r2], r3 - vld1.u8 {d3}, [r0], r1 - vst1.u8 {d2}, [r2], r3 - vld1.u8 {d4}, [r0], r1 - vst1.u8 {d3}, [r2], r3 - vld1.u8 {d5}, [r0], r1 - vst1.u8 {d4}, [r2], r3 - vld1.u8 {d6}, [r0], r1 - vst1.u8 {d5}, [r2], r3 - vld1.u8 {d7}, [r0], r1 - vst1.u8 {d6}, [r2], r3 - vst1.u8 {d7}, [r2], r3 - - mov pc, lr - - ENDP ; |vp9_copy_mem8x8_neon| - - END diff --git a/vp9/common/arm/neon/dc_only_idct_add_neon.asm b/vp9/common/arm/neon/dc_only_idct_add_neon.asm deleted file mode 100644 index 49ba05fb0..000000000 --- a/vp9/common/arm/neon/dc_only_idct_add_neon.asm +++ /dev/null @@ -1,49 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vp8_dc_only_idct_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, -; unsigned char *dst_ptr, int pitch, int stride) -; r0 input_dc -; r1 pred_ptr -; r2 dst_ptr -; r3 pitch -; sp stride -|vp8_dc_only_idct_add_neon| PROC - add r0, r0, #4 - asr r0, r0, #3 - ldr r12, [sp] - vdup.16 q0, r0 - - vld1.32 {d2[0]}, [r1], r3 - vld1.32 {d2[1]}, [r1], r3 - vld1.32 {d4[0]}, [r1], r3 - vld1.32 {d4[1]}, [r1] - - vaddw.u8 q1, q0, d2 - vaddw.u8 q2, q0, d4 - - vqmovun.s16 d2, q1 - vqmovun.s16 d4, q2 - - vst1.32 {d2[0]}, [r2], r12 - vst1.32 {d2[1]}, [r2], r12 - vst1.32 {d4[0]}, [r2], r12 - vst1.32 {d4[1]}, [r2] - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/iwalsh_neon.asm b/vp9/common/arm/neon/iwalsh_neon.asm deleted file mode 100644 index 01c79d937..000000000 --- a/vp9/common/arm/neon/iwalsh_neon.asm +++ /dev/null @@ -1,80 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - EXPORT |vp8_short_inv_walsh4x4_neon| - EXPORT |vp8_short_inv_walsh4x4_1_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_inv_walsh4x4_neon(short *input, short *output) -|vp8_short_inv_walsh4x4_neon| PROC - - ; read in all four lines of values: d0->d3 - vld1.i16 {q0-q1}, [r0@128] - - ; first for loop - vadd.s16 d4, d0, d3 ;a = [0] + [12] - vadd.s16 d6, d1, d2 ;b = [4] + [8] - vsub.s16 d5, d0, d3 ;d = [0] - [12] - vsub.s16 d7, d1, d2 ;c = [4] - [8] - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vtrn.32 d0, d2 ;d0: 0 1 8 9 - ;d2: 2 3 10 11 - vtrn.32 d1, d3 ;d1: 4 5 12 13 - ;d3: 6 7 14 15 - - vtrn.16 d0, d1 ;d0: 0 4 8 12 - ;d1: 1 5 9 13 - vtrn.16 d2, d3 ;d2: 2 6 10 14 - ;d3: 3 7 11 15 - - ; second for loop - - vadd.s16 d4, d0, d3 ;a = [0] + [3] - vadd.s16 d6, d1, d2 ;b = [1] + [2] - vsub.s16 d5, d0, d3 ;d = [0] - [3] - vsub.s16 d7, d1, d2 ;c = [1] - [2] - - vmov.i16 q8, #3 - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vadd.i16 q0, q0, q8 ;e/f += 3 - vadd.i16 q1, q1, q8 ;g/h += 3 - - vshr.s16 q0, q0, #3 ;e/f >> 3 - vshr.s16 q1, q1, #3 ;g/h >> 3 - - vst4.i16 {d0,d1,d2,d3}, [r1@128] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_neon| - - -;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) -|vp8_short_inv_walsh4x4_1_neon| PROC - ldrsh r2, [r0] ; load input[0] - add r3, r2, #3 ; add 3 - add r2, r1, #16 ; base for last 8 output - asr r0, r3, #3 ; right shift 3 - vdup.16 q0, r0 ; load and duplicate - vst1.16 {q0}, [r1@128] ; write back 8 - vst1.16 {q0}, [r2@128] ; write back last 8 - bx lr - ENDP ; |vp8_short_inv_walsh4x4_1_neon| - - END diff --git a/vp9/common/arm/neon/loopfilter_neon.asm b/vp9/common/arm/neon/loopfilter_neon.asm deleted file mode 100644 index bc6616734..000000000 --- a/vp9/common/arm/neon/loopfilter_neon.asm +++ /dev/null @@ -1,397 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
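
For reference, the two butterfly passes in the inverse Walsh-Hadamard routine deleted above reduce to the following scalar form. This is a sketch reconstructed from the comments in the deleted assembly (the function name is illustrative, not part of the patch); only the second pass applies the (+3) >> 3 rounding, matching the vmov.i16 q8, #3 / vshr.s16 #3 pair above.

    static void inv_walsh4x4_ref(const short *input, short *output) {
      short tmp[16];
      int i;
      for (i = 0; i < 4; i++) {                 /* first pass: columns */
        int a = input[i]     + input[12 + i];   /* a = [0] + [12] */
        int b = input[4 + i] + input[8 + i];    /* b = [4] + [8]  */
        int d = input[i]     - input[12 + i];   /* d = [0] - [12] */
        int c = input[4 + i] - input[8 + i];    /* c = [4] - [8]  */
        tmp[i]      = (short)(a + b);
        tmp[4 + i]  = (short)(d + c);
        tmp[8 + i]  = (short)(a - b);
        tmp[12 + i] = (short)(d - c);
      }
      for (i = 0; i < 4; i++) {                 /* second pass: rows */
        int a = tmp[4 * i]     + tmp[4 * i + 3];
        int b = tmp[4 * i + 1] + tmp[4 * i + 2];
        int d = tmp[4 * i]     - tmp[4 * i + 3];
        int c = tmp[4 * i + 1] - tmp[4 * i + 2];
        output[4 * i]     = (short)((a + b + 3) >> 3);
        output[4 * i + 1] = (short)((d + c + 3) >> 3);
        output[4 * i + 2] = (short)((a - b + 3) >> 3);
        output[4 * i + 3] = (short)((d - c + 3) >> 3);
      }
    }
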
-; - - - EXPORT |vp9_loop_filter_horizontal_edge_y_neon| - EXPORT |vp9_loop_filter_horizontal_edge_uv_neon| - EXPORT |vp9_loop_filter_vertical_edge_y_neon| - EXPORT |vp9_loop_filter_vertical_edge_uv_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp9_loop_filter_horizontal_edge_y_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r3, [sp, #4] ; load thresh - add r12, r2, r1 - add r1, r1, r1 - - vdup.u8 q2, r3 ; duplicate thresh - - vld1.u8 {q3}, [r2@128], r1 ; p3 - vld1.u8 {q4}, [r12@128], r1 ; p2 - vld1.u8 {q5}, [r2@128], r1 ; p1 - vld1.u8 {q6}, [r12@128], r1 ; p0 - vld1.u8 {q7}, [r2@128], r1 ; q0 - vld1.u8 {q8}, [r12@128], r1 ; q1 - vld1.u8 {q9}, [r2@128] ; q2 - vld1.u8 {q10}, [r12@128] ; q3 - - sub r2, r2, r1, lsl #1 - sub r12, r12, r1, lsl #1 - - bl vp9_loop_filter_neon - - vst1.u8 {q5}, [r2@128], r1 ; store op1 - vst1.u8 {q6}, [r12@128], r1 ; store op0 - vst1.u8 {q7}, [r2@128], r1 ; store oq0 - vst1.u8 {q8}, [r12@128], r1 ; store oq1 - - pop {pc} - ENDP ; |vp9_loop_filter_horizontal_edge_y_neon| - - -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp9_loop_filter_horizontal_edge_uv_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - ldr r12, [sp, #4] ; load thresh - ldr r2, [sp, #8] ; load v ptr - vdup.u8 q2, r12 ; duplicate thresh - - sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines - sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r3@64], r1 ; p3 - vld1.u8 {d7}, [r12@64], r1 ; p3 - vld1.u8 {d8}, [r3@64], r1 ; p2 - vld1.u8 {d9}, [r12@64], r1 ; p2 - vld1.u8 {d10}, [r3@64], r1 ; p1 - vld1.u8 {d11}, [r12@64], r1 ; p1 - vld1.u8 {d12}, [r3@64], r1 ; p0 - vld1.u8 {d13}, [r12@64], r1 ; p0 - vld1.u8 {d14}, [r3@64], r1 ; q0 - vld1.u8 {d15}, [r12@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r12@64], r1 ; q1 - vld1.u8 {d18}, [r3@64], r1 ; q2 - vld1.u8 {d19}, [r12@64], r1 ; q2 - vld1.u8 {d20}, [r3@64] ; q3 - vld1.u8 {d21}, [r12@64] ; q3 - - bl vp9_loop_filter_neon - - sub r0, r0, r1, lsl #1 - sub r2, r2, r1, lsl #1 - - vst1.u8 {d10}, [r0@64], r1 ; store u op1 - vst1.u8 {d11}, [r2@64], r1 ; store v op1 - vst1.u8 {d12}, [r0@64], r1 ; store u op0 - vst1.u8 {d13}, [r2@64], r1 ; store v op0 - vst1.u8 {d14}, [r0@64], r1 ; store u oq0 - vst1.u8 {d15}, [r2@64], r1 ; store v oq0 - vst1.u8 {d16}, [r0@64] ; store u oq1 - vst1.u8 {d17}, [r2@64] ; store v oq1 - - pop {pc} - ENDP ; |vp9_loop_filter_horizontal_edge_uv_neon| - -; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, - -|vp9_loop_filter_vertical_edge_y_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, #4 ; src ptr down by 4 columns - add r1, r1, r1 - ldr r3, [sp, #4] ; load thresh - add r12, r2, r1, asr #1 - - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d10}, [r2], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d14}, [r2], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d18}, [r2], r1 - vld1.u8 {d20}, [r12], r1 - 
- vld1.u8 {d7}, [r2], r1 ; load second 8-line src data - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d11}, [r2], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d15}, [r2], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d19}, [r2] - vld1.u8 {d21}, [r12] - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r3 ; duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp9_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - - sub r0, r0, #2 ; dst ptr - - vswp d14, d12 - vswp d16, d15 - - add r12, r0, r1, asr #1 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 - - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] - - pop {pc} - ENDP ; |vp9_loop_filter_vertical_edge_y_neon| - -; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp9_loop_filter_vertical_edge_uv_neon| PROC - push {lr} - vdup.u8 q0, r2 ; duplicate blimit - sub r12, r0, #4 ; move u pointer down by 4 columns - ldr r2, [sp, #8] ; load v ptr - vdup.u8 q1, r3 ; duplicate limit - sub r3, r2, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r12], r1 ;load u data - vld1.u8 {d7}, [r3], r1 ;load v data - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d9}, [r3], r1 - vld1.u8 {d10}, [r12], r1 - vld1.u8 {d11}, [r3], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d13}, [r3], r1 - vld1.u8 {d14}, [r12], r1 - vld1.u8 {d15}, [r3], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d17}, [r3], r1 - vld1.u8 {d18}, [r12], r1 - vld1.u8 {d19}, [r3], r1 - vld1.u8 {d20}, [r12] - vld1.u8 {d21}, [r3] - - ldr r12, [sp, #4] ; load thresh - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r12 ; duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp9_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - vswp d14, d12 - vswp d16, d15 - - sub r0, r0, #2 - sub r2, r2, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, 
[r0], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] - - pop {pc} - ENDP ; |vp9_loop_filter_vertical_edge_uv_neon| - -; void vp9_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. - -; r0-r3 PRESERVE -; q0 flimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -|vp9_loop_filter_neon| PROC - - ; vp9_filter_mask - vabd.u8 q11, q3, q4 ; abs(p3 - p2) - vabd.u8 q12, q4, q5 ; abs(p2 - p1) - vabd.u8 q13, q5, q6 ; abs(p1 - p0) - vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q3, q9, q8 ; abs(q2 - q1) - vabd.u8 q4, q10, q9 ; abs(q3 - q2) - - vmax.u8 q11, q11, q12 - vmax.u8 q12, q13, q14 - vmax.u8 q3, q3, q4 - vmax.u8 q15, q11, q12 - - vabd.u8 q9, q6, q7 ; abs(p0 - q0) - - ; vp8_hevmask - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 - - vmov.u8 q10, #0x80 ; 0x80 - - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - - vcge.u8 q15, q1, q15 - - ; vp9_filter() function - ; convert to signed - veor q7, q7, q10 ; qs0 - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 - - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a - - veor q8, q8, q10 ; qs1 - - vmov.u8 q10, #3 ; #3 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 - - vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - - vmovl.u8 q4, d20 - - vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; vp8_hevmask - - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 - - vand q1, q1, q14 ; vp9_filter &= hev - vand q15, q15, q9 ; vp9_filter_mask - - vaddw.s8 q2, q2, d2 - vaddw.s8 q11, q11, d3 - - vmov.u8 q9, #4 ; #4 - - ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; vp9_filter &= mask - - vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp9_filter+3) - vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp9_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q1, q1, #3 ; Filter1 >>= 3 - - - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) - vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) - - ; outer tap adjustments: ++vp9_filter >> 1 - vrshr.s8 q1, q1, #1 - vbic q1, q1, q14 ; vp9_filter &= ~hev - vmov.u8 q0, #0x80 ; 0x80 - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp9_filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp9_filter) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - veor q5, q13, q0 ; *op1 = u^0x80 - veor q8, q12, q0 ; *oq1 = u^0x80 - - bx lr - ENDP ; |vp9_loop_filter_horizontal_edge_y_neon| - -;----------------- - - END diff --git a/vp9/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp9/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm deleted file mode 100644 index eb07ce0d5..000000000 --- a/vp9/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm +++ /dev/null @@ -1,117 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. 
An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - ;EXPORT |vp9_loop_filter_simple_horizontal_edge_neon| - EXPORT |vp9_loop_filter_bhs_neon| - EXPORT |vp9_loop_filter_mbhs_neon| - ARM - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *s, PRESERVE -; r1 int p, PRESERVE -; q1 limit, PRESERVE - -|vp9_loop_filter_simple_horizontal_edge_neon| PROC - - sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines - - vld1.u8 {q7}, [r0@128], r1 ; q0 - vld1.u8 {q5}, [r3@128], r1 ; p0 - vld1.u8 {q8}, [r0@128] ; q1 - vld1.u8 {q6}, [r3@128] ; p1 - - vabd.u8 q15, q6, q7 ; abs(p0 - q0) - vabd.u8 q14, q5, q8 ; abs(p1 - q1) - - vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 - vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 - vmov.u8 q0, #0x80 ; 0x80 - vmov.s16 q13, #3 - vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - - veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value - veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value - veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value - veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value - - vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q3, d15, d13 - - vqsub.s8 q4, q5, q8 ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1) - - vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) - vmul.s16 q3, q3, q13 - - vmov.u8 q10, #0x03 ; 0x03 - vmov.u8 q9, #0x04 ; 0x04 - - vaddw.s8 q2, q2, d8 ; vp9_filter + 3 * ( qs0 - ps0) - vaddw.s8 q3, q3, d9 - - vqmovn.s16 d8, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d9, q3 - - vand q14, q4, q15 ; vp9_filter &= mask - - vqadd.s8 q2, q14, q10 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3) - vqadd.s8 q3, q14, q9 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q4, q3, #3 ; Filter1 >>= 3 - - sub r0, r0, r1 - - ;calculate output - vqadd.s8 q11, q6, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2) - vqsub.s8 q10, q7, q4 ; u = vp9_signed_char_clamp(qs0 - Filter1) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - - vst1.u8 {q6}, [r3@128] ; store op0 - vst1.u8 {q7}, [r0@128] ; store oq0 - - bx lr - ENDP ; |vp9_loop_filter_simple_horizontal_edge_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp9_loop_filter_bhs_neon| PROC - push {r4, lr} - ldrb r3, [r2] ; load blim from mem - vdup.s8 q1, r3 ; duplicate blim - - add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride - bl vp9_loop_filter_simple_horizontal_edge_neon - ; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 - add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride - bl vp9_loop_filter_simple_horizontal_edge_neon - add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride - pop {r4, lr} - b vp9_loop_filter_simple_horizontal_edge_neon - ENDP ;|vp9_loop_filter_bhs_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp9_loop_filter_mbhs_neon| PROC - ldrb r3, [r2] ; load blim from mem - vdup.s8 q1, r3 ; duplicate mblim - b vp9_loop_filter_simple_horizontal_edge_neon - ENDP ;|vp9_loop_filter_bhs_neon| - - END diff --git a/vp9/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp9/common/arm/neon/loopfiltersimpleverticaledge_neon.asm deleted file mode 100644 index d5cf8c2b5..000000000 --- 
a/vp9/common/arm/neon/loopfiltersimpleverticaledge_neon.asm +++ /dev/null @@ -1,154 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - ;EXPORT |vp9_loop_filter_simple_vertical_edge_neon| - EXPORT |vp9_loop_filter_bvs_neon| - EXPORT |vp9_loop_filter_mbvs_neon| - ARM - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *s, PRESERVE -; r1 int p, PRESERVE -; q1 limit, PRESERVE - -|vp9_loop_filter_simple_vertical_edge_neon| PROC - sub r0, r0, #2 ; move src pointer down by 2 columns - add r12, r1, r1 - add r3, r0, r1 - - vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 - vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 - vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 - vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 - vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 - vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 - vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 - vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 - - vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 - vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 - vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 - vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 - vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 - vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 - vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 - vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] - - vswp d7, d10 - vswp d12, d9 - - ;vp9_filter_mask() function - ;vp8_hevmask() function - sub r0, r0, r1, lsl #4 - vabd.u8 q15, q5, q4 ; abs(p0 - q0) - vabd.u8 q14, q3, q6 ; abs(p1 - q1) - - vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 - vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 - vmov.u8 q0, #0x80 ; 0x80 - vmov.s16 q11, #3 - vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 - - veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value - veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value - veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value - veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value - - vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 - - vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) - vsubl.s8 q13, d9, d11 - - vqsub.s8 q14, q3, q6 ; vp9_filter = vp9_signed_char_clamp(ps1-qs1) - - vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) - vmul.s16 q13, q13, q11 - - vmov.u8 q11, #0x03 ; 0x03 - vmov.u8 q12, #0x04 ; 0x04 - - vaddw.s8 q2, q2, d28 ; vp9_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d29 - - vqmovn.s16 d28, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d29, q13 - - add r0, r0, #1 - add r3, r0, r1 - - vand q14, q14, q15 ; vp9_filter &= mask - - vqadd.s8 q2, q14, q11 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3) - vqadd.s8 q3, q14, q12 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q14, q3, #3 ; Filter1 >>= 3 - - ;calculate output - vqadd.s8 q11, q5, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2) - vqsub.s8 q10, q4, q14 ; u = vp9_signed_char_clamp(qs0 - Filter1) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - add r12, r1, r1 - vswp d13, d14 - - ;store op1, op0, oq0, oq1 - vst2.8 {d12[0], d13[0]}, [r0], r12 - vst2.8 {d12[1], 
d13[1]}, [r3], r12 - vst2.8 {d12[2], d13[2]}, [r0], r12 - vst2.8 {d12[3], d13[3]}, [r3], r12 - vst2.8 {d12[4], d13[4]}, [r0], r12 - vst2.8 {d12[5], d13[5]}, [r3], r12 - vst2.8 {d12[6], d13[6]}, [r0], r12 - vst2.8 {d12[7], d13[7]}, [r3], r12 - vst2.8 {d14[0], d15[0]}, [r0], r12 - vst2.8 {d14[1], d15[1]}, [r3], r12 - vst2.8 {d14[2], d15[2]}, [r0], r12 - vst2.8 {d14[3], d15[3]}, [r3], r12 - vst2.8 {d14[4], d15[4]}, [r0], r12 - vst2.8 {d14[5], d15[5]}, [r3], r12 - vst2.8 {d14[6], d15[6]}, [r0], r12 - vst2.8 {d14[7], d15[7]}, [r3] - - bx lr - ENDP ; |vp9_loop_filter_simple_vertical_edge_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp9_loop_filter_bvs_neon| PROC - push {r4, lr} - ldrb r3, [r2] ; load blim from mem - mov r4, r0 - add r0, r0, #4 - vdup.s8 q1, r3 ; duplicate blim - bl vp9_loop_filter_simple_vertical_edge_neon - ; vp9_loop_filter_simple_vertical_edge_neon preserves r1 and q1 - add r0, r4, #8 - bl vp9_loop_filter_simple_vertical_edge_neon - add r0, r4, #12 - pop {r4, lr} - b vp9_loop_filter_simple_vertical_edge_neon - ENDP ;|vp9_loop_filter_bvs_neon| - -; r0 unsigned char *y -; r1 int ystride -; r2 const unsigned char *blimit - -|vp9_loop_filter_mbvs_neon| PROC - ldrb r3, [r2] ; load mblim from mem - vdup.s8 q1, r3 ; duplicate mblim - b vp9_loop_filter_simple_vertical_edge_neon - ENDP ;|vp9_loop_filter_bvs_neon| - END diff --git a/vp9/common/arm/neon/mbloopfilter_neon.asm b/vp9/common/arm/neon/mbloopfilter_neon.asm deleted file mode 100644 index 19b67f47d..000000000 --- a/vp9/common/arm/neon/mbloopfilter_neon.asm +++ /dev/null @@ -1,469 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
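
The "simple" loop filters deleted above (horizontal and vertical, plus the bhs/bvs/mbhs/mbvs wrappers) all apply the same per-pixel update; only the edge orientation and the blimit value differ. A scalar sketch of that update, following the comments in the deleted code (helper names here are illustrative):

    #include <stdlib.h>

    static signed char clamp_s8(int v) {            /* vp9_signed_char_clamp */
      return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* Filter one pixel pair across the edge; p1/p0 sit on one side,
     * q0/q1 on the other. */
    static void simple_filter(unsigned char *p0, unsigned char *q0,
                              unsigned char p1, unsigned char q1,
                              unsigned char blimit) {
      if (abs(*p0 - *q0) * 2 + abs(p1 - q1) / 2 <= blimit) {
        /* offset by 0x80 to work in signed arithmetic, as the asm does */
        signed char ps1 = (signed char)(p1 ^ 0x80);
        signed char qs1 = (signed char)(q1 ^ 0x80);
        signed char ps0 = (signed char)(*p0 ^ 0x80);
        signed char qs0 = (signed char)(*q0 ^ 0x80);
        int f  = clamp_s8(clamp_s8(ps1 - qs1) + 3 * (qs0 - ps0));
        int f1 = clamp_s8(f + 4) >> 3;              /* Filter1 */
        int f2 = clamp_s8(f + 3) >> 3;              /* Filter2 */
        *q0 = (unsigned char)(clamp_s8(qs0 - f1) ^ 0x80);
        *p0 = (unsigned char)(clamp_s8(ps0 + f2) ^ 0x80);
      }
    }
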
-; - - - EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon| - EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon| - EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| - EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh) -; r0 unsigned char *src, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_mbloop_filter_horizontal_edge_y_neon| PROC - push {lr} - add r1, r1, r1 ; double stride - ldr r12, [sp, #4] ; load thresh - sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines - vdup.u8 q2, r12 ; thresh - add r12, r0, r1, lsr #1 ; move src pointer up by 1 line - - vld1.u8 {q3}, [r0@128], r1 ; p3 - vld1.u8 {q4}, [r12@128], r1 ; p2 - vld1.u8 {q5}, [r0@128], r1 ; p1 - vld1.u8 {q6}, [r12@128], r1 ; p0 - vld1.u8 {q7}, [r0@128], r1 ; q0 - vld1.u8 {q8}, [r12@128], r1 ; q1 - vld1.u8 {q9}, [r0@128], r1 ; q2 - vld1.u8 {q10}, [r12@128], r1 ; q3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #2 - add r0, r12, r1, lsr #1 - - vst1.u8 {q4}, [r12@128],r1 ; store op2 - vst1.u8 {q5}, [r0@128],r1 ; store op1 - vst1.u8 {q6}, [r12@128], r1 ; store op0 - vst1.u8 {q7}, [r0@128],r1 ; store oq0 - vst1.u8 {q8}, [r12@128] ; store oq1 - vst1.u8 {q9}, [r0@128] ; store oq2 - - pop {pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| - -; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v - -|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines - vdup.u8 q2, r12 ; thresh - ldr r12, [sp, #8] ; load v ptr - sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r0@64], r1 ; p3 - vld1.u8 {d7}, [r12@64], r1 ; p3 - vld1.u8 {d8}, [r0@64], r1 ; p2 - vld1.u8 {d9}, [r12@64], r1 ; p2 - vld1.u8 {d10}, [r0@64], r1 ; p1 - vld1.u8 {d11}, [r12@64], r1 ; p1 - vld1.u8 {d12}, [r0@64], r1 ; p0 - vld1.u8 {d13}, [r12@64], r1 ; p0 - vld1.u8 {d14}, [r0@64], r1 ; q0 - vld1.u8 {d15}, [r12@64], r1 ; q0 - vld1.u8 {d16}, [r0@64], r1 ; q1 - vld1.u8 {d17}, [r12@64], r1 ; q1 - vld1.u8 {d18}, [r0@64], r1 ; q2 - vld1.u8 {d19}, [r12@64], r1 ; q2 - vld1.u8 {d20}, [r0@64], r1 ; q3 - vld1.u8 {d21}, [r12@64], r1 ; q3 - - bl vp8_mbloop_filter_neon - - sub r0, r0, r1, lsl #3 - sub r12, r12, r1, lsl #3 - - add r0, r0, r1 - add r12, r12, r1 - - vst1.u8 {d8}, [r0@64], r1 ; store u op2 - vst1.u8 {d9}, [r12@64], r1 ; store v op2 - vst1.u8 {d10}, [r0@64], r1 ; store u op1 - vst1.u8 {d11}, [r12@64], r1 ; store v op1 - vst1.u8 {d12}, [r0@64], r1 ; store u op0 - vst1.u8 {d13}, [r12@64], r1 ; store v op0 - vst1.u8 {d14}, [r0@64], r1 ; store u oq0 - vst1.u8 {d15}, [r12@64], r1 ; store v oq0 - vst1.u8 {d16}, [r0@64], r1 ; store u oq1 - vst1.u8 {d17}, [r12@64], r1 ; store v oq1 - vst1.u8 {d18}, [r0@64], r1 ; store u oq2 - vst1.u8 {d19}, [r12@64], r1 ; store v oq2 - - pop {pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| - -; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char 
*thresh) -; r0 unsigned char *src, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_mbloop_filter_vertical_edge_y_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, #4 ; move src pointer down by 4 columns - vdup.s8 q2, r12 ; thresh - add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines - - vld1.u8 {d6}, [r0], r1 ; load first 8-line src data - vld1.u8 {d7}, [r12], r1 ; load second 8-line src data - vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r12], r1 - vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r12], r1 - vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r12], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r12], r1 - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - sub r0, r0, r1, lsl #3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #3 - - ;transpose to 16x8 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - ;store op2, op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r12], r1 - vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r12], r1 - vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r12], r1 - vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r12], r1 - vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r12], r1 - vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r12], r1 - vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r12], r1 - vst1.8 {d20}, [r0] - vst1.8 {d21}, [r12] - - pop {pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| - -; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, -; const unsigned char *blimit, -; const unsigned char *limit, -; const unsigned char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 const signed char *flimit, -; r3 const signed char *limit, -; sp const signed char *thresh, -; sp+4 unsigned char *v -|vp8_mbloop_filter_vertical_edge_uv_neon| PROC - push {lr} - ldr r12, [sp, #4] ; load thresh - sub r0, r0, #4 ; move u pointer down by 4 columns - vdup.u8 q2, r12 ; thresh - ldr r12, [sp, #8] ; load v ptr - sub r12, r12, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r0], r1 ;load u data - vld1.u8 {d7}, [r12], r1 ;load v data - vld1.u8 {d8}, [r0], r1 - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d10}, [r0], r1 - vld1.u8 {d11}, [r12], r1 - vld1.u8 {d12}, [r0], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d14}, [r0], r1 - vld1.u8 {d15}, [r12], r1 - vld1.u8 {d16}, [r0], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d18}, [r0], r1 - vld1.u8 {d19}, [r12], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r12], r1 - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - sub r0, r0, r1, lsl #3 - - bl vp8_mbloop_filter_neon - - sub r12, r12, r1, lsl #3 - - ;transpose to 16x8 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - ;store op2, 
op1, op0, oq0, oq1, oq2 - vst1.8 {d6}, [r0], r1 - vst1.8 {d7}, [r12], r1 - vst1.8 {d8}, [r0], r1 - vst1.8 {d9}, [r12], r1 - vst1.8 {d10}, [r0], r1 - vst1.8 {d11}, [r12], r1 - vst1.8 {d12}, [r0], r1 - vst1.8 {d13}, [r12], r1 - vst1.8 {d14}, [r0], r1 - vst1.8 {d15}, [r12], r1 - vst1.8 {d16}, [r0], r1 - vst1.8 {d17}, [r12], r1 - vst1.8 {d18}, [r0], r1 - vst1.8 {d19}, [r12], r1 - vst1.8 {d20}, [r0] - vst1.8 {d21}, [r12] - - pop {pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| - -; void vp8_mbloop_filter_neon() -; This is a helper function for the macroblock loopfilters. The individual -; functions do the necessary load, transpose (if necessary), preserve (if -; necessary) and store. - -; r0,r1 PRESERVE -; r2 mblimit -; r3 limit - -; q2 thresh -; q3 p3 PRESERVE -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 PRESERVE - -|vp8_mbloop_filter_neon| PROC - - ; vp9_filter_mask - vabd.u8 q11, q3, q4 ; abs(p3 - p2) - vabd.u8 q12, q4, q5 ; abs(p2 - p1) - vabd.u8 q13, q5, q6 ; abs(p1 - p0) - vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q1, q9, q8 ; abs(q2 - q1) - vabd.u8 q0, q10, q9 ; abs(q3 - q2) - - vmax.u8 q11, q11, q12 - vmax.u8 q12, q13, q14 - vmax.u8 q1, q1, q0 - vmax.u8 q15, q11, q12 - - vabd.u8 q12, q6, q7 ; abs(p0 - q0) - - ; vp8_hevmask - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 - vmax.u8 q15, q15, q1 - - vdup.u8 q1, r3 ; limit - vdup.u8 q2, r2 ; mblimit - - vmov.u8 q0, #0x80 ; 0x80 - - vcge.u8 q15, q1, q15 - - vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 - vmov.u16 q11, #3 ; #3 - - ; vp9_filter - ; convert to signed - veor q7, q7, q0 ; qs0 - vshr.u8 q1, q1, #1 ; a = a / 2 - veor q6, q6, q0 ; ps0 - veor q5, q5, q0 ; ps1 - - vqadd.u8 q12, q12, q1 ; a = b + a - - veor q8, q8, q0 ; qs1 - veor q4, q4, q0 ; ps2 - veor q9, q9, q0 ; qs2 - - vorr q14, q13, q14 ; vp8_hevmask - - vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 - - vsubl.s8 q2, d14, d12 ; qs0 - ps0 - vsubl.s8 q13, d15, d13 - - vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1) - - vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) - - vand q15, q15, q12 ; vp9_filter_mask - - vmul.i16 q13, q13, q11 - - vmov.u8 q12, #3 ; #3 - - vaddw.s8 q2, q2, d2 ; vp9_filter + 3 * ( qs0 - ps0) - vaddw.s8 q13, q13, d3 - - vmov.u8 q11, #4 ; #4 - - ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q13 - - vand q1, q1, q15 ; vp9_filter &= mask - - vmov.u16 q15, #63 ; #63 - - vand q13, q1, q14 ; Filter2 &= hev - - vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) - vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - - vmov q0, q15 - - vshr.s8 q2, q2, #3 ; Filter1 >>= 3 - vshr.s8 q13, q13, #3 ; Filter2 >>= 3 - - vmov q11, q15 - vmov q12, q15 - - vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - - vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) - - vbic q1, q1, q14 ; vp9_filter &= ~hev - - ; roughly 1/7th difference across boundary - ; roughly 2/7th difference across boundary - ; roughly 3/7th difference across boundary - - vmov.u8 d5, #9 ; #9 - vmov.u8 d4, #18 ; #18 - - vmov q13, q15 - vmov q14, q15 - - vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 - vmlal.s8 q11, d3, d5 - vmov.u8 d5, #27 ; #27 - vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 - vmlal.s8 q13, d3, d4 - vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 - vmlal.s8 q15, d3, d5 - - vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) - vqshrn.s16 d1, q11, #7 - vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) - vqshrn.s16 d25, q13, #7 - 
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) - vqshrn.s16 d29, q15, #7 - - vmov.u8 q1, #0x80 ; 0x80 - - vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) - vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) - vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) - vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) - vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) - vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) - - veor q9, q11, q1 ; *oq2 = s^0x80 - veor q4, q0, q1 ; *op2 = s^0x80 - veor q8, q13, q1 ; *oq1 = s^0x80 - veor q5, q12, q1 ; *op2 = s^0x80 - veor q7, q15, q1 ; *oq0 = s^0x80 - veor q6, q14, q1 ; *op0 = s^0x80 - - bx lr - ENDP ; |vp8_mbloop_filter_neon| - -;----------------- - - END diff --git a/vp9/common/arm/neon/recon16x16mb_neon.asm b/vp9/common/arm/neon/recon16x16mb_neon.asm deleted file mode 100644 index 3f1a30f48..000000000 --- a/vp9/common/arm/neon/recon16x16mb_neon.asm +++ /dev/null @@ -1,131 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon16x16mb_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int ystride, -; stack unsigned char *udst_ptr, -; stack unsigned char *vdst_ptr - -|vp8_recon16x16mb_neon| PROC - mov r12, #4 ;loop counter for Y loop - -recon16x16mb_loop_y - vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr - vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr - vld1.u8 {q14, q15}, [r0]! - vld1.16 {q10, q11}, [r1]! - - vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d25 - vmovl.u8 q2, d26 - vmovl.u8 q3, d27 - vmovl.u8 q4, d28 - vmovl.u8 q5, d29 - vmovl.u8 q6, d30 - vld1.16 {q12, q13}, [r1]! - vmovl.u8 q7, d31 - vld1.16 {q14, q15}, [r1]! - - pld [r0] - pld [r1] - pld [r1, #64] - - vadd.s16 q0, q0, q8 ;add Diff data and Pred data together - vadd.s16 q1, q1, q9 - vadd.s16 q2, q2, q10 - vadd.s16 q3, q3, q11 - vadd.s16 q4, q4, q12 - vadd.s16 q5, q5, q13 - vadd.s16 q6, q6, q14 - vadd.s16 q7, q7, q15 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - vqmovun.s16 d4, q4 - vqmovun.s16 d5, q5 - vst1.u8 {q0}, [r2], r3 ;store result - vqmovun.s16 d6, q6 - vst1.u8 {q1}, [r2], r3 - vqmovun.s16 d7, q7 - vst1.u8 {q2}, [r2], r3 - subs r12, r12, #1 - - moveq r12, #2 ;loop counter for UV loop - - vst1.u8 {q3}, [r2], r3 - bne recon16x16mb_loop_y - - mov r3, r3, lsr #1 ;uv_stride = ystride>>1 - ldr r2, [sp] ;load upred_ptr - -recon16x16mb_loop_uv - vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr - vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr - vld1.u8 {q14, q15}, [r0]! - vld1.16 {q10, q11}, [r1]! - - vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d25 - vmovl.u8 q2, d26 - vmovl.u8 q3, d27 - vmovl.u8 q4, d28 - vmovl.u8 q5, d29 - vmovl.u8 q6, d30 - vld1.16 {q12, q13}, [r1]! - vmovl.u8 q7, d31 - vld1.16 {q14, q15}, [r1]! 
- - vadd.s16 q0, q0, q8 ;add Diff data and Pred data together - vadd.s16 q1, q1, q9 - vadd.s16 q2, q2, q10 - vadd.s16 q3, q3, q11 - vadd.s16 q4, q4, q12 - vadd.s16 q5, q5, q13 - vadd.s16 q6, q6, q14 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vadd.s16 q7, q7, q15 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - vst1.u8 {d0}, [r2], r3 ;store result - vqmovun.s16 d4, q4 - vst1.u8 {d1}, [r2], r3 - vqmovun.s16 d5, q5 - vst1.u8 {d2}, [r2], r3 - vqmovun.s16 d6, q6 - vst1.u8 {d3}, [r2], r3 - vqmovun.s16 d7, q7 - vst1.u8 {d4}, [r2], r3 - subs r12, r12, #1 - - vst1.u8 {d5}, [r2], r3 - vst1.u8 {d6}, [r2], r3 - vst1.u8 {d7}, [r2], r3 - - ldrne r2, [sp, #4] ;load vpred_ptr - bne recon16x16mb_loop_uv - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/recon2b_neon.asm b/vp9/common/arm/neon/recon2b_neon.asm deleted file mode 100644 index 99b251c91..000000000 --- a/vp9/common/arm/neon/recon2b_neon.asm +++ /dev/null @@ -1,54 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon2b_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int stride - -|vp8_recon2b_neon| PROC - vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr - vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr - - vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits - vld1.16 {q6, q7}, [r1]! - vmovl.u8 q1, d17 - vmovl.u8 q2, d18 - vmovl.u8 q3, d19 - - vadd.s16 q0, q0, q4 ;add Diff data and Pred data together - vadd.s16 q1, q1, q5 - vadd.s16 q2, q2, q6 - vadd.s16 q3, q3, q7 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - add r0, r2, r3 - - vst1.u8 {d0}, [r2] ;store result - vst1.u8 {d1}, [r0], r3 - add r2, r0, r3 - vst1.u8 {d2}, [r0] - vst1.u8 {d3}, [r2], r3 - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/recon4b_neon.asm b/vp9/common/arm/neon/recon4b_neon.asm deleted file mode 100644 index 991727746..000000000 --- a/vp9/common/arm/neon/recon4b_neon.asm +++ /dev/null @@ -1,69 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon4b_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int stride - -|vp8_recon4b_neon| PROC - vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr - vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr - vld1.u8 {q14, q15}, [r0] - vld1.16 {q10, q11}, [r1]! - - vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d25 - vmovl.u8 q2, d26 - vmovl.u8 q3, d27 - vmovl.u8 q4, d28 - vmovl.u8 q5, d29 - vmovl.u8 q6, d30 - vld1.16 {q12, q13}, [r1]! 
- vmovl.u8 q7, d31 - vld1.16 {q14, q15}, [r1] - - vadd.s16 q0, q0, q8 ;add Diff data and Pred data together - vadd.s16 q1, q1, q9 - vadd.s16 q2, q2, q10 - vadd.s16 q3, q3, q11 - vadd.s16 q4, q4, q12 - vadd.s16 q5, q5, q13 - vadd.s16 q6, q6, q14 - vadd.s16 q7, q7, q15 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - vqmovun.s16 d4, q4 - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - add r0, r2, r3 - - vst1.u8 {q0}, [r2] ;store result - vst1.u8 {q1}, [r0], r3 - add r2, r0, r3 - vst1.u8 {q2}, [r0] - vst1.u8 {q3}, [r2], r3 - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/recon_neon.c b/vp9/common/arm/neon/recon_neon.c deleted file mode 100644 index 10fd46feb..000000000 --- a/vp9/common/arm/neon/recon_neon.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include "vp9/common/recon.h" -#include "vp9/common/blockd.h" - -extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr); - -void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) { - unsigned char *pred_ptr = &xd->predictor[0]; - short *diff_ptr = &xd->diff[0]; - unsigned char *dst_ptr = xd->dst.y_buffer; - unsigned char *udst_ptr = xd->dst.u_buffer; - unsigned char *vdst_ptr = xd->dst.v_buffer; - int ystride = xd->dst.y_stride; - /*int uv_stride = xd->dst.uv_stride;*/ - - vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, - udst_ptr, vdst_ptr); -} diff --git a/vp9/common/arm/neon/reconb_neon.asm b/vp9/common/arm/neon/reconb_neon.asm deleted file mode 100644 index 288c0ef01..000000000 --- a/vp9/common/arm/neon/reconb_neon.asm +++ /dev/null @@ -1,61 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_recon_b_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *pred_ptr, -; r1 short *diff_ptr, -; r2 unsigned char *dst_ptr, -; r3 int stride - -|vp8_recon_b_neon| PROC - mov r12, #16 - - vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr - vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr - vld1.u8 {d29}, [r0], r12 - vld1.16 {q11, q12}, [r1]! - vld1.u8 {d30}, [r0], r12 - vld1.16 {q12, q13}, [r1]! 
- vld1.u8 {d31}, [r0], r12 - vld1.16 {q13}, [r1] - - vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits - vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6 - vmovl.u8 q2, d30 - vmovl.u8 q3, d31 - - vadd.s16 d0, d0, d20 ;add Diff data and Pred data together - vadd.s16 d2, d2, d22 - vadd.s16 d4, d4, d24 - vadd.s16 d6, d6, d26 - - vqmovun.s16 d0, q0 ;CLAMP() saturation - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q2 - vqmovun.s16 d3, q3 - add r1, r2, r3 - - vst1.32 {d0[0]}, [r2] ;store result - vst1.32 {d1[0]}, [r1], r3 - add r2, r1, r3 - vst1.32 {d2[0]}, [r1] - vst1.32 {d3[0]}, [r2], r3 - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/save_neon_reg.asm b/vp9/common/arm/neon/save_neon_reg.asm deleted file mode 100644 index 71c3e7077..000000000 --- a/vp9/common/arm/neon/save_neon_reg.asm +++ /dev/null @@ -1,36 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp9_push_neon| - EXPORT |vp9_pop_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -|vp9_push_neon| PROC - vst1.i64 {d8, d9, d10, d11}, [r0]! - vst1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - -|vp9_pop_neon| PROC - vld1.i64 {d8, d9, d10, d11}, [r0]! - vld1.i64 {d12, d13, d14, d15}, [r0]! - bx lr - - ENDP - - END - diff --git a/vp9/common/arm/neon/shortidct4x4llm_1_neon.asm b/vp9/common/arm/neon/shortidct4x4llm_1_neon.asm deleted file mode 100644 index d7bdbae75..000000000 --- a/vp9/common/arm/neon/shortidct4x4llm_1_neon.asm +++ /dev/null @@ -1,67 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_short_idct4x4llm_1_neon| - EXPORT |vp8_dc_only_idct_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); -; r0 short *input; -; r1 short *output; -; r2 int pitch; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -|vp8_short_idct4x4llm_1_neon| PROC - vld1.16 {d0[]}, [r0] ;load input[0] - - add r3, r1, r2 - add r12, r3, r2 - - vrshr.s16 d0, d0, #3 - - add r0, r12, r2 - - vst1.16 {d0}, [r1] - vst1.16 {d0}, [r3] - vst1.16 {d0}, [r12] - vst1.16 {d0}, [r0] - - bx lr - ENDP - -;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); -; r0 short input_dc; -; r1 short *output; -; r2 int pitch; -;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -|vp8_dc_only_idct_neon| PROC - vdup.16 d0, r0 - - add r3, r1, r2 - add r12, r3, r2 - - vrshr.s16 d0, d0, #3 - - add r0, r12, r2 - - vst1.16 {d0}, [r1] - vst1.16 {d0}, [r3] - vst1.16 {d0}, [r12] - vst1.16 {d0}, [r0] - - bx lr - - ENDP - END diff --git a/vp9/common/arm/neon/shortidct4x4llm_neon.asm b/vp9/common/arm/neon/shortidct4x4llm_neon.asm deleted file mode 100644 index b74c31521..000000000 --- a/vp9/common/arm/neon/shortidct4x4llm_neon.asm +++ /dev/null @@ -1,122 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_idct4x4llm_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;************************************************************* -;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) -;r0 short * input -;r1 short * output -;r2 int pitch -;************************************************************* -;static const int cospi8sqrt2minus1=20091; -;static const int sinpi8sqrt2 =35468; -;static const int rounding = 0; -;Optimization note: The resulted data from dequantization are signed 13-bit data that is -;in the range of [-4096, 4095]. This allows to use "vqdmulh"(neon) instruction since -;it won't go out of range (13+16+1=30bits<32bits). This instruction gives the high half -;result of the multiplication that is needed in IDCT. 
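/*
 * A scalar C sketch of the vqdmulh trick the optimization note above
 * describes. vqdmulh.s16 returns the high half of a saturating doubling
 * multiply, (2*a*b) >> 16; with dequantized inputs limited to 13 bits
 * the doubled product needs at most 13+16+1 = 30 bits, so saturation can
 * never trigger. sinpi8sqrt2 = 35468 does not fit in int16_t, so it is
 * stored as 35468 - 65536 and the input is added back afterwards, which
 * is what the "vqadd.s16 q3, q3, q2" lines in the function below
 * compensate for; the 20091 constant likewise encodes cospi8sqrt2 minus
 * 1, so x is added back on that path too. Names are mine; assumes the
 * compiler's right shift of a negative int is arithmetic, as on ARM.
 */
#include <stdint.h>

/* High half of the doubled product, modeling vqdmulh.s16 (saturation
   omitted: it cannot trigger for 13-bit inputs). */
static int16_t vqdmulh_s16_model(int16_t a, int16_t b) {
  return (int16_t)((2 * (int32_t)a * b) >> 16);
}

/* x * 35468 >> 16 without a 17-bit constant: multiply by 35468 - 65536,
   halve the doubled high half, then add x back. */
static int16_t mul_sinpi8sqrt2(int16_t x) {
  const int16_t c = (int16_t)(35468 - 65536);  /* negative as s16 */
  return (int16_t)((vqdmulh_s16_model(x, c) >> 1) + x);
}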
- -|vp8_short_idct4x4llm_neon| PROC - adr r12, idct_coeff - vld1.16 {q1, q2}, [r0] - vld1.16 {d0}, [r12] - - vswp d3, d4 ;q2(vp[4] vp[12]) - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) - vqadd.s16 q4, q4, q2 - - ;d6 - c1:temp1 - ;d7 - d1:temp2 - ;d8 - d1:temp1 - ;d9 - c1:temp2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vswp d3, d4 - - vqdmulh.s16 q3, q2, d0[2] - vqdmulh.s16 q4, q2, d0[0] - - vqadd.s16 d12, d2, d3 ;a1 - vqsub.s16 d13, d2, d3 ;b1 - - vshr.s16 q3, q3, #1 - vshr.s16 q4, q4, #1 - - vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negtive number) - vqadd.s16 q4, q4, q2 - - vqsub.s16 d10, d6, d9 ;c1 - vqadd.s16 d11, d7, d8 ;d1 - - vqadd.s16 d2, d12, d11 - vqadd.s16 d3, d13, d10 - vqsub.s16 d4, d13, d10 - vqsub.s16 d5, d12, d11 - - vrshr.s16 d2, d2, #3 - vrshr.s16 d3, d3, #3 - vrshr.s16 d4, d4, #3 - vrshr.s16 d5, d5, #3 - - add r3, r1, r2 - add r12, r3, r2 - add r0, r12, r2 - - vtrn.32 d2, d4 - vtrn.32 d3, d5 - vtrn.16 d2, d3 - vtrn.16 d4, d5 - - vst1.16 {d2}, [r1] - vst1.16 {d3}, [r3] - vst1.16 {d4}, [r12] - vst1.16 {d5}, [r0] - - bx lr - - ENDP - -;----------------- - -idct_coeff - DCD 0x4e7b4e7b, 0x8a8c8a8c - -;20091, 20091, 35468, 35468 - - END diff --git a/vp9/common/arm/neon/sixtappredict16x16_neon.asm b/vp9/common/arm/neon/sixtappredict16x16_neon.asm deleted file mode 100644 index 5e83f49f5..000000000 --- a/vp9/common/arm/neon/sixtappredict16x16_neon.asm +++ /dev/null @@ -1,490 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict16x16_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter16_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -;Note: To take advantage of 8-bit mulplication instruction in NEON. First apply abs() to -; filter coeffs to make them u8. Then, use vmlsl for negtive coeffs. After multiplication, -; the result can be negtive. So, I treat the result as s16. But, since it is also possible -; that the result can be a large positive number (> 2^15-1), which could be confused as a -; negtive number. To avoid that error, apply filter coeffs in the order of 0, 1, 4 ,5 ,2, -; which ensures that the result stays in s16 range. Finally, saturated add the result by -; applying 3rd filter coeff. Same applys to other filter functions. 
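/*
 * A scalar C sketch of the tap ordering the note above describes.
 * abs() is applied to the coefficients up front so unsigned 8-bit
 * multiplies can be used, and the products for the negative taps
 * (1 and 4) are subtracted; per the note, applying the taps in the
 * order 0, 1, 4, 5, 2 keeps the running sum inside s16 range, and the
 * large tap 3 is added last with saturation (vmull + vqadd.s16).
 * Helper names are mine.
 */
#include <stdint.h>

static int16_t sat_add_s16(int16_t a, int16_t b) {
  int32_t s = (int32_t)a + b;
  return (int16_t)(s > 32767 ? 32767 : (s < -32768 ? -32768 : s));
}

/* f[] holds one row of the filter table, e.g. {2, -11, 108, 36, -8, 1};
 * s points at the center pixel, step is 1 (horizontal) or the row
 * stride (vertical). */
uint8_t sixtap_pixel(const uint8_t *s, int step, const int16_t f[6]) {
  int16_t sum = (int16_t)(s[-2 * step] * f[0]);            /* tap 0 */
  sum = (int16_t)(sum - s[-1 * step] * -f[1]);             /* tap 1 (<= 0) */
  sum = (int16_t)(sum - s[ 2 * step] * -f[4]);             /* tap 4 (<= 0) */
  sum = (int16_t)(sum + s[ 3 * step] * f[5]);              /* tap 5 */
  sum = (int16_t)(sum + s[ 0 * step] * f[2]);              /* tap 2 */
  sum = sat_add_s16(sum, (int16_t)(s[1 * step] * f[3]));   /* tap 3 last */
  int32_t out = (sum + 64) >> 7;               /* vqrshrun.s16 #7 */
  return (uint8_t)(out < 0 ? 0 : (out > 255 ? 255 : out));
}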
- -|vp8_sixtap_predict16x16_neon| PROC - push {r4-r5, lr} - - adr r12, filter16_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter16x16_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter16x16_only - - sub sp, sp, #336 ;reserve space on stack for temporary storage - mov lr, sp - - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #7 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First Pass: output_height lines x output_width columns (21x16) -filt_blk2d_fp16x16_loop_neon - vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data - vld1.u8 {d9, d10, d11}, [r0], r1 - vld1.u8 {d12, d13, d14}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q9, d7, d0 - vmull.u8 q10, d9, d0 - vmull.u8 q11, d10, d0 - vmull.u8 q12, d12, d0 - vmull.u8 q13, d13, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d9, d10, #1 - vext.8 d30, d12, d13, #1 - - vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q12, d30, d1 - - vext.8 d28, d7, d8, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d13, d14, #1 - - vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q11, d29, d1 - vmlsl.u8 q13, d30, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d9, d10, #4 - vext.8 d30, d12, d13, #4 - - vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q12, d30, d4 - - vext.8 d28, d7, d8, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d13, d14, #4 - - vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q11, d29, d4 - vmlsl.u8 q13, d30, d4 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d9, d10, #5 - vext.8 d30, d12, d13, #5 - - vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q10, d29, d5 - vmlal.u8 q12, d30, d5 - - vext.8 d28, d7, d8, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d13, d14, #5 - - vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q11, d29, d5 - vmlal.u8 q13, d30, d5 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d9, d10, #2 - vext.8 d30, d12, d13, #2 - - vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q10, d29, d2 - vmlal.u8 q12, d30, d2 - - vext.8 d28, d7, d8, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d13, d14, #2 - - vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q11, d29, d2 - vmlal.u8 q13, d30, d2 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d9, d10, #3 - vext.8 d30, d12, d13, #3 - - vext.8 d15, d7, d8, #3 - vext.8 d31, d10, d11, #3 - vext.8 d6, d13, d14, #3 - - vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - - vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters) - vqadd.s16 q10, q5 - vqadd.s16 q12, q6 - - vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q7, d31, d3 - vmull.u8 q3, d6, d3 - - subs r2, r2, #1 - - vqadd.s16 q9, q6 - vqadd.s16 q11, q7 - vqadd.s16 q13, q3 - - vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q9, #7 - vqrshrun.s16 d8, q10, #7 - vqrshrun.s16 d9, q11, 
#7 - vqrshrun.s16 d10, q12, #7 - vqrshrun.s16 d11, q13, #7 - - vst1.u8 {d6, d7, d8}, [lr]! ;store result - vst1.u8 {d9, d10, d11}, [lr]! - - bne filt_blk2d_fp16x16_loop_neon - -;Second pass: 16x16 -;secondpass_filter - do first 8-columns and then second 8-columns - add r3, r12, r3, lsl #5 - sub lr, lr, #336 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - mov r3, #2 ;loop counter - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - mov r2, #16 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_sp16x16_outloop_neon - vld1.u8 {d18}, [lr], r2 ;load src data - vld1.u8 {d19}, [lr], r2 - vld1.u8 {d20}, [lr], r2 - vld1.u8 {d21}, [lr], r2 - mov r12, #4 ;loop counter - vld1.u8 {d22}, [lr], r2 - -secondpass_inner_loop_neon - vld1.u8 {d23}, [lr], r2 ;load src data - vld1.u8 {d24}, [lr], r2 - vld1.u8 {d25}, [lr], r2 - vld1.u8 {d26}, [lr], r2 - - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r12, r12, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vmov q9, q11 - vst1.u8 {d7}, [r4], r5 - vmov q10, q12 - vst1.u8 {d8}, [r4], r5 - vmov d22, d26 - vst1.u8 {d9}, [r4], r5 - - bne secondpass_inner_loop_neon - - subs r3, r3, #1 - sub lr, lr, #336 - add lr, lr, #8 - - sub r4, r4, r5, lsl #4 - add r4, r4, #8 - - bne filt_blk2d_sp16x16_outloop_neon - - add sp, sp, #336 - pop {r4-r5,pc} - -;-------------------- -firstpass_filter16x16_only - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #8 ;loop counter - sub r0, r0, #2 ;move srcptr back to (column-2) - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First Pass: output_height lines x output_width columns (16x16) -filt_blk2d_fpo16x16_loop_neon - vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data - vld1.u8 {d9, d10, d11}, [r0], r1 - - pld [r0] - pld [r0, r1] - - vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q7, d7, d0 - vmull.u8 q8, d9, d0 - vmull.u8 q9, d10, d0 - - vext.8 d20, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d21, d9, d10, #1 - vext.8 d22, d7, d8, #1 - vext.8 d23, d10, d11, #1 - vext.8 d24, d6, d7, #4 ;construct src_ptr[2] - vext.8 d25, d9, d10, #4 - vext.8 d26, d7, d8, #4 - vext.8 d27, d10, d11, #4 - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d9, d10, #5 - - vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d21, d1 - vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q9, d23, d1 - vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp9_filter[4]) - 
vmlsl.u8 q8, d25, d4 - vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q9, d27, d4 - vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - - vext.8 d20, d7, d8, #5 - vext.8 d21, d10, d11, #5 - vext.8 d22, d6, d7, #2 ;construct src_ptr[0] - vext.8 d23, d9, d10, #2 - vext.8 d24, d7, d8, #2 - vext.8 d25, d10, d11, #2 - - vext.8 d26, d6, d7, #3 ;construct src_ptr[1] - vext.8 d27, d9, d10, #3 - vext.8 d28, d7, d8, #3 - vext.8 d29, d10, d11, #3 - - vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q9, d21, d5 - vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d23, d2 - vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q9, d25, d2 - - vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q11, d27, d3 - vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q15, d29, d3 - - vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q11 - vqadd.s16 q7, q12 - vqadd.s16 q9, q15 - - subs r2, r2, #1 - - vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q7, #7 - vqrshrun.s16 d8, q8, #7 - vqrshrun.s16 d9, q9, #7 - - vst1.u8 {q3}, [r4], r5 ;store result - vst1.u8 {q4}, [r4], r5 - - bne filt_blk2d_fpo16x16_loop_neon - - pop {r4-r5,pc} - -;-------------------- -secondpass_filter16x16_only -;Second pass: 16x16 - add r3, r12, r3, lsl #5 - sub r0, r0, r1, lsl #1 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - mov r3, #2 ;loop counter - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_spo16x16_outloop_neon - vld1.u8 {d18}, [r0], r1 ;load src data - vld1.u8 {d19}, [r0], r1 - vld1.u8 {d20}, [r0], r1 - vld1.u8 {d21}, [r0], r1 - mov r12, #4 ;loop counter - vld1.u8 {d22}, [r0], r1 - -secondpass_only_inner_loop_neon - vld1.u8 {d23}, [r0], r1 ;load src data - vld1.u8 {d24}, [r0], r1 - vld1.u8 {d25}, [r0], r1 - vld1.u8 {d26}, [r0], r1 - - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r12, r12, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vmov q9, q11 - vst1.u8 {d7}, [r4], r5 - vmov q10, q12 - vst1.u8 {d8}, [r4], r5 - vmov d22, d26 - vst1.u8 {d9}, [r4], r5 - - bne secondpass_only_inner_loop_neon - - subs r3, r3, #1 - sub r0, r0, r1, lsl #4 - sub r0, r0, r1, lsl #2 - sub r0, r0, r1 - add r0, r0, #8 - - sub r4, r4, r5, lsl #4 - add r4, r4, #8 - - bne filt_blk2d_spo16x16_outloop_neon - - pop {r4-r5,pc} - - ENDP - -;----------------- - END diff --git 
a/vp9/common/arm/neon/sixtappredict4x4_neon.asm b/vp9/common/arm/neon/sixtappredict4x4_neon.asm deleted file mode 100644 index 5966b642f..000000000 --- a/vp9/common/arm/neon/sixtappredict4x4_neon.asm +++ /dev/null @@ -1,422 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter4_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(lr) int dst_pitch - -|vp8_sixtap_predict_neon| PROC - push {r4, lr} - - adr r12, filter4_coeff - ldr r4, [sp, #8] ;load parameters from stack - ldr lr, [sp, #12] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter4x4_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter4x4_only - - vabs.s32 q12, q14 ;get abs(filer_parameters) - vabs.s32 q13, q15 - - sub r0, r0, #2 ;go back 2 columns of src data - sub r0, r0, r1, lsl #1 ;go back 2 lines of src data - -;First pass: output_height lines x output_width columns (9x4) - vld1.u8 {q3}, [r0], r1 ;load first 4-line src data - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q8, d20, d5 - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0]) - vmlal.u8 q8, d10, d0 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d20, d1 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d10, d4 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 
q8, d20, d2 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q10, d10, d3 - - vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data - vld1.u8 {q4}, [r0], r1 - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - - vld1.u8 {q5}, [r0], r1 - vld1.u8 {q6}, [r0], r1 - - vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d28, q8, #7 - - ;First Pass on rest 5-line data - vld1.u8 {q11}, [r0], r1 - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vext.8 d31, d22, d23, #5 ;construct src_ptr[3] - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q8, d20, d5 - vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp9_filter[5]) - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0]) - vmlal.u8 q8, d10, d0 - vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp9_filter[0]) - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vext.8 d31, d22, d23, #1 ;construct src_ptr[-1] - - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d20, d1 - vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp9_filter[1]) - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vext.8 d31, d22, d23, #4 ;construct src_ptr[2] - - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d10, d4 - vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp9_filter[4]) - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vext.8 d31, d22, d23, #2 ;construct src_ptr[0] - - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d20, d2 - vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp9_filter[2]) - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vext.8 d31, d22, d23, #3 ;construct src_ptr[1] - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q10, d10, d3 - vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp9_filter[3]) - - add r3, r12, r3, lsl #5 - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - vqadd.s16 q12, q11 - - vext.8 d23, d27, d28, #4 - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - - vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d30, q8, #7 - vqrshrun.s16 d31, q12, #7 - -;Second pass: 4x4 - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vext.8 d24, d28, d29, #4 - vext.8 d25, d29, d30, #4 - vext.8 d26, d30, d31, #4 - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - - vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d28, d0 - - vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q6, d26, d5 - - vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d30, d4 - - vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q6, 
d24, d1 - - vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d29, d2 - - vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3]) - vmlal.u8 q6, d25, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q6, q4 - - vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d4, q6, #7 - - vst1.32 {d3[0]}, [r4] ;store result - vst1.32 {d3[1]}, [r0] - vst1.32 {d4[0]}, [r1] - vst1.32 {d4[1]}, [r2] - - pop {r4, pc} - - -;--------------------- -firstpass_filter4x4_only - vabs.s32 q12, q14 ;get abs(filer_parameters) - vabs.s32 q13, q15 - - sub r0, r0, #2 ;go back 2 columns of src data - -;First pass: output_height lines x output_width columns (4x4) - vld1.u8 {q3}, [r0], r1 ;load first 4-line src data - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - - vext.8 d18, d6, d7, #5 ;construct src_ptr[3] - vext.8 d19, d8, d9, #5 - vext.8 d20, d10, d11, #5 - vext.8 d21, d12, d13, #5 - - vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done - vswp d11, d12 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) - vzip.32 d20, d21 - vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q8, d20, d5 - - vmov q4, q3 ;keep original src data in q4 q6 - vmov q6, q5 - - vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together - vzip.32 d10, d11 - vshr.u64 q9, q4, #8 ;construct src_ptr[-1] - vshr.u64 q10, q6, #8 - vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0]) - vmlal.u8 q8, d10, d0 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #32 ;construct src_ptr[2] - vshr.u64 q5, q6, #32 - vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d20, d1 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) - vzip.32 d10, d11 - vshr.u64 q9, q4, #16 ;construct src_ptr[0] - vshr.u64 q10, q6, #16 - vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d10, d4 - - vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) - vzip.32 d20, d21 - vshr.u64 q3, q4, #24 ;construct src_ptr[1] - vshr.u64 q5, q6, #24 - vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d20, d2 - - vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) - vzip.32 d10, d11 - vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q10, d10, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q10 - - vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d28, q8, #7 - - vst1.32 {d27[0]}, [r4] ;store result - vst1.32 {d27[1]}, [r0] - vst1.32 {d28[0]}, [r1] - vst1.32 {d28[1]}, [r2] - - pop {r4, pc} - - -;--------------------- -secondpass_filter4x4_only - sub r0, r0, r1, lsl #1 - add r3, r12, r3, lsl #5 - - vld1.32 {d27[0]}, [r0], r1 ;load src data - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.32 {d27[1]}, [r0], r1 - vabs.s32 q7, q5 - vld1.32 {d28[0]}, [r0], r1 - vabs.s32 q8, q6 - vld1.32 {d28[1]}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.32 {d29[0]}, [r0], r1 - vdup.8 d1, d14[4] - vld1.32 {d29[1]}, [r0], r1 - vdup.8 d2, d15[0] - vld1.32 {d30[0]}, [r0], r1 - vdup.8 d3, d15[4] - vld1.32 {d30[1]}, [r0], r1 - vdup.8 d4, d16[0] - vld1.32 {d31[0]}, [r0], r1 - vdup.8 d5, d16[4] - - vext.8 d23, d27, d28, #4 - vext.8 
d24, d28, d29, #4 - vext.8 d25, d29, d30, #4 - vext.8 d26, d30, d31, #4 - - vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d28, d0 - - vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5]) - vmull.u8 q6, d26, d5 - - vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d30, d4 - - vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q6, d24, d1 - - vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d29, d2 - - vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3]) - vmlal.u8 q6, d25, d3 - - add r0, r4, lr - add r1, r0, lr - add r2, r1, lr - - vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q6, q4 - - vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d4, q6, #7 - - vst1.32 {d3[0]}, [r4] ;store result - vst1.32 {d3[1]}, [r0] - vst1.32 {d4[0]}, [r1] - vst1.32 {d4[1]}, [r2] - - pop {r4, pc} - - ENDP - -;----------------- - - END diff --git a/vp9/common/arm/neon/sixtappredict8x4_neon.asm b/vp9/common/arm/neon/sixtappredict8x4_neon.asm deleted file mode 100644 index 9ce1e3bbd..000000000 --- a/vp9/common/arm/neon/sixtappredict8x4_neon.asm +++ /dev/null @@ -1,473 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict8x4_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter8_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; r4 unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_sixtap_predict8x4_neon| PROC - push {r4-r5, lr} - - adr r12, filter8_coeff - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter8x4_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter8x4_only - - sub sp, sp, #32 ;reserve space on stack for temporary storage - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - mov lr, sp - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - -;First pass: output_height lines x output_width columns (9x8) - vld1.u8 {q3}, [r0], r1 ;load src data - vdup.8 d3, d25[4] - vld1.u8 {q4}, [r0], r1 - vdup.8 d4, d26[0] - vld1.u8 {q5}, [r0], r1 - vdup.8 d5, d26[4] - vld1.u8 {q6}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, 
d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vld1.u8 {q3}, [r0], r1 ;load src data - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vld1.u8 {q4}, [r0], r1 - vst1.u8 {d22}, [lr]! ;store result - vld1.u8 {q5}, [r0], r1 - vst1.u8 {d23}, [lr]! - vld1.u8 {q6}, [r0], r1 - vst1.u8 {d24}, [lr]! - vld1.u8 {q7}, [r0], r1 - vst1.u8 {d25}, [lr]! - - ;first_pass filtering on the rest 5-line data - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - vmull.u8 q11, d12, d0 - vmull.u8 q12, d14, d0 - - vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d28, d8, d9, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d12, d13, #1 - vext.8 d31, d14, d15, #1 - - vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q9, d28, d1 - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q11, d30, d1 - vmlsl.u8 q12, d31, d1 - - vext.8 d27, d6, d7, #4 ;construct src_ptr[2] - vext.8 d28, d8, d9, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d12, d13, #4 - vext.8 d31, d14, d15, #4 - - vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q9, d28, d4 - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q11, d30, d4 - vmlsl.u8 q12, d31, d4 - - vext.8 d27, d6, d7, #2 ;construct src_ptr[0] - vext.8 d28, d8, d9, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d12, d13, #2 - vext.8 d31, d14, d15, #2 - - vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q9, d28, d2 - vmlal.u8 q10, d29, d2 - vmlal.u8 q11, d30, d2 - vmlal.u8 q12, d31, d2 - - vext.8 d27, d6, d7, #5 ;construct src_ptr[3] - vext.8 d28, d8, d9, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d12, d13, #5 - vext.8 d31, d14, d15, #5 - - vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q9, d28, d5 - vmlal.u8 q10, d29, d5 - vmlal.u8 q11, d30, d5 - vmlal.u8 q12, d31, d5 - - vext.8 d27, d6, d7, #3 ;construct src_ptr[1] - vext.8 d28, d8, d9, #3 - vext.8 d29, d10, d11, #3 - vext.8 d30, d12, d13, #3 - vext.8 d31, d14, d15, #3 - - vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d28, d3 - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - vmull.u8 q7, d31, d3 - - vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q9, q4 - vqadd.s16 q10, q5 - vqadd.s16 q11, q6 - vqadd.s16 q12, q7 - - vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d27, q9, #7 - vqrshrun.s16 d28, q10, #7 - vqrshrun.s16 d29, q11, #7 ;load intermediate 
data from stack - vqrshrun.s16 d30, q12, #7 - -;Second pass: 8x4 -;secondpass_filter - add r3, r12, r3, lsl #5 - sub lr, lr, #32 - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.u8 {q11}, [lr]! - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vld1.u8 {q12}, [lr]! - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - - vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d23, d0 - vmull.u8 q5, d24, d0 - vmull.u8 q6, d25, d0 - - vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d24, d1 - vmlsl.u8 q5, d25, d1 - vmlsl.u8 q6, d26, d1 - - vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d27, d4 - vmlsl.u8 q5, d28, d4 - vmlsl.u8 q6, d29, d4 - - vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d25, d2 - vmlal.u8 q5, d26, d2 - vmlal.u8 q6, d27, d2 - - vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d28, d5 - vmlal.u8 q5, d29, d5 - vmlal.u8 q6, d30, d5 - - vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d26, d3 - vmull.u8 q9, d27, d3 - vmull.u8 q10, d28, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vst1.u8 {d7}, [r4], r5 - vst1.u8 {d8}, [r4], r5 - vst1.u8 {d9}, [r4], r5 - - add sp, sp, #32 - pop {r4-r5,pc} - -;-------------------- -firstpass_filter8x4_only - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - vld1.u8 {q3}, [r0], r1 ;load src data - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vld1.u8 {q4}, [r0], r1 - vdup.8 d1, d24[4] - vld1.u8 {q5}, [r0], r1 - vdup.8 d2, d25[0] - vld1.u8 {q6}, [r0], r1 - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First pass: output_height lines x output_width columns (4x8) - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - vqadd.s16 q7, q3 ;sum of all 
(src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [r4], r5 ;store result - vst1.u8 {d23}, [r4], r5 - vst1.u8 {d24}, [r4], r5 - vst1.u8 {d25}, [r4], r5 - - pop {r4-r5,pc} - -;--------------------- -secondpass_filter8x4_only -;Second pass: 8x4 - add r3, r12, r3, lsl #5 - sub r0, r0, r1, lsl #1 - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vld1.u8 {d22}, [r0], r1 - vld1.u8 {d23}, [r0], r1 - vld1.u8 {d24}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.u8 {d25}, [r0], r1 - vdup.8 d1, d14[4] - vld1.u8 {d26}, [r0], r1 - vdup.8 d2, d15[0] - vld1.u8 {d27}, [r0], r1 - vdup.8 d3, d15[4] - vld1.u8 {d28}, [r0], r1 - vdup.8 d4, d16[0] - vld1.u8 {d29}, [r0], r1 - vdup.8 d5, d16[4] - vld1.u8 {d30}, [r0], r1 - - vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d23, d0 - vmull.u8 q5, d24, d0 - vmull.u8 q6, d25, d0 - - vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d24, d1 - vmlsl.u8 q5, d25, d1 - vmlsl.u8 q6, d26, d1 - - vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d27, d4 - vmlsl.u8 q5, d28, d4 - vmlsl.u8 q6, d29, d4 - - vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d25, d2 - vmlal.u8 q5, d26, d2 - vmlal.u8 q6, d27, d2 - - vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d28, d5 - vmlal.u8 q5, d29, d5 - vmlal.u8 q6, d30, d5 - - vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d26, d3 - vmull.u8 q9, d27, d3 - vmull.u8 q10, d28, d3 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vst1.u8 {d6}, [r4], r5 ;store result - vst1.u8 {d7}, [r4], r5 - vst1.u8 {d8}, [r4], r5 - vst1.u8 {d9}, [r4], r5 - - pop {r4-r5,pc} - - ENDP - -;----------------- - - END diff --git a/vp9/common/arm/neon/sixtappredict8x8_neon.asm b/vp9/common/arm/neon/sixtappredict8x8_neon.asm deleted file mode 100644 index 5ff16616d..000000000 --- a/vp9/common/arm/neon/sixtappredict8x8_neon.asm +++ /dev/null @@ -1,524 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - EXPORT |vp8_sixtap_predict8x8_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -filter8_coeff - DCD 0, 0, 128, 0, 0, 0, 0, 0 - DCD 0, -6, 123, 12, -1, 0, 0, 0 - DCD 2, -11, 108, 36, -8, 1, 0, 0 - DCD 0, -9, 93, 50, -6, 0, 0, 0 - DCD 3, -16, 77, 77, -16, 3, 0, 0 - DCD 0, -6, 50, 93, -9, 0, 0, 0 - DCD 1, -8, 36, 108, -11, 2, 0, 0 - DCD 0, -1, 12, 123, -6, 0, 0, 0 - -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack(r4) unsigned char *dst_ptr, -; stack(r5) int dst_pitch - -|vp8_sixtap_predict8x8_neon| PROC - push {r4-r5, lr} - - adr r12, filter8_coeff - - ldr r4, [sp, #12] ;load parameters from stack - ldr r5, [sp, #16] ;load parameters from stack - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - beq secondpass_filter8x8_only - - add r2, r12, r2, lsl #5 ;calculate filter location - - cmp r3, #0 ;skip second_pass filter if yoffset=0 - - vld1.s32 {q14, q15}, [r2] ;load first_pass filter - - beq firstpass_filter8x8_only - - sub sp, sp, #64 ;reserve space on stack for temporary storage - mov lr, sp - - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #2 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - sub r0, r0, r1, lsl #1 - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - -;First pass: output_height lines x output_width columns (13x8) - vld1.u8 {q3}, [r0], r1 ;load src data - vdup.8 d3, d25[4] - vld1.u8 {q4}, [r0], r1 - vdup.8 d4, d26[0] - vld1.u8 {q5}, [r0], r1 - vdup.8 d5, d26[4] - vld1.u8 {q6}, [r0], r1 - -filt_blk2d_fp8x8_loop_neon - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - - subs r2, r2, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vld1.u8 {q3}, [r0], r1 ;load src data - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [lr]! ;store result - vld1.u8 {q4}, [r0], r1 - vst1.u8 {d23}, [lr]! - vld1.u8 {q5}, [r0], r1 - vst1.u8 {d24}, [lr]! 
- vld1.u8 {q6}, [r0], r1 - vst1.u8 {d25}, [lr]! - - bne filt_blk2d_fp8x8_loop_neon - - ;first_pass filtering on the rest 5-line data - ;vld1.u8 {q3}, [r0], r1 ;load src data - ;vld1.u8 {q4}, [r0], r1 - ;vld1.u8 {q5}, [r0], r1 - ;vld1.u8 {q6}, [r0], r1 - vld1.u8 {q7}, [r0], r1 - - vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q9, d8, d0 - vmull.u8 q10, d10, d0 - vmull.u8 q11, d12, d0 - vmull.u8 q12, d14, d0 - - vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d28, d8, d9, #1 - vext.8 d29, d10, d11, #1 - vext.8 d30, d12, d13, #1 - vext.8 d31, d14, d15, #1 - - vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q9, d28, d1 - vmlsl.u8 q10, d29, d1 - vmlsl.u8 q11, d30, d1 - vmlsl.u8 q12, d31, d1 - - vext.8 d27, d6, d7, #4 ;construct src_ptr[2] - vext.8 d28, d8, d9, #4 - vext.8 d29, d10, d11, #4 - vext.8 d30, d12, d13, #4 - vext.8 d31, d14, d15, #4 - - vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q9, d28, d4 - vmlsl.u8 q10, d29, d4 - vmlsl.u8 q11, d30, d4 - vmlsl.u8 q12, d31, d4 - - vext.8 d27, d6, d7, #2 ;construct src_ptr[0] - vext.8 d28, d8, d9, #2 - vext.8 d29, d10, d11, #2 - vext.8 d30, d12, d13, #2 - vext.8 d31, d14, d15, #2 - - vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q9, d28, d2 - vmlal.u8 q10, d29, d2 - vmlal.u8 q11, d30, d2 - vmlal.u8 q12, d31, d2 - - vext.8 d27, d6, d7, #5 ;construct src_ptr[3] - vext.8 d28, d8, d9, #5 - vext.8 d29, d10, d11, #5 - vext.8 d30, d12, d13, #5 - vext.8 d31, d14, d15, #5 - - vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q9, d28, d5 - vmlal.u8 q10, d29, d5 - vmlal.u8 q11, d30, d5 - vmlal.u8 q12, d31, d5 - - vext.8 d27, d6, d7, #3 ;construct src_ptr[1] - vext.8 d28, d8, d9, #3 - vext.8 d29, d10, d11, #3 - vext.8 d30, d12, d13, #3 - vext.8 d31, d14, d15, #3 - - vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q4, d28, d3 - vmull.u8 q5, d29, d3 - vmull.u8 q6, d30, d3 - vmull.u8 q7, d31, d3 - - vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q9, q4 - vqadd.s16 q10, q5 - vqadd.s16 q11, q6 - vqadd.s16 q12, q7 - - add r3, r12, r3, lsl #5 - - vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 - sub lr, lr, #64 - vqrshrun.s16 d27, q9, #7 - vld1.u8 {q9}, [lr]! ;load intermediate data from stack - vqrshrun.s16 d28, q10, #7 - vld1.u8 {q10}, [lr]! - - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - - vqrshrun.s16 d29, q11, #7 - vld1.u8 {q11}, [lr]! - - vabs.s32 q7, q5 - vabs.s32 q8, q6 - - vqrshrun.s16 d30, q12, #7 - vld1.u8 {q12}, [lr]! 
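/*
 * A scalar C sketch of the two-pass structure used here for the 8x8
 * case. Pass 1 filters 13 rows horizontally (8 output rows plus 2 above
 * and 3 below, since the vertical 6-tap reads src_ptr[-2]..src_ptr[3]);
 * pass 2 filters those columns vertically. The assembly stores only 8
 * intermediate rows on the stack (the 64-byte reservation) and keeps the
 * last 5 in registers; this sketch buffers all 13 for clarity, and it
 * omits the firstpass/secondpass-only branches taken when an offset is
 * zero. It assumes the sixtap_pixel() helper sketched earlier;
 * filters[] stands in for the filter8_coeff table, indexed by the
 * sub-pel offset.
 */
#include <stdint.h>

uint8_t sixtap_pixel(const uint8_t *s, int step, const int16_t f[6]);

static void sixtap_predict8x8_model(const uint8_t *src, int src_stride,
                                    int xoff, int yoff,
                                    uint8_t *dst, int dst_stride,
                                    const int16_t filters[8][6]) {
  uint8_t tmp[13 * 8];                        /* 8 + 5 extra rows */
  for (int r = 0; r < 13; ++r)                /* pass 1: horizontal */
    for (int c = 0; c < 8; ++c)
      tmp[r * 8 + c] =
          sixtap_pixel(src + (r - 2) * src_stride + c, 1, filters[xoff]);
  for (int r = 0; r < 8; ++r)                 /* pass 2: vertical */
    for (int c = 0; c < 8; ++c)
      dst[r * dst_stride + c] =
          sixtap_pixel(&tmp[(r + 2) * 8 + c], 8, filters[yoff]);
}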
- -;Second pass: 8x8 - mov r3, #2 ;loop counter - - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vdup.8 d1, d14[4] - vdup.8 d2, d15[0] - vdup.8 d3, d15[4] - vdup.8 d4, d16[0] - vdup.8 d5, d16[4] - -filt_blk2d_sp8x8_loop_neon - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r3, r3, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vmov q9, q11 - vst1.u8 {d6}, [r4], r5 ;store result - vmov q10, q12 - vst1.u8 {d7}, [r4], r5 - vmov q11, q13 - vst1.u8 {d8}, [r4], r5 - vmov q12, q14 - vst1.u8 {d9}, [r4], r5 - vmov d26, d30 - - bne filt_blk2d_sp8x8_loop_neon - - add sp, sp, #64 - pop {r4-r5,pc} - -;--------------------- -firstpass_filter8x8_only - ;add r2, r12, r2, lsl #5 ;calculate filter location - ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter - vabs.s32 q12, q14 - vabs.s32 q13, q15 - - mov r2, #2 ;loop counter - sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) - - vdup.8 d0, d24[0] ;first_pass filter (d0-d5) - vdup.8 d1, d24[4] - vdup.8 d2, d25[0] - vdup.8 d3, d25[4] - vdup.8 d4, d26[0] - vdup.8 d5, d26[4] - -;First pass: output_height lines x output_width columns (8x8) -filt_blk2d_fpo8x8_loop_neon - vld1.u8 {q3}, [r0], r1 ;load src data - vld1.u8 {q4}, [r0], r1 - vld1.u8 {q5}, [r0], r1 - vld1.u8 {q6}, [r0], r1 - - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q8, d8, d0 - vmull.u8 q9, d10, d0 - vmull.u8 q10, d12, d0 - - vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] - vext.8 d29, d8, d9, #1 - vext.8 d30, d10, d11, #1 - vext.8 d31, d12, d13, #1 - - vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q8, d29, d1 - vmlsl.u8 q9, d30, d1 - vmlsl.u8 q10, d31, d1 - - vext.8 d28, d6, d7, #4 ;construct src_ptr[2] - vext.8 d29, d8, d9, #4 - vext.8 d30, d10, d11, #4 - vext.8 d31, d12, d13, #4 - - vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q8, d29, d4 - vmlsl.u8 q9, d30, d4 - vmlsl.u8 q10, d31, d4 - - vext.8 d28, d6, d7, #2 ;construct src_ptr[0] - vext.8 d29, d8, d9, #2 - vext.8 d30, d10, d11, #2 - vext.8 d31, d12, d13, #2 - - vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q8, d29, d2 - vmlal.u8 q9, d30, d2 - vmlal.u8 q10, d31, d2 - - vext.8 d28, d6, d7, #5 ;construct src_ptr[3] - vext.8 d29, d8, d9, #5 - vext.8 d30, d10, d11, #5 - vext.8 d31, d12, d13, #5 - - vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q8, d29, d5 - vmlal.u8 q9, d30, d5 - vmlal.u8 q10, d31, d5 - - vext.8 d28, d6, d7, #3 ;construct src_ptr[1] - vext.8 d29, d8, d9, #3 - vext.8 d30, d10, d11, #3 - vext.8 d31, d12, d13, #3 - - vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) - 
vmull.u8 q4, d29, d3 - vmull.u8 q5, d30, d3 - vmull.u8 q6, d31, d3 - ; - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - subs r2, r2, #1 - - vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d23, q8, #7 - vqrshrun.s16 d24, q9, #7 - vqrshrun.s16 d25, q10, #7 - - vst1.u8 {d22}, [r4], r5 ;store result - vst1.u8 {d23}, [r4], r5 - vst1.u8 {d24}, [r4], r5 - vst1.u8 {d25}, [r4], r5 - - bne filt_blk2d_fpo8x8_loop_neon - - pop {r4-r5,pc} - -;--------------------- -secondpass_filter8x8_only - sub r0, r0, r1, lsl #1 - add r3, r12, r3, lsl #5 - - vld1.u8 {d18}, [r0], r1 ;load src data - vld1.s32 {q5, q6}, [r3] ;load second_pass filter - vld1.u8 {d19}, [r0], r1 - vabs.s32 q7, q5 - vld1.u8 {d20}, [r0], r1 - vabs.s32 q8, q6 - vld1.u8 {d21}, [r0], r1 - mov r3, #2 ;loop counter - vld1.u8 {d22}, [r0], r1 - vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) - vld1.u8 {d23}, [r0], r1 - vdup.8 d1, d14[4] - vld1.u8 {d24}, [r0], r1 - vdup.8 d2, d15[0] - vld1.u8 {d25}, [r0], r1 - vdup.8 d3, d15[4] - vld1.u8 {d26}, [r0], r1 - vdup.8 d4, d16[0] - vld1.u8 {d27}, [r0], r1 - vdup.8 d5, d16[4] - vld1.u8 {d28}, [r0], r1 - vld1.u8 {d29}, [r0], r1 - vld1.u8 {d30}, [r0], r1 - -;Second pass: 8x8 -filt_blk2d_spo8x8_loop_neon - vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) - vmull.u8 q4, d19, d0 - vmull.u8 q5, d20, d0 - vmull.u8 q6, d21, d0 - - vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) - vmlsl.u8 q4, d20, d1 - vmlsl.u8 q5, d21, d1 - vmlsl.u8 q6, d22, d1 - - vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) - vmlsl.u8 q4, d23, d4 - vmlsl.u8 q5, d24, d4 - vmlsl.u8 q6, d25, d4 - - vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) - vmlal.u8 q4, d21, d2 - vmlal.u8 q5, d22, d2 - vmlal.u8 q6, d23, d2 - - vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) - vmlal.u8 q4, d24, d5 - vmlal.u8 q5, d25, d5 - vmlal.u8 q6, d26, d5 - - vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) - vmull.u8 q8, d22, d3 - vmull.u8 q9, d23, d3 - vmull.u8 q10, d24, d3 - - subs r3, r3, #1 - - vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) - vqadd.s16 q8, q4 - vqadd.s16 q9, q5 - vqadd.s16 q10, q6 - - vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 - vqrshrun.s16 d7, q8, #7 - vqrshrun.s16 d8, q9, #7 - vqrshrun.s16 d9, q10, #7 - - vmov q9, q11 - vst1.u8 {d6}, [r4], r5 ;store result - vmov q10, q12 - vst1.u8 {d7}, [r4], r5 - vmov q11, q13 - vst1.u8 {d8}, [r4], r5 - vmov q12, q14 - vst1.u8 {d9}, [r4], r5 - vmov d26, d30 - - bne filt_blk2d_spo8x8_loop_neon - - pop {r4-r5,pc} - - ENDP - -;----------------- - - END diff --git a/vp9/common/arm/neon/vp9_bilinearpredict16x16_neon.asm b/vp9/common/arm/neon/vp9_bilinearpredict16x16_neon.asm new file mode 100644 index 000000000..2528be7c3 --- /dev/null +++ b/vp9/common/arm/neon/vp9_bilinearpredict16x16_neon.asm @@ -0,0 +1,357 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_bilinear_predict16x16_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_bilinear_predict16x16_neon| PROC + push {r4-r5, lr} + + adr r12, bifilter16_coeff + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_bfilter16x16_only + + add r2, r12, r2, lsl #3 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {d31}, [r2] ;load first_pass filter + + beq firstpass_bfilter16x16_only + + sub sp, sp, #272 ;reserve space on stack for temporary storage + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + mov lr, sp + vld1.u8 {d5, d6, d7}, [r0], r1 + + mov r2, #3 ;loop counter + vld1.u8 {d8, d9, d10}, [r0], r1 + + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {d11, d12, d13}, [r0], r1 + + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (17x16) +filt_blk2d_fp16x16_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vqrshrn.u16 d21, q14, #7 + vld1.u8 {d5, d6, d7}, [r0], r1 + + vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result + vld1.u8 {d8, d9, d10}, [r0], r1 + vst1.u8 {d18, d19, d20, d21}, [lr]! 
+ vld1.u8 {d11, d12, d13}, [r0], r1 + + bne filt_blk2d_fp16x16_loop_neon + +;First-pass filtering for rest 5 lines + vld1.u8 {d14, d15, d16}, [r0], r1 + + vmull.u8 q9, d2, d0 ;(src_ptr[0] * vp9_filter[0]) + vmull.u8 q10, d3, d0 + vmull.u8 q11, d5, d0 + vmull.u8 q12, d6, d0 + vmull.u8 q13, d8, d0 + vmull.u8 q14, d9, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + + vmlal.u8 q9, d2, d1 ;(src_ptr[0] * vp9_filter[1]) + vmlal.u8 q11, d5, d1 + vmlal.u8 q13, d8, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + + vmlal.u8 q10, d3, d1 ;(src_ptr[0] * vp9_filter[1]) + vmlal.u8 q12, d6, d1 + vmlal.u8 q14, d9, d1 + + vmull.u8 q1, d11, d0 + vmull.u8 q2, d12, d0 + vmull.u8 q3, d14, d0 + vmull.u8 q4, d15, d0 + + vext.8 d11, d11, d12, #1 ;construct src_ptr[1] + vext.8 d14, d14, d15, #1 + + vmlal.u8 q1, d11, d1 ;(src_ptr[0] * vp9_filter[1]) + vmlal.u8 q3, d14, d1 + + vext.8 d12, d12, d13, #1 + vext.8 d15, d15, d16, #1 + + vmlal.u8 q2, d12, d1 ;(src_ptr[0] * vp9_filter[1]) + vmlal.u8 q4, d15, d1 + + vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d11, q10, #7 + vqrshrn.u16 d12, q11, #7 + vqrshrn.u16 d13, q12, #7 + vqrshrn.u16 d14, q13, #7 + vqrshrn.u16 d15, q14, #7 + vqrshrn.u16 d16, q1, #7 + vqrshrn.u16 d17, q2, #7 + vqrshrn.u16 d18, q3, #7 + vqrshrn.u16 d19, q4, #7 + + vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result + vst1.u8 {d14, d15, d16, d17}, [lr]! + vst1.u8 {d18, d19}, [lr]! + +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + sub lr, lr, #272 + + vld1.u32 {d31}, [r3] ;load second_pass filter + + vld1.u8 {d22, d23}, [lr]! ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + mov r12, #4 ;loop counter + +filt_blk2d_sp16x16_loop_neon + vld1.u8 {d24, d25}, [lr]! + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) + vld1.u8 {d26, d27}, [lr]! + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [lr]! + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [lr]! 
+ + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + subs r12, r12, #1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2, d3}, [r4], r5 ;store result + vst1.u8 {d4, d5}, [r4], r5 + vst1.u8 {d6, d7}, [r4], r5 + vmov q11, q15 + vst1.u8 {d8, d9}, [r4], r5 + + bne filt_blk2d_sp16x16_loop_neon + + add sp, sp, #272 + + pop {r4-r5,pc} + +;-------------------- +firstpass_bfilter16x16_only + mov r2, #4 ;loop counter + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vdup.8 d1, d31[4] + +;First Pass: output_height lines x output_width columns (16x16) +filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data + vld1.u8 {d5, d6, d7}, [r0], r1 + vld1.u8 {d8, d9, d10}, [r0], r1 + vld1.u8 {d11, d12, d13}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0]) + vmull.u8 q8, d3, d0 + vmull.u8 q9, d5, d0 + vmull.u8 q10, d6, d0 + vmull.u8 q11, d8, d0 + vmull.u8 q12, d9, d0 + vmull.u8 q13, d11, d0 + vmull.u8 q14, d12, d0 + + vext.8 d2, d2, d3, #1 ;construct src_ptr[1] + vext.8 d5, d5, d6, #1 + vext.8 d8, d8, d9, #1 + vext.8 d11, d11, d12, #1 + + vmlal.u8 q7, d2, d1 ;(src_ptr[0] * vp9_filter[1]) + vmlal.u8 q9, d5, d1 + vmlal.u8 q11, d8, d1 + vmlal.u8 q13, d11, d1 + + vext.8 d3, d3, d4, #1 + vext.8 d6, d6, d7, #1 + vext.8 d9, d9, d10, #1 + vext.8 d12, d12, d13, #1 + + vmlal.u8 q8, d3, d1 ;(src_ptr[0] * vp9_filter[1]) + vmlal.u8 q10, d6, d1 + vmlal.u8 q12, d9, d1 + vmlal.u8 q14, d12, d1 + + subs r2, r2, #1 + + vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d15, q8, #7 + vqrshrn.u16 d16, q9, #7 + vqrshrn.u16 d17, q10, #7 + vqrshrn.u16 d18, q11, #7 + vqrshrn.u16 d19, q12, #7 + vqrshrn.u16 d20, q13, #7 + vst1.u8 {d14, d15}, [r4], r5 ;store result + vqrshrn.u16 d21, q14, #7 + + vst1.u8 {d16, d17}, [r4], r5 + vst1.u8 {d18, d19}, [r4], r5 + vst1.u8 {d20, d21}, [r4], r5 + + bne filt_blk2d_fpo16x16_loop_neon + pop {r4-r5,pc} + +;--------------------- +secondpass_bfilter16x16_only +;Second pass: 16x16 +;secondpass_filter + add r3, r12, r3, lsl #3 + mov r12, #4 ;loop counter + vld1.u32 {d31}, [r3] ;load second_pass filter + vld1.u8 {d22, d23}, [r0], r1 ;load src data + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + +filt_blk2d_spo16x16_loop_neon + vld1.u8 {d24, d25}, [r0], r1 + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) + vld1.u8 {d26, d27}, [r0], r1 + vmull.u8 q2, d23, d0 + vld1.u8 {d28, d29}, [r0], r1 + vmull.u8 q3, d24, d0 + vld1.u8 {d30, d31}, [r0], r1 + + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 + vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) + vmlal.u8 q2, d25, d1 + vmlal.u8 q3, d26, d1 + vmlal.u8 q4, d27, d1 + vmlal.u8 q5, d28, d1 + vmlal.u8 q6, d29, d1 + vmlal.u8 q7, d30, d1 + vmlal.u8 q8, d31, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 
d9, q8, #7 + + vst1.u8 {d2, d3}, [r4], r5 ;store result + subs r12, r12, #1 + vst1.u8 {d4, d5}, [r4], r5 + vmov q11, q15 + vst1.u8 {d6, d7}, [r4], r5 + vst1.u8 {d8, d9}, [r4], r5 + + bne filt_blk2d_spo16x16_loop_neon + pop {r4-r5,pc} + + ENDP + +;----------------- + +bifilter16_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp9/common/arm/neon/vp9_bilinearpredict4x4_neon.asm b/vp9/common/arm/neon/vp9_bilinearpredict4x4_neon.asm new file mode 100644 index 000000000..01eedf8e9 --- /dev/null +++ b/vp9/common/arm/neon/vp9_bilinearpredict4x4_neon.asm @@ -0,0 +1,130 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_bilinear_predict4x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict4x4_neon| PROC + push {r4, lr} + + adr r12, bifilter4_coeff + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (5x4) + vld1.u8 {d2}, [r0], r1 ;load src data + add r2, r12, r2, lsl #3 ;calculate Hfilter location (2coeffsx4bytes=8bytes) + + vld1.u8 {d3}, [r0], r1 + vld1.u32 {d31}, [r2] ;first_pass filter + + vld1.u8 {d4}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0-d1) + vld1.u8 {d5}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {d6}, [r0], r1 + + vshr.u64 q4, q1, #8 ;construct src_ptr[1] + vshr.u64 q5, q2, #8 + vshr.u64 d12, d6, #8 + + vzip.32 d2, d3 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d4, d5 + vzip.32 d8, d9 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + + vmull.u8 q7, d2, d0 ;(src_ptr[0] * vp9_filter[0]) + vmull.u8 q8, d4, d0 + vmull.u8 q9, d6, d0 + + vmlal.u8 q7, d8, d1 ;(src_ptr[1] * vp9_filter[1]) + vmlal.u8 q8, d10, d1 + vmlal.u8 q9, d12, d1 + + vqrshrn.u16 d28, q7, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d29, q8, #7 + vqrshrn.u16 d30, q9, #7 + +;Second pass: 4x4 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 ;calculate Vfilter location + vld1.u32 {d31}, [r3] ;load second_pass filter + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d31[4] + + vmull.u8 q1, d28, d0 + vmull.u8 q2, d29, d0 + + vext.8 d26, d28, d29, #4 ;construct src_ptr[pixel_step] + vext.8 d27, d29, d30, #4 + + vmlal.u8 q1, d26, d1 + vmlal.u8 q2, d27, d1 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + + vst1.32 {d2[0]}, [r4] ;store result + vst1.32 {d2[1]}, [r0] + vst1.32 {d3[0]}, [r1] + vst1.32 {d3[1]}, [r2] + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + + vld1.32 {d28[0]}, [r0], r1 ;load src data + vld1.32 {d28[1]}, [r0], r1 + vld1.32 {d29[0]}, [r0], r1 + vld1.32 {d29[1]}, [r0], r1 + vld1.32 {d30[0]}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.32 
{d28[0]}, [r4], lr ;store result + vst1.32 {d28[1]}, [r4], lr + vst1.32 {d29[0]}, [r4], lr + vst1.32 {d29[1]}, [r4], lr + + pop {r4, pc} + + ENDP + +;----------------- + +bifilter4_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp9/common/arm/neon/vp9_bilinearpredict8x4_neon.asm b/vp9/common/arm/neon/vp9_bilinearpredict8x4_neon.asm new file mode 100644 index 000000000..8f49345ff --- /dev/null +++ b/vp9/common/arm/neon/vp9_bilinearpredict8x4_neon.asm @@ -0,0 +1,135 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_bilinear_predict8x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict8x4_neon| PROC + push {r4, lr} + + adr r12, bifilter8x4_coeff + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (5x8) + add r2, r12, r2, lsl #3 ;calculate filter location + + vld1.u8 {q1}, [r0], r1 ;load src data + vld1.u32 {d31}, [r2] ;load first_pass filter + vld1.u8 {q2}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {q3}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {q4}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0]) + vld1.u8 {q5}, [r0], r1 + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + vmlal.u8 q10, d11, d1 + + vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d23, q7, #7 + vqrshrn.u16 d24, q8, #7 + vqrshrn.u16 d25, q9, #7 + vqrshrn.u16 d26, q10, #7 + +;Second pass: 4x8 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 + add r0, r4, lr + + vld1.u32 {d31}, [r3] ;load second_pass filter + add r1, r0, lr + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + + add r2, r1, lr + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + + vst1.u8 {d2}, [r4] ;store result + vst1.u8 {d3}, [r0] + vst1.u8 {d4}, [r1] + vst1.u8 {d5}, [r2] + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.u8 {d22}, 
[r4], lr ;store result + vst1.u8 {d23}, [r4], lr + vst1.u8 {d24}, [r4], lr + vst1.u8 {d25}, [r4], lr + + pop {r4, pc} + + ENDP + +;----------------- + +bifilter8x4_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp9/common/arm/neon/vp9_bilinearpredict8x8_neon.asm b/vp9/common/arm/neon/vp9_bilinearpredict8x8_neon.asm new file mode 100644 index 000000000..6967f1950 --- /dev/null +++ b/vp9/common/arm/neon/vp9_bilinearpredict8x8_neon.asm @@ -0,0 +1,183 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_bilinear_predict8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_bilinear_predict8x8_neon| PROC + push {r4, lr} + + adr r12, bifilter8_coeff + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq skip_firstpass_filter + +;First pass: output_height lines x output_width columns (9x8) + add r2, r12, r2, lsl #3 ;calculate filter location + + vld1.u8 {q1}, [r0], r1 ;load src data + vld1.u32 {d31}, [r2] ;load first_pass filter + vld1.u8 {q2}, [r0], r1 + vdup.8 d0, d31[0] ;first_pass filter (d0 d1) + vld1.u8 {q3}, [r0], r1 + vdup.8 d1, d31[4] + vld1.u8 {q4}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + + vld1.u8 {q1}, [r0], r1 ;load src data + vqrshrn.u16 d22, q6, #7 ;shift/round/saturate to u8 + vld1.u8 {q2}, [r0], r1 + vqrshrn.u16 d23, q7, #7 + vld1.u8 {q3}, [r0], r1 + vqrshrn.u16 d24, q8, #7 + vld1.u8 {q4}, [r0], r1 + vqrshrn.u16 d25, q9, #7 + + ;first_pass filtering on the rest 5-line data + vld1.u8 {q5}, [r0], r1 + + vmull.u8 q6, d2, d0 ;(src_ptr[0] * vp9_filter[0]) + vmull.u8 q7, d4, d0 + vmull.u8 q8, d6, d0 + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + + vext.8 d3, d2, d3, #1 ;construct src_ptr[-1] + vext.8 d5, d4, d5, #1 + vext.8 d7, d6, d7, #1 + vext.8 d9, d8, d9, #1 + vext.8 d11, d10, d11, #1 + + vmlal.u8 q6, d3, d1 ;(src_ptr[1] * vp9_filter[1]) + vmlal.u8 q7, d5, d1 + vmlal.u8 q8, d7, d1 + vmlal.u8 q9, d9, d1 + vmlal.u8 q10, d11, d1 + + vqrshrn.u16 d26, q6, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d27, q7, #7 + vqrshrn.u16 d28, q8, #7 + vqrshrn.u16 d29, q9, #7 + vqrshrn.u16 d30, q10, #7 + +;Second pass: 8x8 +secondpass_filter + cmp r3, #0 ;skip second_pass filter if yoffset=0 + beq skip_secondpass_filter + + add r3, r12, r3, lsl #3 + add r0, r4, lr + + vld1.u32 {d31}, [r3] ;load second_pass filter + add r1, r0, lr + + vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1) + vdup.8 d1, d31[4] + + vmull.u8 q1, d22, d0 ;(src_ptr[0] * vp9_filter[0]) + vmull.u8 q2, d23, d0 + vmull.u8 q3, d24, d0 + vmull.u8 q4, d25, d0 + vmull.u8 q5, d26, d0 + vmull.u8 q6, d27, d0 
+ vmull.u8 q7, d28, d0 + vmull.u8 q8, d29, d0 + + vmlal.u8 q1, d23, d1 ;(src_ptr[pixel_step] * vp9_filter[1]) + vmlal.u8 q2, d24, d1 + vmlal.u8 q3, d25, d1 + vmlal.u8 q4, d26, d1 + vmlal.u8 q5, d27, d1 + vmlal.u8 q6, d28, d1 + vmlal.u8 q7, d29, d1 + vmlal.u8 q8, d30, d1 + + vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8 + vqrshrn.u16 d3, q2, #7 + vqrshrn.u16 d4, q3, #7 + vqrshrn.u16 d5, q4, #7 + vqrshrn.u16 d6, q5, #7 + vqrshrn.u16 d7, q6, #7 + vqrshrn.u16 d8, q7, #7 + vqrshrn.u16 d9, q8, #7 + + vst1.u8 {d2}, [r4] ;store result + vst1.u8 {d3}, [r0] + vst1.u8 {d4}, [r1], lr + vst1.u8 {d5}, [r1], lr + vst1.u8 {d6}, [r1], lr + vst1.u8 {d7}, [r1], lr + vst1.u8 {d8}, [r1], lr + vst1.u8 {d9}, [r1], lr + + pop {r4, pc} + +;-------------------- +skip_firstpass_filter + vld1.u8 {d22}, [r0], r1 ;load src data + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + vld1.u8 {d27}, [r0], r1 + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + + b secondpass_filter + +;--------------------- +skip_secondpass_filter + vst1.u8 {d22}, [r4], lr ;store result + vst1.u8 {d23}, [r4], lr + vst1.u8 {d24}, [r4], lr + vst1.u8 {d25}, [r4], lr + vst1.u8 {d26}, [r4], lr + vst1.u8 {d27}, [r4], lr + vst1.u8 {d28}, [r4], lr + vst1.u8 {d29}, [r4], lr + + pop {r4, pc} + + ENDP + +;----------------- + +bifilter8_coeff + DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112 + + END diff --git a/vp9/common/arm/neon/vp9_buildintrapredictorsmby_neon.asm b/vp9/common/arm/neon/vp9_buildintrapredictorsmby_neon.asm new file mode 100644 index 000000000..e3ea91fe6 --- /dev/null +++ b/vp9/common/arm/neon/vp9_buildintrapredictorsmby_neon.asm @@ -0,0 +1,584 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
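The two functions added below select one of four 16x16 luma prediction modes; the _s variant differs only in writing straight to the reconstruction buffer with its stride instead of to a packed predictor. A scalar model of the mode logic, with assumed names and treating Up/Left as 0/1 availability flags (an assumption for illustration):

/* Scalar model of the four 16x16 intra modes handled below
 * (mode 0..3 = DC, V, H, TM).  Names and the 0/1 up/lf flags are
 * assumptions, not the exported interface. */
static void build_intra_mby(unsigned char *pred, int mode,
                            const unsigned char *above,
                            const unsigned char *left,
                            unsigned char top_left, int up, int lf) {
    int r, c;
    for (r = 0; r < 16; r++) {
        for (c = 0; c < 16; c++) {
            int v;
            if (mode == 0) {                 /* DC: rounded average   */
                int sum = 0, shift = 3, i;   /* shift = 3 + #sources  */
                if (up) { for (i = 0; i < 16; i++) sum += above[i]; shift++; }
                if (lf) { for (i = 0; i < 16; i++) sum += left[i];  shift++; }
                v = (up || lf) ? (sum + (1 << (shift - 1))) >> shift : 128;
            } else if (mode == 1) {
                v = above[c];                /* V: copy top row down  */
            } else if (mode == 2) {
                v = left[r];                 /* H: replicate left col */
            } else {                         /* TM: left + above - TL */
                v = left[r] + above[c] - top_left;
                v = v < 0 ? 0 : (v > 255 ? 255 : v);
            }
            pred[r * 16 + c] = (unsigned char)v;
        }
    }
}

In the assembly, the vqshrun.s16 ..., #0 instructions in the TM path are what perform the 0..255 clamp.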
+; + + + EXPORT |vp8_build_intra_predictors_mby_neon_func| + EXPORT |vp8_build_intra_predictors_mby_s_neon_func| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +; r0 unsigned char *y_buffer +; r1 unsigned char *ypred_ptr +; r2 int y_stride +; r3 int mode +; stack int Up +; stack int Left + +|vp8_build_intra_predictors_mby_neon_func| PROC + push {r4-r8, lr} + + cmp r3, #0 + beq case_dc_pred + cmp r3, #1 + beq case_v_pred + cmp r3, #2 + beq case_h_pred + cmp r3, #3 + beq case_tm_pred + +case_dc_pred + ldr r4, [sp, #24] ; Up + ldr r5, [sp, #28] ; Left + + ; Default the DC average to 128 + mov r12, #128 + vdup.u8 q0, r12 + + ; Zero out running sum + mov r12, #0 + + ; compute shift and jump + adds r7, r4, r5 + beq skip_dc_pred_up_left + + ; Load above row, if it exists + cmp r4, #0 + beq skip_dc_pred_up + + sub r6, r0, r2 + vld1.8 {q1}, [r6] + vpaddl.u8 q2, q1 + vpaddl.u16 q3, q2 + vpaddl.u32 q4, q3 + + vmov.32 r4, d8[0] + vmov.32 r6, d9[0] + + add r12, r4, r6 + + ; Move back to interger registers + +skip_dc_pred_up + + cmp r5, #0 + beq skip_dc_pred_left + + sub r0, r0, #1 + + ; Load left row, if it exists + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0] + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + +skip_dc_pred_left + add r7, r7, #3 ; Shift + sub r4, r7, #1 + mov r5, #1 + add r12, r12, r5, lsl r4 + mov r5, r12, lsr r7 ; expected_dc + + vdup.u8 q0, r5 + +skip_dc_pred_up_left + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + + pop {r4-r8,pc} +case_v_pred + ; Copy down above row + sub r6, r0, r2 + vld1.8 {q0}, [r6] + + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + vst1.u8 {q0}, [r1]! + pop {r4-r8,pc} + +case_h_pred + ; Load 4x yleft_col + sub r0, r0, #1 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! 
+ + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + pop {r4-r8,pc} + +case_tm_pred + ; Load yabove_row + sub r3, r0, r2 + vld1.8 {q8}, [r3] + + ; Load ytop_left + sub r3, r3, #1 + ldrb r7, [r3] + + vdup.u16 q7, r7 + + ; Compute yabove_row - ytop_left + mov r3, #1 + vdup.u8 q0, r3 + + vmull.u8 q4, d16, d0 + vmull.u8 q5, d17, d0 + + vsub.s16 q4, q4, q7 + vsub.s16 q5, q5, q7 + + ; Load 4x yleft_col + sub r0, r0, #1 + mov r12, #4 + +case_tm_pred_loop + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u16 q0, r3 + vdup.u16 q1, r4 + vdup.u16 q2, r5 + vdup.u16 q3, r6 + + vqadd.s16 q8, q0, q4 + vqadd.s16 q9, q0, q5 + + vqadd.s16 q10, q1, q4 + vqadd.s16 q11, q1, q5 + + vqadd.s16 q12, q2, q4 + vqadd.s16 q13, q2, q5 + + vqadd.s16 q14, q3, q4 + vqadd.s16 q15, q3, q5 + + vqshrun.s16 d0, q8, #0 + vqshrun.s16 d1, q9, #0 + + vqshrun.s16 d2, q10, #0 + vqshrun.s16 d3, q11, #0 + + vqshrun.s16 d4, q12, #0 + vqshrun.s16 d5, q13, #0 + + vqshrun.s16 d6, q14, #0 + vqshrun.s16 d7, q15, #0 + + vst1.u8 {q0}, [r1]! + vst1.u8 {q1}, [r1]! + vst1.u8 {q2}, [r1]! + vst1.u8 {q3}, [r1]! + + subs r12, r12, #1 + bne case_tm_pred_loop + + pop {r4-r8,pc} + + ENDP + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; r0 unsigned char *y_buffer +; r1 unsigned char *ypred_ptr +; r2 int y_stride +; r3 int mode +; stack int Up +; stack int Left + +|vp8_build_intra_predictors_mby_s_neon_func| PROC + push {r4-r8, lr} + + mov r1, r0 ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor; + + cmp r3, #0 + beq case_dc_pred_s + cmp r3, #1 + beq case_v_pred_s + cmp r3, #2 + beq case_h_pred_s + cmp r3, #3 + beq case_tm_pred_s + +case_dc_pred_s + ldr r4, [sp, #24] ; Up + ldr r5, [sp, #28] ; Left + + ; Default the DC average to 128 + mov r12, #128 + vdup.u8 q0, r12 + + ; Zero out running sum + mov r12, #0 + + ; compute shift and jump + adds r7, r4, r5 + beq skip_dc_pred_up_left_s + + ; Load above row, if it exists + cmp r4, #0 + beq skip_dc_pred_up_s + + sub r6, r0, r2 + vld1.8 {q1}, [r6] + vpaddl.u8 q2, q1 + vpaddl.u16 q3, q2 + vpaddl.u32 q4, q3 + + vmov.32 r4, d8[0] + vmov.32 r6, d9[0] + + add r12, r4, r6 + + ; Move back to interger registers + +skip_dc_pred_up_s + + cmp r5, #0 + beq skip_dc_pred_left_s + + sub r0, r0, #1 + + ; Load left row, if it exists + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0] + + add r12, r12, r3 + add r12, r12, r4 + add r12, r12, r5 + add r12, r12, r6 + +skip_dc_pred_left_s + add r7, r7, #3 ; Shift + sub r4, r7, #1 + mov r5, #1 + add r12, r12, r5, lsl r4 + mov r5, r12, lsr r7 ; expected_dc + + vdup.u8 q0, r5 + +skip_dc_pred_up_left_s + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 
+ vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + + pop {r4-r8,pc} +case_v_pred_s + ; Copy down above row + sub r6, r0, r2 + vld1.8 {q0}, [r6] + + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q0}, [r1], r2 + pop {r4-r8,pc} + +case_h_pred_s + ; Load 4x yleft_col + sub r0, r0, #1 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u8 q0, r3 + vdup.u8 q1, r4 + vdup.u8 q2, r5 + vdup.u8 q3, r6 + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + pop {r4-r8,pc} + +case_tm_pred_s + ; Load yabove_row + sub r3, r0, r2 + vld1.8 {q8}, [r3] + + ; Load ytop_left + sub r3, r3, #1 + ldrb r7, [r3] + + vdup.u16 q7, r7 + + ; Compute yabove_row - ytop_left + mov r3, #1 + vdup.u8 q0, r3 + + vmull.u8 q4, d16, d0 + vmull.u8 q5, d17, d0 + + vsub.s16 q4, q4, q7 + vsub.s16 q5, q5, q7 + + ; Load 4x yleft_col + sub r0, r0, #1 + mov r12, #4 + +case_tm_pred_loop_s + ldrb r3, [r0], r2 + ldrb r4, [r0], r2 + ldrb r5, [r0], r2 + ldrb r6, [r0], r2 + vdup.u16 q0, r3 + vdup.u16 q1, r4 + vdup.u16 q2, r5 + vdup.u16 q3, r6 + + vqadd.s16 q8, q0, q4 + vqadd.s16 q9, q0, q5 + + vqadd.s16 q10, q1, q4 + vqadd.s16 q11, q1, q5 + + vqadd.s16 q12, q2, q4 + vqadd.s16 q13, q2, q5 + + vqadd.s16 q14, q3, q4 + vqadd.s16 q15, q3, q5 + + vqshrun.s16 d0, q8, #0 + vqshrun.s16 d1, q9, #0 + + vqshrun.s16 d2, q10, #0 + vqshrun.s16 d3, q11, #0 + + vqshrun.s16 d4, q12, #0 + vqshrun.s16 d5, q13, #0 + + vqshrun.s16 d6, q14, #0 + vqshrun.s16 d7, q15, #0 + + vst1.u8 {q0}, [r1], r2 + vst1.u8 {q1}, [r1], r2 + vst1.u8 {q2}, [r1], r2 + vst1.u8 {q3}, [r1], r2 + + subs r12, r12, #1 + bne case_tm_pred_loop_s + + pop {r4-r8,pc} + + ENDP + + + END diff --git a/vp9/common/arm/neon/vp9_copymem16x16_neon.asm b/vp9/common/arm/neon/vp9_copymem16x16_neon.asm new file mode 100644 index 000000000..bff8156d9 --- /dev/null +++ b/vp9/common/arm/neon/vp9_copymem16x16_neon.asm @@ -0,0 +1,59 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
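The three copy_mem variants that follow (16x16, 8x4, 8x8) are plain block copies; the NEON versions interleave the loads and stores to hide memory latency. Their net effect, as a sketch with hypothetical naming:

#include <string.h>

/* What each vp9_copy_memWxH_neon below reduces to. */
static void copy_mem(const unsigned char *src, int src_stride,
                     unsigned char *dst, int dst_stride, int w, int h) {
    int r;
    for (r = 0; r < h; r++)
        memcpy(dst + r * dst_stride, src + r * src_stride, w);
}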
+; + + + EXPORT |vp9_copy_mem16x16_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem16x16_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp9_copy_mem16x16_neon| PROC + + vld1.u8 {q0}, [r0], r1 + vld1.u8 {q1}, [r0], r1 + vld1.u8 {q2}, [r0], r1 + vst1.u8 {q0}, [r2], r3 + vld1.u8 {q3}, [r0], r1 + vst1.u8 {q1}, [r2], r3 + vld1.u8 {q4}, [r0], r1 + vst1.u8 {q2}, [r2], r3 + vld1.u8 {q5}, [r0], r1 + vst1.u8 {q3}, [r2], r3 + vld1.u8 {q6}, [r0], r1 + vst1.u8 {q4}, [r2], r3 + vld1.u8 {q7}, [r0], r1 + vst1.u8 {q5}, [r2], r3 + vld1.u8 {q8}, [r0], r1 + vst1.u8 {q6}, [r2], r3 + vld1.u8 {q9}, [r0], r1 + vst1.u8 {q7}, [r2], r3 + vld1.u8 {q10}, [r0], r1 + vst1.u8 {q8}, [r2], r3 + vld1.u8 {q11}, [r0], r1 + vst1.u8 {q9}, [r2], r3 + vld1.u8 {q12}, [r0], r1 + vst1.u8 {q10}, [r2], r3 + vld1.u8 {q13}, [r0], r1 + vst1.u8 {q11}, [r2], r3 + vld1.u8 {q14}, [r0], r1 + vst1.u8 {q12}, [r2], r3 + vld1.u8 {q15}, [r0], r1 + vst1.u8 {q13}, [r2], r3 + vst1.u8 {q14}, [r2], r3 + vst1.u8 {q15}, [r2], r3 + + mov pc, lr + + ENDP ; |vp9_copy_mem16x16_neon| + + END diff --git a/vp9/common/arm/neon/vp9_copymem8x4_neon.asm b/vp9/common/arm/neon/vp9_copymem8x4_neon.asm new file mode 100644 index 000000000..ffd2df8e1 --- /dev/null +++ b/vp9/common/arm/neon/vp9_copymem8x4_neon.asm @@ -0,0 +1,34 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_copy_mem8x4_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x4_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp9_copy_mem8x4_neon| PROC + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r0], r1 + vst1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r0], r1 + vst1.u8 {d2}, [r2], r3 + vst1.u8 {d3}, [r2], r3 + + mov pc, lr + + ENDP ; |vp9_copy_mem8x4_neon| + + END diff --git a/vp9/common/arm/neon/vp9_copymem8x8_neon.asm b/vp9/common/arm/neon/vp9_copymem8x8_neon.asm new file mode 100644 index 000000000..2d394c043 --- /dev/null +++ b/vp9/common/arm/neon/vp9_copymem8x8_neon.asm @@ -0,0 +1,43 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp9_copy_mem8x8_neon| + ; ARM + ; REQUIRE8 + ; PRESERVE8 + + AREA Block, CODE, READONLY ; name this block of code +;void copy_mem8x8_neon( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) +;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= +|vp9_copy_mem8x8_neon| PROC + + vld1.u8 {d0}, [r0], r1 + vld1.u8 {d1}, [r0], r1 + vst1.u8 {d0}, [r2], r3 + vld1.u8 {d2}, [r0], r1 + vst1.u8 {d1}, [r2], r3 + vld1.u8 {d3}, [r0], r1 + vst1.u8 {d2}, [r2], r3 + vld1.u8 {d4}, [r0], r1 + vst1.u8 {d3}, [r2], r3 + vld1.u8 {d5}, [r0], r1 + vst1.u8 {d4}, [r2], r3 + vld1.u8 {d6}, [r0], r1 + vst1.u8 {d5}, [r2], r3 + vld1.u8 {d7}, [r0], r1 + vst1.u8 {d6}, [r2], r3 + vst1.u8 {d7}, [r2], r3 + + mov pc, lr + + ENDP ; |vp9_copy_mem8x8_neon| + + END diff --git a/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm new file mode 100644 index 000000000..49ba05fb0 --- /dev/null +++ b/vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm @@ -0,0 +1,49 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + + EXPORT |vp8_dc_only_idct_add_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr, +; unsigned char *dst_ptr, int pitch, int stride) +; r0 input_dc +; r1 pred_ptr +; r2 dst_ptr +; r3 pitch +; sp stride +|vp8_dc_only_idct_add_neon| PROC + add r0, r0, #4 + asr r0, r0, #3 + ldr r12, [sp] + vdup.16 q0, r0 + + vld1.32 {d2[0]}, [r1], r3 + vld1.32 {d2[1]}, [r1], r3 + vld1.32 {d4[0]}, [r1], r3 + vld1.32 {d4[1]}, [r1] + + vaddw.u8 q1, q0, d2 + vaddw.u8 q2, q0, d4 + + vqmovun.s16 d2, q1 + vqmovun.s16 d4, q2 + + vst1.32 {d2[0]}, [r2], r12 + vst1.32 {d2[1]}, [r2], r12 + vst1.32 {d4[0]}, [r2], r12 + vst1.32 {d4[1]}, [r2] + + bx lr + + ENDP + END diff --git a/vp9/common/arm/neon/vp9_iwalsh_neon.asm b/vp9/common/arm/neon/vp9_iwalsh_neon.asm new file mode 100644 index 000000000..01c79d937 --- /dev/null +++ b/vp9/common/arm/neon/vp9_iwalsh_neon.asm @@ -0,0 +1,80 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
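vp8_short_inv_walsh4x4_neon below is the 4x4 inverse Walsh-Hadamard transform applied to the DC block: column butterflies run first on the loaded data, the vtrn pairs transpose, and the row butterflies finish with (x + 3) >> 3 rounding. A scalar model, with assumed names:

/* C model of the 4x4 inverse Walsh-Hadamard transform performed by the
 * NEON code below; a sketch, not the project's scalar reference. */
static void inv_walsh4x4(const short *input, short *output) {
    int i;
    short t[16];
    for (i = 0; i < 4; i++) {             /* columns */
        int a = input[i] + input[12 + i];
        int d = input[i] - input[12 + i];
        int b = input[4 + i] + input[8 + i];
        int c = input[4 + i] - input[8 + i];
        t[i]      = (short)(a + b);
        t[4 + i]  = (short)(d + c);
        t[8 + i]  = (short)(a - b);
        t[12 + i] = (short)(d - c);
    }
    for (i = 0; i < 4; i++) {             /* rows, with +3 >> 3 rounding */
        int a = t[4 * i] + t[4 * i + 3];
        int d = t[4 * i] - t[4 * i + 3];
        int b = t[4 * i + 1] + t[4 * i + 2];
        int c = t[4 * i + 1] - t[4 * i + 2];
        output[4 * i]     = (short)((a + b + 3) >> 3);
        output[4 * i + 1] = (short)((d + c + 3) >> 3);
        output[4 * i + 2] = (short)((a - b + 3) >> 3);
        output[4 * i + 3] = (short)((d - c + 3) >> 3);
    }
}

The _1 variant is the all-DC shortcut: every output element is (input[0] + 3) >> 3.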
+; + EXPORT |vp8_short_inv_walsh4x4_neon| + EXPORT |vp8_short_inv_walsh4x4_1_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA |.text|, CODE, READONLY ; name this block of code + +;short vp8_short_inv_walsh4x4_neon(short *input, short *output) +|vp8_short_inv_walsh4x4_neon| PROC + + ; read in all four lines of values: d0->d3 + vld1.i16 {q0-q1}, [r0@128] + + ; first for loop + vadd.s16 d4, d0, d3 ;a = [0] + [12] + vadd.s16 d6, d1, d2 ;b = [4] + [8] + vsub.s16 d5, d0, d3 ;d = [0] - [12] + vsub.s16 d7, d1, d2 ;c = [4] - [8] + + vadd.s16 q0, q2, q3 ; a+b d+c + vsub.s16 q1, q2, q3 ; a-b d-c + + vtrn.32 d0, d2 ;d0: 0 1 8 9 + ;d2: 2 3 10 11 + vtrn.32 d1, d3 ;d1: 4 5 12 13 + ;d3: 6 7 14 15 + + vtrn.16 d0, d1 ;d0: 0 4 8 12 + ;d1: 1 5 9 13 + vtrn.16 d2, d3 ;d2: 2 6 10 14 + ;d3: 3 7 11 15 + + ; second for loop + + vadd.s16 d4, d0, d3 ;a = [0] + [3] + vadd.s16 d6, d1, d2 ;b = [1] + [2] + vsub.s16 d5, d0, d3 ;d = [0] - [3] + vsub.s16 d7, d1, d2 ;c = [1] - [2] + + vmov.i16 q8, #3 + + vadd.s16 q0, q2, q3 ; a+b d+c + vsub.s16 q1, q2, q3 ; a-b d-c + + vadd.i16 q0, q0, q8 ;e/f += 3 + vadd.i16 q1, q1, q8 ;g/h += 3 + + vshr.s16 q0, q0, #3 ;e/f >> 3 + vshr.s16 q1, q1, #3 ;g/h >> 3 + + vst4.i16 {d0,d1,d2,d3}, [r1@128] + + bx lr + ENDP ; |vp8_short_inv_walsh4x4_neon| + + +;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output) +|vp8_short_inv_walsh4x4_1_neon| PROC + ldrsh r2, [r0] ; load input[0] + add r3, r2, #3 ; add 3 + add r2, r1, #16 ; base for last 8 output + asr r0, r3, #3 ; right shift 3 + vdup.16 q0, r0 ; load and duplicate + vst1.16 {q0}, [r1@128] ; write back 8 + vst1.16 {q0}, [r2@128] ; write back last 8 + bx lr + ENDP ; |vp8_short_inv_walsh4x4_1_neon| + + END diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.asm b/vp9/common/arm/neon/vp9_loopfilter_neon.asm new file mode 100644 index 000000000..bc6616734 --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfilter_neon.asm @@ -0,0 +1,397 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
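The four edge functions added below differ only in load/store addressing (plus a transpose for the vertical edges); the per-pixel work is shared in the vp9_loop_filter_neon helper. A scalar model of that kernel follows; names are assumed, the 0x80 veor offsets are modeled by the -128 adjustments, and int intermediates stand in for the NEON's saturating u8/s8 lane arithmetic.

#include <stdlib.h>   /* abs() */

static signed char sclamp(int v) {          /* s8 saturation */
    return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

/* One pixel column across the edge: p3..p0 on one side, q0..q3 on the
 * other; step is the pitch (horizontal edge) or 1 (vertical edge). */
static void filter4(unsigned char blimit, unsigned char limit,
                    unsigned char thresh, unsigned char *pix, int step) {
    unsigned char p3 = pix[-4 * step], p2 = pix[-3 * step];
    unsigned char p1 = pix[-2 * step], p0 = pix[-step];
    unsigned char q0 = pix[0], q1 = pix[step];
    unsigned char q2 = pix[2 * step], q3 = pix[3 * step];
    signed char ps1 = p1 - 128, ps0 = p0 - 128;   /* the 0x80 veor */
    signed char qs0 = q0 - 128, qs1 = q1 - 128;
    signed char f, f1, f2;
    int hev = abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
    int mask = abs(p3 - p2) <= limit && abs(p2 - p1) <= limit &&
               abs(p1 - p0) <= limit && abs(q1 - q0) <= limit &&
               abs(q2 - q1) <= limit && abs(q3 - q2) <= limit &&
               abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
    if (!mask) return;                            /* vand with the mask */
    f  = sclamp((hev ? sclamp(ps1 - qs1) : 0) + 3 * (qs0 - ps0));
    f1 = sclamp(f + 4) >> 3;                      /* Filter1 */
    f2 = sclamp(f + 3) >> 3;                      /* Filter2 */
    pix[-step] = (unsigned char)(sclamp(ps0 + f2) + 128);
    pix[0]     = (unsigned char)(sclamp(qs0 - f1) + 128);
    if (!hev) {                                   /* outer taps */
        signed char u = (signed char)((f1 + 1) >> 1);   /* vrshr #1 */
        pix[-2 * step] = (unsigned char)(sclamp(ps1 + u) + 128);
        pix[step]      = (unsigned char)(sclamp(qs1 - u) + 128);
    }
}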
+; + + + EXPORT |vp9_loop_filter_horizontal_edge_y_neon| + EXPORT |vp9_loop_filter_horizontal_edge_uv_neon| + EXPORT |vp9_loop_filter_vertical_edge_y_neon| + EXPORT |vp9_loop_filter_vertical_edge_uv_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *src +; r1 int pitch +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +|vp9_loop_filter_horizontal_edge_y_neon| PROC + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1 + add r1, r1, r1 + + vdup.u8 q2, r3 ; duplicate thresh + + vld1.u8 {q3}, [r2@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r2@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r2@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r2@128] ; q2 + vld1.u8 {q10}, [r12@128] ; q3 + + sub r2, r2, r1, lsl #1 + sub r12, r12, r1, lsl #1 + + bl vp9_loop_filter_neon + + vst1.u8 {q5}, [r2@128], r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r2@128], r1 ; store oq0 + vst1.u8 {q8}, [r12@128], r1 ; store oq1 + + pop {pc} + ENDP ; |vp9_loop_filter_horizontal_edge_y_neon| + + +; r0 unsigned char *u, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +; sp+4 unsigned char *v +|vp9_loop_filter_horizontal_edge_uv_neon| PROC + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + ldr r12, [sp, #4] ; load thresh + ldr r2, [sp, #8] ; load v ptr + vdup.u8 q2, r12 ; duplicate thresh + + sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines + sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r3@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r3@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r3@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r3@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r3@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r3@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r3@64] ; q3 + vld1.u8 {d21}, [r12@64] ; q3 + + bl vp9_loop_filter_neon + + sub r0, r0, r1, lsl #1 + sub r2, r2, r1, lsl #1 + + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r2@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r2@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r2@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64] ; store u oq1 + vst1.u8 {d17}, [r2@64] ; store v oq1 + + pop {pc} + ENDP ; |vp9_loop_filter_horizontal_edge_uv_neon| + +; void vp9_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, +; const signed char *flimit, +; const signed char *limit, +; const signed char *thresh, +; int count) +; r0 unsigned char *src +; r1 int pitch +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, + +|vp9_loop_filter_vertical_edge_y_neon| PROC + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + vdup.u8 q1, r3 ; duplicate limit + sub r2, r0, #4 ; src ptr down by 4 columns + add r1, r1, r1 + ldr r3, [sp, #4] ; load thresh + add r12, r2, r1, asr #1 + + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d8}, [r12], r1 + vld1.u8 {d10}, [r2], r1 + vld1.u8 {d12}, [r12], r1 + vld1.u8 {d14}, [r2], r1 + vld1.u8 {d16}, [r12], r1 + vld1.u8 {d18}, [r2], r1 + vld1.u8 {d20}, [r12], r1 + 
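+ ; The two pointers walk interleaved rows (r2 from row 0, r12 from row 1,
+ ; both stepping 2 * pitch), so the eight loads above fill the even d
+ ; registers (the low halves of q3-q10) with rows 0-7 and the eight
+ ; below fill the odd d registers with rows 8-15; the vtrn.32/.16/.8
+ ; ladder that follows transposes the 16x8 block so q3-q10 hold p3..q3
+ ; for all 16 rows, letting the shared vp9_loop_filter_neon kernel run
+ ; unchanged on a vertical edge.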
+ vld1.u8 {d7}, [r2], r1 ; load second 8-line src data + vld1.u8 {d9}, [r12], r1 + vld1.u8 {d11}, [r2], r1 + vld1.u8 {d13}, [r12], r1 + vld1.u8 {d15}, [r2], r1 + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d19}, [r2] + vld1.u8 {d21}, [r12] + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vdup.u8 q2, r3 ; duplicate thresh + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + bl vp9_loop_filter_neon + + vswp d12, d11 + vswp d16, d13 + + sub r0, r0, #2 ; dst ptr + + vswp d14, d12 + vswp d16, d15 + + add r12, r0, r1, asr #1 + + ;store op1, op0, oq0, oq1 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 + + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 + vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 + vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] + + pop {pc} + ENDP ; |vp9_loop_filter_vertical_edge_y_neon| + +; void vp9_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch +; const signed char *flimit, +; const signed char *limit, +; const signed char *thresh, +; unsigned char *v) +; r0 unsigned char *u, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +; sp+4 unsigned char *v +|vp9_loop_filter_vertical_edge_uv_neon| PROC + push {lr} + vdup.u8 q0, r2 ; duplicate blimit + sub r12, r0, #4 ; move u pointer down by 4 columns + ldr r2, [sp, #8] ; load v ptr + vdup.u8 q1, r3 ; duplicate limit + sub r3, r2, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r12], r1 ;load u data + vld1.u8 {d7}, [r3], r1 ;load v data + vld1.u8 {d8}, [r12], r1 + vld1.u8 {d9}, [r3], r1 + vld1.u8 {d10}, [r12], r1 + vld1.u8 {d11}, [r3], r1 + vld1.u8 {d12}, [r12], r1 + vld1.u8 {d13}, [r3], r1 + vld1.u8 {d14}, [r12], r1 + vld1.u8 {d15}, [r3], r1 + vld1.u8 {d16}, [r12], r1 + vld1.u8 {d17}, [r3], r1 + vld1.u8 {d18}, [r12], r1 + vld1.u8 {d19}, [r3], r1 + vld1.u8 {d20}, [r12] + vld1.u8 {d21}, [r3] + + ldr r12, [sp, #4] ; load thresh + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vdup.u8 q2, r12 ; duplicate thresh + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + bl vp9_loop_filter_neon + + vswp d12, d11 + vswp d16, d13 + vswp d14, d12 + vswp d16, d15 + + sub r0, r0, #2 + sub r2, r2, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, 
[r0], r1 + vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] + vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] + + pop {pc} + ENDP ; |vp9_loop_filter_vertical_edge_uv_neon| + +; void vp9_loop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. + +; r0-r3 PRESERVE +; q0 flimit +; q1 limit +; q2 thresh +; q3 p3 +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 +|vp9_loop_filter_neon| PROC + + ; vp9_filter_mask + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q3, q9, q8 ; abs(q2 - q1) + vabd.u8 q4, q10, q9 ; abs(q3 - q2) + + vmax.u8 q11, q11, q12 + vmax.u8 q12, q13, q14 + vmax.u8 q3, q3, q4 + vmax.u8 q15, q11, q12 + + vabd.u8 q9, q6, q7 ; abs(p0 - q0) + + ; vp8_hevmask + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 q15, q15, q3 + + vmov.u8 q10, #0x80 ; 0x80 + + vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 + + vcge.u8 q15, q1, q15 + + ; vp9_filter() function + ; convert to signed + veor q7, q7, q10 ; qs0 + vshr.u8 q2, q2, #1 ; a = a / 2 + veor q6, q6, q10 ; ps0 + + veor q5, q5, q10 ; ps1 + vqadd.u8 q9, q9, q2 ; a = b + a + + veor q8, q8, q10 ; qs1 + + vmov.u8 q10, #3 ; #3 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q11, d15, d13 + + vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 + + vmovl.u8 q4, d20 + + vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1) + vorr q14, q13, q14 ; vp8_hevmask + + vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) + vmul.i16 q11, q11, q4 + + vand q1, q1, q14 ; vp9_filter &= hev + vand q15, q15, q9 ; vp9_filter_mask + + vaddw.s8 q2, q2, d2 + vaddw.s8 q11, q11, d3 + + vmov.u8 q9, #4 ; #4 + + ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q11 + vand q1, q1, q15 ; vp9_filter &= mask + + vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp9_filter+3) + vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp9_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q1, q1, #3 ; Filter1 >>= 3 + + + vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) + + ; outer tap adjustments: ++vp9_filter >> 1 + vrshr.s8 q1, q1, #1 + vbic q1, q1, q14 ; vp9_filter &= ~hev + vmov.u8 q0, #0x80 ; 0x80 + vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp9_filter) + vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp9_filter) + + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + veor q5, q13, q0 ; *op1 = u^0x80 + veor q8, q12, q0 ; *oq1 = u^0x80 + + bx lr + ENDP ; |vp9_loop_filter_horizontal_edge_y_neon| + +;----------------- + + END diff --git a/vp9/common/arm/neon/vp9_loopfiltersimplehorizontaledge_neon.asm b/vp9/common/arm/neon/vp9_loopfiltersimplehorizontaledge_neon.asm new file mode 100644 index 000000000..eb07ce0d5 --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfiltersimplehorizontaledge_neon.asm @@ -0,0 +1,117 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + ;EXPORT |vp9_loop_filter_simple_horizontal_edge_neon| + EXPORT |vp9_loop_filter_bhs_neon| + EXPORT |vp9_loop_filter_mbhs_neon| + ARM + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE + +|vp9_loop_filter_simple_horizontal_edge_neon| PROC + + sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines + + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q5}, [r3@128], r1 ; p0 + vld1.u8 {q8}, [r0@128] ; q1 + vld1.u8 {q6}, [r3@128] ; p1 + + vabd.u8 q15, q6, q7 ; abs(p0 - q0) + vabd.u8 q14, q5, q8 ; abs(p1 - q1) + + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 + vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q13, #3 + vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value + veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value + veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value + veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value + + vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1 + + vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) + vsubl.s8 q3, d15, d13 + + vqsub.s8 q4, q5, q8 ; q4: vp9_filter = vp9_signed_char_clamp(ps1-qs1) + + vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0) + vmul.s16 q3, q3, q13 + + vmov.u8 q10, #0x03 ; 0x03 + vmov.u8 q9, #0x04 ; 0x04 + + vaddw.s8 q2, q2, d8 ; vp9_filter + 3 * ( qs0 - ps0) + vaddw.s8 q3, q3, d9 + + vqmovn.s16 d8, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d9, q3 + + vand q14, q4, q15 ; vp9_filter &= mask + + vqadd.s8 q2, q14, q10 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3) + vqadd.s8 q3, q14, q9 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q4, q3, #3 ; Filter1 >>= 3 + + sub r0, r0, r1 + + ;calculate output + vqadd.s8 q11, q6, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q7, q4 ; u = vp9_signed_char_clamp(qs0 - Filter1) + + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + + vst1.u8 {q6}, [r3@128] ; store op0 + vst1.u8 {q7}, [r0@128] ; store oq0 + + bx lr + ENDP ; |vp9_loop_filter_simple_horizontal_edge_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp9_loop_filter_bhs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate blim + + add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride + bl vp9_loop_filter_simple_horizontal_edge_neon + ; vp9_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1 + add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride + bl vp9_loop_filter_simple_horizontal_edge_neon + add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride + pop {r4, lr} + b vp9_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp9_loop_filter_bhs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp9_loop_filter_mbhs_neon| PROC + ldrb r3, [r2] ; load blim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp9_loop_filter_simple_horizontal_edge_neon + ENDP ;|vp9_loop_filter_bhs_neon| + + END diff --git a/vp9/common/arm/neon/vp9_loopfiltersimpleverticaledge_neon.asm b/vp9/common/arm/neon/vp9_loopfiltersimpleverticaledge_neon.asm new file mode 100644 index 000000000..d5cf8c2b5 --- /dev/null +++ 
b/vp9/common/arm/neon/vp9_loopfiltersimpleverticaledge_neon.asm @@ -0,0 +1,154 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + ;EXPORT |vp9_loop_filter_simple_vertical_edge_neon| + EXPORT |vp9_loop_filter_bvs_neon| + EXPORT |vp9_loop_filter_mbvs_neon| + ARM + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE + +|vp9_loop_filter_simple_vertical_edge_neon| PROC + sub r0, r0, #2 ; move src pointer down by 2 columns + add r12, r1, r1 + add r3, r0, r1 + + vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 + vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 + vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 + vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 + vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 + vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 + vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 + vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 + + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 + vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] + + vswp d7, d10 + vswp d12, d9 + + ;vp9_filter_mask() function + ;vp8_hevmask() function + sub r0, r0, r1, lsl #4 + vabd.u8 q15, q5, q4 ; abs(p0 - q0) + vabd.u8 q14, q3, q6 ; abs(p1 - q1) + + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 + vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q11, #3 + vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value + veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value + veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value + veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value + + vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) + vsubl.s8 q13, d9, d11 + + vqsub.s8 q14, q3, q6 ; vp9_filter = vp9_signed_char_clamp(ps1-qs1) + + vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vmul.s16 q13, q13, q11 + + vmov.u8 q11, #0x03 ; 0x03 + vmov.u8 q12, #0x04 ; 0x04 + + vaddw.s8 q2, q2, d28 ; vp9_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d29 + + vqmovn.s16 d28, q2 ; vp9_filter = vp9_signed_char_clamp(vp9_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d29, q13 + + add r0, r0, #1 + add r3, r0, r1 + + vand q14, q14, q15 ; vp9_filter &= mask + + vqadd.s8 q2, q14, q11 ; Filter2 = vp9_signed_char_clamp(vp9_filter+3) + vqadd.s8 q3, q14, q12 ; Filter1 = vp9_signed_char_clamp(vp9_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q14, q3, #3 ; Filter1 >>= 3 + + ;calculate output + vqadd.s8 q11, q5, q2 ; u = vp9_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q4, q14 ; u = vp9_signed_char_clamp(qs0 - Filter1) + + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + add r12, r1, r1 + vswp d13, d14 + + ;store op1, op0, oq0, oq1 + vst2.8 {d12[0], d13[0]}, [r0], r12 + vst2.8 {d12[1], d13[1]}, 
[r3], r12 + vst2.8 {d12[2], d13[2]}, [r0], r12 + vst2.8 {d12[3], d13[3]}, [r3], r12 + vst2.8 {d12[4], d13[4]}, [r0], r12 + vst2.8 {d12[5], d13[5]}, [r3], r12 + vst2.8 {d12[6], d13[6]}, [r0], r12 + vst2.8 {d12[7], d13[7]}, [r3], r12 + vst2.8 {d14[0], d15[0]}, [r0], r12 + vst2.8 {d14[1], d15[1]}, [r3], r12 + vst2.8 {d14[2], d15[2]}, [r0], r12 + vst2.8 {d14[3], d15[3]}, [r3], r12 + vst2.8 {d14[4], d15[4]}, [r0], r12 + vst2.8 {d14[5], d15[5]}, [r3], r12 + vst2.8 {d14[6], d15[6]}, [r0], r12 + vst2.8 {d14[7], d15[7]}, [r3] + + bx lr + ENDP ; |vp9_loop_filter_simple_vertical_edge_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp9_loop_filter_bvs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + mov r4, r0 + add r0, r0, #4 + vdup.s8 q1, r3 ; duplicate blim + bl vp9_loop_filter_simple_vertical_edge_neon + ; vp9_loop_filter_simple_vertical_edge_neon preserves r1 and q1 + add r0, r4, #8 + bl vp9_loop_filter_simple_vertical_edge_neon + add r0, r4, #12 + pop {r4, lr} + b vp9_loop_filter_simple_vertical_edge_neon + ENDP ;|vp9_loop_filter_bvs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp9_loop_filter_mbvs_neon| PROC + ldrb r3, [r2] ; load mblim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp9_loop_filter_simple_vertical_edge_neon + ENDP ;|vp9_loop_filter_bvs_neon| + END diff --git a/vp9/common/arm/neon/vp9_mbloopfilter_neon.asm b/vp9/common/arm/neon/vp9_mbloopfilter_neon.asm new file mode 100644 index 000000000..19b67f47d --- /dev/null +++ b/vp9/common/arm/neon/vp9_mbloopfilter_neon.asm @@ -0,0 +1,469 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
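The macroblock filters added below reuse the same mask and hev computation as the inner loop filter but widen the correction: where hev is set they apply the familiar Filter1/Filter2 step, and elsewhere they spread roughly 3/7, 2/7 and 1/7 of the filter value across three pixel pairs via the (63 + f * {27, 18, 9}) >> 7 series. A sketch of that tail (assumed names; f is the already-masked filter value, ps*/qs* the 0x80-offset signed pixels; arithmetic right shift of negatives is assumed, matching the NEON):

static signed char sclamp(int v) {          /* s8 saturation helper */
    return (signed char)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

/* Tail of the wide filter; mirrors vp8_mbloop_filter_neon below. */
static void mb_filter_apply(signed char f, int hev,
                            signed char *ps2, signed char *ps1, signed char *ps0,
                            signed char *qs0, signed char *qs1, signed char *qs2) {
    signed char fh = hev ? f : 0;          /* Filter2 &= hev          */
    signed char u;
    *qs0 = sclamp(*qs0 - (sclamp(fh + 4) >> 3));   /* Filter1 step    */
    *ps0 = sclamp(*ps0 + (sclamp(fh + 3) >> 3));   /* Filter2 step    */
    if (hev) return;                       /* wide taps use f &= ~hev */
    u = (signed char)((63 + f * 27) >> 7);         /* ~3/7 of f       */
    *qs0 = sclamp(*qs0 - u); *ps0 = sclamp(*ps0 + u);
    u = (signed char)((63 + f * 18) >> 7);         /* ~2/7 of f       */
    *qs1 = sclamp(*qs1 - u); *ps1 = sclamp(*ps1 + u);
    u = (signed char)((63 + f * 9) >> 7);          /* ~1/7 of f       */
    *qs2 = sclamp(*qs2 - u); *ps2 = sclamp(*ps2 + u);
}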
+; + + + EXPORT |vp8_mbloop_filter_horizontal_edge_y_neon| + EXPORT |vp8_mbloop_filter_horizontal_edge_uv_neon| + EXPORT |vp8_mbloop_filter_vertical_edge_y_neon| + EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh) +; r0 unsigned char *src, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +|vp8_mbloop_filter_horizontal_edge_y_neon| PROC + push {lr} + add r1, r1, r1 ; double stride + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + add r12, r0, r1, lsr #1 ; move src pointer up by 1 line + + vld1.u8 {q3}, [r0@128], r1 ; p3 + vld1.u8 {q4}, [r12@128], r1 ; p2 + vld1.u8 {q5}, [r0@128], r1 ; p1 + vld1.u8 {q6}, [r12@128], r1 ; p0 + vld1.u8 {q7}, [r0@128], r1 ; q0 + vld1.u8 {q8}, [r12@128], r1 ; q1 + vld1.u8 {q9}, [r0@128], r1 ; q2 + vld1.u8 {q10}, [r12@128], r1 ; q3 + + bl vp8_mbloop_filter_neon + + sub r12, r12, r1, lsl #2 + add r0, r12, r1, lsr #1 + + vst1.u8 {q4}, [r12@128],r1 ; store op2 + vst1.u8 {q5}, [r0@128],r1 ; store op1 + vst1.u8 {q6}, [r12@128], r1 ; store op0 + vst1.u8 {q7}, [r0@128],r1 ; store oq0 + vst1.u8 {q8}, [r12@128] ; store oq1 + vst1.u8 {q9}, [r0@128] ; store oq2 + + pop {pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon| + +; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, +; unsigned char *v) +; r0 unsigned char *u, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +; sp+4 unsigned char *v + +|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines + + vld1.u8 {d6}, [r0@64], r1 ; p3 + vld1.u8 {d7}, [r12@64], r1 ; p3 + vld1.u8 {d8}, [r0@64], r1 ; p2 + vld1.u8 {d9}, [r12@64], r1 ; p2 + vld1.u8 {d10}, [r0@64], r1 ; p1 + vld1.u8 {d11}, [r12@64], r1 ; p1 + vld1.u8 {d12}, [r0@64], r1 ; p0 + vld1.u8 {d13}, [r12@64], r1 ; p0 + vld1.u8 {d14}, [r0@64], r1 ; q0 + vld1.u8 {d15}, [r12@64], r1 ; q0 + vld1.u8 {d16}, [r0@64], r1 ; q1 + vld1.u8 {d17}, [r12@64], r1 ; q1 + vld1.u8 {d18}, [r0@64], r1 ; q2 + vld1.u8 {d19}, [r12@64], r1 ; q2 + vld1.u8 {d20}, [r0@64], r1 ; q3 + vld1.u8 {d21}, [r12@64], r1 ; q3 + + bl vp8_mbloop_filter_neon + + sub r0, r0, r1, lsl #3 + sub r12, r12, r1, lsl #3 + + add r0, r0, r1 + add r12, r12, r1 + + vst1.u8 {d8}, [r0@64], r1 ; store u op2 + vst1.u8 {d9}, [r12@64], r1 ; store v op2 + vst1.u8 {d10}, [r0@64], r1 ; store u op1 + vst1.u8 {d11}, [r12@64], r1 ; store v op1 + vst1.u8 {d12}, [r0@64], r1 ; store u op0 + vst1.u8 {d13}, [r12@64], r1 ; store v op0 + vst1.u8 {d14}, [r0@64], r1 ; store u oq0 + vst1.u8 {d15}, [r12@64], r1 ; store v oq0 + vst1.u8 {d16}, [r0@64], r1 ; store u oq1 + vst1.u8 {d17}, [r12@64], r1 ; store v oq1 + vst1.u8 {d18}, [r0@64], r1 ; store u oq2 + vst1.u8 {d19}, [r12@64], r1 ; store v oq2 + + pop {pc} + ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon| + +; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char 
*thresh) +; r0 unsigned char *src, +; r1 int pitch, +; r2 unsigned char blimit +; r3 unsigned char limit +; sp unsigned char thresh, +|vp8_mbloop_filter_vertical_edge_y_neon| PROC + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, #4 ; move src pointer down by 4 columns + vdup.s8 q2, r12 ; thresh + add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines + + vld1.u8 {d6}, [r0], r1 ; load first 8-line src data + vld1.u8 {d7}, [r12], r1 ; load second 8-line src data + vld1.u8 {d8}, [r0], r1 + vld1.u8 {d9}, [r12], r1 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r12], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r12], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r12], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r12], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r12], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + sub r0, r0, r1, lsl #3 + + bl vp8_mbloop_filter_neon + + sub r12, r12, r1, lsl #3 + + ;transpose to 16x8 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + ;store op2, op1, op0, oq0, oq1, oq2 + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r12], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d9}, [r12], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d11}, [r12], r1 + vst1.8 {d12}, [r0], r1 + vst1.8 {d13}, [r12], r1 + vst1.8 {d14}, [r0], r1 + vst1.8 {d15}, [r12], r1 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r12], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] + + pop {pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon| + +; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch, +; const unsigned char *blimit, +; const unsigned char *limit, +; const unsigned char *thresh, +; unsigned char *v) +; r0 unsigned char *u, +; r1 int pitch, +; r2 const signed char *flimit, +; r3 const signed char *limit, +; sp const signed char *thresh, +; sp+4 unsigned char *v +|vp8_mbloop_filter_vertical_edge_uv_neon| PROC + push {lr} + ldr r12, [sp, #4] ; load thresh + sub r0, r0, #4 ; move u pointer down by 4 columns + vdup.u8 q2, r12 ; thresh + ldr r12, [sp, #8] ; load v ptr + sub r12, r12, #4 ; move v pointer down by 4 columns + + vld1.u8 {d6}, [r0], r1 ;load u data + vld1.u8 {d7}, [r12], r1 ;load v data + vld1.u8 {d8}, [r0], r1 + vld1.u8 {d9}, [r12], r1 + vld1.u8 {d10}, [r0], r1 + vld1.u8 {d11}, [r12], r1 + vld1.u8 {d12}, [r0], r1 + vld1.u8 {d13}, [r12], r1 + vld1.u8 {d14}, [r0], r1 + vld1.u8 {d15}, [r12], r1 + vld1.u8 {d16}, [r0], r1 + vld1.u8 {d17}, [r12], r1 + vld1.u8 {d18}, [r0], r1 + vld1.u8 {d19}, [r12], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r12], r1 + + ;transpose to 8x16 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + sub r0, r0, r1, lsl #3 + + bl vp8_mbloop_filter_neon + + sub r12, r12, r1, lsl #3 + + ;transpose to 16x8 matrix + vtrn.32 q3, q7 + vtrn.32 q4, q8 + vtrn.32 q5, q9 + vtrn.32 q6, q10 + + vtrn.16 q3, q5 + vtrn.16 q4, q6 + vtrn.16 q7, q9 + vtrn.16 q8, q10 + + vtrn.8 q3, q4 + vtrn.8 q5, q6 + vtrn.8 q7, q8 + vtrn.8 q9, q10 + + ;store op2, 
op1, op0, oq0, oq1, oq2 + vst1.8 {d6}, [r0], r1 + vst1.8 {d7}, [r12], r1 + vst1.8 {d8}, [r0], r1 + vst1.8 {d9}, [r12], r1 + vst1.8 {d10}, [r0], r1 + vst1.8 {d11}, [r12], r1 + vst1.8 {d12}, [r0], r1 + vst1.8 {d13}, [r12], r1 + vst1.8 {d14}, [r0], r1 + vst1.8 {d15}, [r12], r1 + vst1.8 {d16}, [r0], r1 + vst1.8 {d17}, [r12], r1 + vst1.8 {d18}, [r0], r1 + vst1.8 {d19}, [r12], r1 + vst1.8 {d20}, [r0] + vst1.8 {d21}, [r12] + + pop {pc} + ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon| + +; void vp8_mbloop_filter_neon() +; This is a helper function for the macroblock loopfilters. The individual +; functions do the necessary load, transpose (if necessary), preserve (if +; necessary) and store. + +; r0,r1 PRESERVE +; r2 mblimit +; r3 limit + +; q2 thresh +; q3 p3 PRESERVE +; q4 p2 +; q5 p1 +; q6 p0 +; q7 q0 +; q8 q1 +; q9 q2 +; q10 q3 PRESERVE + +|vp8_mbloop_filter_neon| PROC + + ; vp9_filter_mask + vabd.u8 q11, q3, q4 ; abs(p3 - p2) + vabd.u8 q12, q4, q5 ; abs(p2 - p1) + vabd.u8 q13, q5, q6 ; abs(p1 - p0) + vabd.u8 q14, q8, q7 ; abs(q1 - q0) + vabd.u8 q1, q9, q8 ; abs(q2 - q1) + vabd.u8 q0, q10, q9 ; abs(q3 - q2) + + vmax.u8 q11, q11, q12 + vmax.u8 q12, q13, q14 + vmax.u8 q1, q1, q0 + vmax.u8 q15, q11, q12 + + vabd.u8 q12, q6, q7 ; abs(p0 - q0) + + ; vp8_hevmask + vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1 + vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1 + vmax.u8 q15, q15, q1 + + vdup.u8 q1, r3 ; limit + vdup.u8 q2, r2 ; mblimit + + vmov.u8 q0, #0x80 ; 0x80 + + vcge.u8 q15, q1, q15 + + vabd.u8 q1, q5, q8 ; a = abs(p1 - q1) + vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2 + vmov.u16 q11, #3 ; #3 + + ; vp9_filter + ; convert to signed + veor q7, q7, q0 ; qs0 + vshr.u8 q1, q1, #1 ; a = a / 2 + veor q6, q6, q0 ; ps0 + veor q5, q5, q0 ; ps1 + + vqadd.u8 q12, q12, q1 ; a = b + a + + veor q8, q8, q0 ; qs1 + veor q4, q4, q0 ; ps2 + veor q9, q9, q0 ; qs2 + + vorr q14, q13, q14 ; vp8_hevmask + + vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1 + + vsubl.s8 q2, d14, d12 ; qs0 - ps0 + vsubl.s8 q13, d15, d13 + + vqsub.s8 q1, q5, q8 ; vp9_filter = clamp(ps1-qs1) + + vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0) + + vand q15, q15, q12 ; vp9_filter_mask + + vmul.i16 q13, q13, q11 + + vmov.u8 q12, #3 ; #3 + + vaddw.s8 q2, q2, d2 ; vp9_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d3 + + vmov.u8 q11, #4 ; #4 + + ; vp9_filter = clamp(vp9_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d2, q2 + vqmovn.s16 d3, q13 + + vand q1, q1, q15 ; vp9_filter &= mask + + vmov.u16 q15, #63 ; #63 + + vand q13, q1, q14 ; Filter2 &= hev + + vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) + vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) + + vmov q0, q15 + + vshr.s8 q2, q2, #3 ; Filter1 >>= 3 + vshr.s8 q13, q13, #3 ; Filter2 >>= 3 + + vmov q11, q15 + vmov q12, q15 + + vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) + + vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) + + vbic q1, q1, q14 ; vp9_filter &= ~hev + + ; roughly 1/7th difference across boundary + ; roughly 2/7th difference across boundary + ; roughly 3/7th difference across boundary + + vmov.u8 d5, #9 ; #9 + vmov.u8 d4, #18 ; #18 + + vmov q13, q15 + vmov q14, q15 + + vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9 + vmlal.s8 q11, d3, d5 + vmov.u8 d5, #27 ; #27 + vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18 + vmlal.s8 q13, d3, d4 + vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27 + vmlal.s8 q15, d3, d5 + + vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7) + vqshrn.s16 d1, q11, #7 + vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7) + vqshrn.s16 d25, q13, #7 + 
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7) + vqshrn.s16 d29, q15, #7 + + vmov.u8 q1, #0x80 ; 0x80 + + vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u) + vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u) + vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u) + vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u) + vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u) + vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u) + + veor q9, q11, q1 ; *oq2 = s^0x80 + veor q4, q0, q1 ; *op2 = s^0x80 + veor q8, q13, q1 ; *oq1 = s^0x80 + veor q5, q12, q1 ; *op1 = s^0x80 + veor q7, q15, q1 ; *oq0 = s^0x80 + veor q6, q14, q1 ; *op0 = s^0x80 + + bx lr + ENDP ; |vp8_mbloop_filter_neon| + +;----------------- + + END diff --git a/vp9/common/arm/neon/vp9_recon16x16mb_neon.asm b/vp9/common/arm/neon/vp9_recon16x16mb_neon.asm new file mode 100644 index 000000000..3f1a30f48 --- /dev/null +++ b/vp9/common/arm/neon/vp9_recon16x16mb_neon.asm @@ -0,0 +1,131 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_recon16x16mb_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int ystride, +; stack unsigned char *udst_ptr, +; stack unsigned char *vdst_ptr + +|vp8_recon16x16mb_neon| PROC + mov r12, #4 ;loop counter for Y loop + +recon16x16mb_loop_y + vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr + vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr + vld1.u8 {q14, q15}, [r0]! + vld1.16 {q10, q11}, [r1]! + + vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d25 + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vld1.16 {q12, q13}, [r1]! + vmovl.u8 q7, d31 + vld1.16 {q14, q15}, [r1]! + + pld [r0] + pld [r1] + pld [r1, #64] + + vadd.s16 q0, q0, q8 ;add Diff data and Pred data together + vadd.s16 q1, q1, q9 + vadd.s16 q2, q2, q10 + vadd.s16 q3, q3, q11 + vadd.s16 q4, q4, q12 + vadd.s16 q5, q5, q13 + vadd.s16 q6, q6, q14 + vadd.s16 q7, q7, q15 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vqmovun.s16 d4, q4 + vqmovun.s16 d5, q5 + vst1.u8 {q0}, [r2], r3 ;store result + vqmovun.s16 d6, q6 + vst1.u8 {q1}, [r2], r3 + vqmovun.s16 d7, q7 + vst1.u8 {q2}, [r2], r3 + subs r12, r12, #1 + + moveq r12, #2 ;loop counter for UV loop + + vst1.u8 {q3}, [r2], r3 + bne recon16x16mb_loop_y + + mov r3, r3, lsr #1 ;uv_stride = ystride>>1 + ldr r2, [sp] ;load udst_ptr + +recon16x16mb_loop_uv + vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr + vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr + vld1.u8 {q14, q15}, [r0]! + vld1.16 {q10, q11}, [r1]! + + vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d25 + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vld1.16 {q12, q13}, [r1]! + vmovl.u8 q7, d31 + vld1.16 {q14, q15}, [r1]!
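+; Note: the UV pass below repeats the widen/add/narrow pattern of the Y loop
+; above on 8-pixel (d-register) rows; r3 now holds uv_stride, and r2 walks
+; the U destination before being reloaded with the V pointer.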
+ + vadd.s16 q0, q0, q8 ;add Diff data and Pred data together + vadd.s16 q1, q1, q9 + vadd.s16 q2, q2, q10 + vadd.s16 q3, q3, q11 + vadd.s16 q4, q4, q12 + vadd.s16 q5, q5, q13 + vadd.s16 q6, q6, q14 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vadd.s16 q7, q7, q15 + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vst1.u8 {d0}, [r2], r3 ;store result + vqmovun.s16 d4, q4 + vst1.u8 {d1}, [r2], r3 + vqmovun.s16 d5, q5 + vst1.u8 {d2}, [r2], r3 + vqmovun.s16 d6, q6 + vst1.u8 {d3}, [r2], r3 + vqmovun.s16 d7, q7 + vst1.u8 {d4}, [r2], r3 + subs r12, r12, #1 + + vst1.u8 {d5}, [r2], r3 + vst1.u8 {d6}, [r2], r3 + vst1.u8 {d7}, [r2], r3 + + ldrne r2, [sp, #4] ;load vdst_ptr + bne recon16x16mb_loop_uv + + bx lr + + ENDP + END diff --git a/vp9/common/arm/neon/vp9_recon2b_neon.asm b/vp9/common/arm/neon/vp9_recon2b_neon.asm new file mode 100644 index 000000000..99b251c91 --- /dev/null +++ b/vp9/common/arm/neon/vp9_recon2b_neon.asm @@ -0,0 +1,54 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_recon2b_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int stride + +|vp8_recon2b_neon| PROC + vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr + vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr + + vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits + vld1.16 {q6, q7}, [r1]! + vmovl.u8 q1, d17 + vmovl.u8 q2, d18 + vmovl.u8 q3, d19 + + vadd.s16 q0, q0, q4 ;add Diff data and Pred data together + vadd.s16 q1, q1, q5 + vadd.s16 q2, q2, q6 + vadd.s16 q3, q3, q7 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + add r0, r2, r3 + + vst1.u8 {d0}, [r2] ;store result + vst1.u8 {d1}, [r0], r3 + add r2, r0, r3 + vst1.u8 {d2}, [r0] + vst1.u8 {d3}, [r2], r3 + + bx lr + + ENDP + END diff --git a/vp9/common/arm/neon/vp9_recon4b_neon.asm b/vp9/common/arm/neon/vp9_recon4b_neon.asm new file mode 100644 index 000000000..991727746 --- /dev/null +++ b/vp9/common/arm/neon/vp9_recon4b_neon.asm @@ -0,0 +1,69 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_recon4b_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int stride + +|vp8_recon4b_neon| PROC + vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr + vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr + vld1.u8 {q14, q15}, [r0] + vld1.16 {q10, q11}, [r1]! + + vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d25 + vmovl.u8 q2, d26 + vmovl.u8 q3, d27 + vmovl.u8 q4, d28 + vmovl.u8 q5, d29 + vmovl.u8 q6, d30 + vld1.16 {q12, q13}, [r1]!
+ vmovl.u8 q7, d31 + vld1.16 {q14, q15}, [r1] + + vadd.s16 q0, q0, q8 ;add Diff data and Pred data together + vadd.s16 q1, q1, q9 + vadd.s16 q2, q2, q10 + vadd.s16 q3, q3, q11 + vadd.s16 q4, q4, q12 + vadd.s16 q5, q5, q13 + vadd.s16 q6, q6, q14 + vadd.s16 q7, q7, q15 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + vqmovun.s16 d4, q4 + vqmovun.s16 d5, q5 + vqmovun.s16 d6, q6 + vqmovun.s16 d7, q7 + add r0, r2, r3 + + vst1.u8 {q0}, [r2] ;store result + vst1.u8 {q1}, [r0], r3 + add r2, r0, r3 + vst1.u8 {q2}, [r0] + vst1.u8 {q3}, [r2], r3 + + bx lr + + ENDP + END diff --git a/vp9/common/arm/neon/vp9_recon_neon.c b/vp9/common/arm/neon/vp9_recon_neon.c new file mode 100644 index 000000000..cc3f9f59d --- /dev/null +++ b/vp9/common/arm/neon/vp9_recon_neon.c @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "vp9/common/recon.h" +#include "vp9/common/vp9_blockd.h" + +extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr); + +void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd) { + unsigned char *pred_ptr = &xd->predictor[0]; + short *diff_ptr = &xd->diff[0]; + unsigned char *dst_ptr = xd->dst.y_buffer; + unsigned char *udst_ptr = xd->dst.u_buffer; + unsigned char *vdst_ptr = xd->dst.v_buffer; + int ystride = xd->dst.y_stride; + /*int uv_stride = xd->dst.uv_stride;*/ + + vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, + udst_ptr, vdst_ptr); +} diff --git a/vp9/common/arm/neon/vp9_reconb_neon.asm b/vp9/common/arm/neon/vp9_reconb_neon.asm new file mode 100644 index 000000000..288c0ef01 --- /dev/null +++ b/vp9/common/arm/neon/vp9_reconb_neon.asm @@ -0,0 +1,61 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_recon_b_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *pred_ptr, +; r1 short *diff_ptr, +; r2 unsigned char *dst_ptr, +; r3 int stride + +|vp8_recon_b_neon| PROC + mov r12, #16 + + vld1.u8 {d28}, [r0], r12 ;load 4 data/line from pred_ptr + vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr + vld1.u8 {d29}, [r0], r12 + vld1.16 {q11, q12}, [r1]! + vld1.u8 {d30}, [r0], r12 + vld1.16 {q12, q13}, [r1]! 
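+; Note: the diff rows of this 4x4 block sit 16 shorts (32 bytes) apart in the
+; 16-wide macroblock diff buffer, so each {qX, qX+1} load above advances r1 by
+; exactly one row while only the low d register of each pair (d20, d22, d24,
+; d26) carries the four differences actually added below; the overlapping
+; register lists are deliberate.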
+ vld1.u8 {d31}, [r0], r12 + vld1.16 {q13}, [r1] + + vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits + vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6 + vmovl.u8 q2, d30 + vmovl.u8 q3, d31 + + vadd.s16 d0, d0, d20 ;add Diff data and Pred data together + vadd.s16 d2, d2, d22 + vadd.s16 d4, d4, d24 + vadd.s16 d6, d6, d26 + + vqmovun.s16 d0, q0 ;CLAMP() saturation + vqmovun.s16 d1, q1 + vqmovun.s16 d2, q2 + vqmovun.s16 d3, q3 + add r1, r2, r3 + + vst1.32 {d0[0]}, [r2] ;store result + vst1.32 {d1[0]}, [r1], r3 + add r2, r1, r3 + vst1.32 {d2[0]}, [r1] + vst1.32 {d3[0]}, [r2], r3 + + bx lr + + ENDP + END diff --git a/vp9/common/arm/neon/vp9_save_neon_reg.asm b/vp9/common/arm/neon/vp9_save_neon_reg.asm new file mode 100644 index 000000000..71c3e7077 --- /dev/null +++ b/vp9/common/arm/neon/vp9_save_neon_reg.asm @@ -0,0 +1,36 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp9_push_neon| + EXPORT |vp9_pop_neon| + + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vp9_push_neon| PROC + vst1.i64 {d8, d9, d10, d11}, [r0]! + vst1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + +|vp9_pop_neon| PROC + vld1.i64 {d8, d9, d10, d11}, [r0]! + vld1.i64 {d12, d13, d14, d15}, [r0]! + bx lr + + ENDP + + END + diff --git a/vp9/common/arm/neon/vp9_shortidct4x4llm_1_neon.asm b/vp9/common/arm/neon/vp9_shortidct4x4llm_1_neon.asm new file mode 100644 index 000000000..d7bdbae75 --- /dev/null +++ b/vp9/common/arm/neon/vp9_shortidct4x4llm_1_neon.asm @@ -0,0 +1,67 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_short_idct4x4llm_1_neon| + EXPORT |vp8_dc_only_idct_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); +; r0 short *input; +; r1 short *output; +; r2 int pitch; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +|vp8_short_idct4x4llm_1_neon| PROC + vld1.16 {d0[]}, [r0] ;load input[0] + + add r3, r1, r2 + add r12, r3, r2 + + vrshr.s16 d0, d0, #3 + + add r0, r12, r2 + + vst1.16 {d0}, [r1] + vst1.16 {d0}, [r3] + vst1.16 {d0}, [r12] + vst1.16 {d0}, [r0] + + bx lr + ENDP + +;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); +; r0 short input_dc; +; r1 short *output; +; r2 int pitch; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +|vp8_dc_only_idct_neon| PROC + vdup.16 d0, r0 + + add r3, r1, r2 + add r12, r3, r2 + + vrshr.s16 d0, d0, #3 + + add r0, r12, r2 + + vst1.16 {d0}, [r1] + vst1.16 {d0}, [r3] + vst1.16 {d0}, [r12] + vst1.16 {d0}, [r0] + + bx lr + + ENDP + END diff --git a/vp9/common/arm/neon/vp9_shortidct4x4llm_neon.asm b/vp9/common/arm/neon/vp9_shortidct4x4llm_neon.asm new file mode 100644 index 000000000..b74c31521 --- /dev/null +++ b/vp9/common/arm/neon/vp9_shortidct4x4llm_neon.asm @@ -0,0 +1,122 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_short_idct4x4llm_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +;************************************************************* +;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) +;r0 short * input +;r1 short * output +;r2 int pitch +;************************************************************* +;static const int cospi8sqrt2minus1=20091; +;static const int sinpi8sqrt2 =35468; +;static const int rounding = 0; +;Optimization note: The data resulting from dequantization are signed 13-bit values in the +;range [-4096, 4095]. This makes the NEON "vqdmulh" instruction safe to use, since the +;product cannot overflow (13+16+1 = 30 bits < 32 bits). vqdmulh returns the high half of +;the multiplication, which is exactly what the IDCT needs.
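+; As a scalar C sketch (illustrative only; the helper name is hypothetical),
+; the vqdmulh/vshr/vqadd sequence used below for the sinpi8sqrt2 tap computes:
+;
+;     #include <stdint.h>
+;     static int16_t mul_sinpi8sqrt2(int16_t x) {   /* |x| <= 4095 */
+;       /* 35468 wraps to -30068 (0x8a8c) in s16; vqdmulh.s16 returns
+;          (x * -30068 * 2) >> 16, with no saturation at this input range */
+;       int16_t hi = (int16_t)(((int32_t)x * -30068 * 2) >> 16);
+;       /* vshr #1 undoes the doubling; adding x back cancels the -65536
+;          wrap, leaving (x * 35468) >> 16 */
+;       return (int16_t)((hi >> 1) + x);
+;     }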
+ +|vp8_short_idct4x4llm_neon| PROC + adr r12, idct_coeff + vld1.16 {q1, q2}, [r0] + vld1.16 {d0}, [r12] + + vswp d3, d4 ;q2(vp[4] vp[12]) + + vqdmulh.s16 q3, q2, d0[2] + vqdmulh.s16 q4, q2, d0[0] + + vqadd.s16 d12, d2, d3 ;a1 + vqsub.s16 d13, d2, d3 ;b1 + + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number) + vqadd.s16 q4, q4, q2 + + ;d6 - c1:temp1 + ;d7 - d1:temp2 + ;d8 - d1:temp1 + ;d9 - c1:temp2 + + vqsub.s16 d10, d6, d9 ;c1 + vqadd.s16 d11, d7, d8 ;d1 + + vqadd.s16 d2, d12, d11 + vqadd.s16 d3, d13, d10 + vqsub.s16 d4, d13, d10 + vqsub.s16 d5, d12, d11 + + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vswp d3, d4 + + vqdmulh.s16 q3, q2, d0[2] + vqdmulh.s16 q4, q2, d0[0] + + vqadd.s16 d12, d2, d3 ;a1 + vqsub.s16 d13, d2, d3 ;b1 + + vshr.s16 q3, q3, #1 + vshr.s16 q4, q4, #1 + + vqadd.s16 q3, q3, q2 ;modify since sinpi8sqrt2 > 65536/2 (negative number) + vqadd.s16 q4, q4, q2 + + vqsub.s16 d10, d6, d9 ;c1 + vqadd.s16 d11, d7, d8 ;d1 + + vqadd.s16 d2, d12, d11 + vqadd.s16 d3, d13, d10 + vqsub.s16 d4, d13, d10 + vqsub.s16 d5, d12, d11 + + vrshr.s16 d2, d2, #3 + vrshr.s16 d3, d3, #3 + vrshr.s16 d4, d4, #3 + vrshr.s16 d5, d5, #3 + + add r3, r1, r2 + add r12, r3, r2 + add r0, r12, r2 + + vtrn.32 d2, d4 + vtrn.32 d3, d5 + vtrn.16 d2, d3 + vtrn.16 d4, d5 + + vst1.16 {d2}, [r1] + vst1.16 {d3}, [r3] + vst1.16 {d4}, [r12] + vst1.16 {d5}, [r0] + + bx lr + + ENDP + +;----------------- + +idct_coeff + DCD 0x4e7b4e7b, 0x8a8c8a8c + +;20091, 20091, 35468, 35468 + + END diff --git a/vp9/common/arm/neon/vp9_sixtappredict16x16_neon.asm b/vp9/common/arm/neon/vp9_sixtappredict16x16_neon.asm new file mode 100644 index 000000000..5e83f49f5 --- /dev/null +++ b/vp9/common/arm/neon/vp9_sixtappredict16x16_neon.asm @@ -0,0 +1,490 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict16x16_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +filter16_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +;Note: To take advantage of the 8-bit multiplication instruction in NEON, first apply abs() to +; the filter coeffs to make them u8, then use vmlsl for the negative coeffs. After multiplication +; the result can be negative, so it is treated as s16. Since the result can also be a large +; positive number (> 2^15-1) that would be misread as a negative one, the filter coeffs are +; applied in the order 0, 1, 4, 5, 2, which keeps the running sum within s16 range; the product +; of the 3rd filter coeff is then folded in last with a saturating add. The same applies to the +; other filter functions.
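+; As a scalar C sketch of the ordering described above (illustrative only;
+; the helper name and the k[] layout, i.e. abs() of the six taps with taps
+; 1 and 4 subtracted, are assumptions):
+;
+;     #include <stdint.h>
+;     static uint8_t sixtap_pixel(const uint8_t *src, const uint8_t k[6]) {
+;       int16_t sum = (int16_t)(src[-2] * k[0]);  /* tap 0 */
+;       sum -= src[-1] * k[1];                    /* tap 1 (vmlsl) */
+;       sum -= src[ 2] * k[4];                    /* tap 4 (vmlsl) */
+;       sum += src[ 3] * k[5];                    /* tap 5 */
+;       sum += src[ 0] * k[2];                    /* tap 2: sum still in s16 */
+;       int32_t t = sum + src[1] * k[3];          /* tap 3 last, as vqadd.s16 */
+;       if (t > 32767) t = 32767;                 /* saturating add */
+;       t = (t + 64) >> 7;                        /* vqrshrun.s16 #7 */
+;       return (uint8_t)(t < 0 ? 0 : (t > 255 ? 255 : t));
+;     }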
+ +|vp8_sixtap_predict16x16_neon| PROC + push {r4-r5, lr} + + adr r12, filter16_coeff + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter16x16_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter16x16_only + + sub sp, sp, #336 ;reserve space on stack for temporary storage + mov lr, sp + + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #7 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First Pass: output_height lines x output_width columns (21x16) +filt_blk2d_fp16x16_loop_neon + vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data + vld1.u8 {d9, d10, d11}, [r0], r1 + vld1.u8 {d12, d13, d14}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q9, d7, d0 + vmull.u8 q10, d9, d0 + vmull.u8 q11, d10, d0 + vmull.u8 q12, d12, d0 + vmull.u8 q13, d13, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d9, d10, #1 + vext.8 d30, d12, d13, #1 + + vmlsl.u8 q8, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q12, d30, d1 + + vext.8 d28, d7, d8, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d13, d14, #1 + + vmlsl.u8 q9, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q11, d29, d1 + vmlsl.u8 q13, d30, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d9, d10, #4 + vext.8 d30, d12, d13, #4 + + vmlsl.u8 q8, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q12, d30, d4 + + vext.8 d28, d7, d8, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d13, d14, #4 + + vmlsl.u8 q9, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q11, d29, d4 + vmlsl.u8 q13, d30, d4 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d9, d10, #5 + vext.8 d30, d12, d13, #5 + + vmlal.u8 q8, d28, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q10, d29, d5 + vmlal.u8 q12, d30, d5 + + vext.8 d28, d7, d8, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d13, d14, #5 + + vmlal.u8 q9, d28, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q11, d29, d5 + vmlal.u8 q13, d30, d5 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d9, d10, #2 + vext.8 d30, d12, d13, #2 + + vmlal.u8 q8, d28, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q10, d29, d2 + vmlal.u8 q12, d30, d2 + + vext.8 d28, d7, d8, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d13, d14, #2 + + vmlal.u8 q9, d28, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q11, d29, d2 + vmlal.u8 q13, d30, d2 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d9, d10, #3 + vext.8 d30, d12, d13, #3 + + vext.8 d15, d7, d8, #3 + vext.8 d31, d10, d11, #3 + vext.8 d6, d13, d14, #3 + + vmull.u8 q4, d28, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + + vqadd.s16 q8, q4 ;sum of all (src_data*filter_parameters) + vqadd.s16 q10, q5 + vqadd.s16 q12, q6 + + vmull.u8 q6, d15, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q7, d31, d3 + vmull.u8 q3, d6, d3 + + subs r2, r2, #1 + + vqadd.s16 q9, q6 + vqadd.s16 q11, q7 + vqadd.s16 q13, q3 + + vqrshrun.s16 d6, q8, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q9, #7 + vqrshrun.s16 d8, q10, #7 + vqrshrun.s16 d9, q11, 
#7 + vqrshrun.s16 d10, q12, #7 + vqrshrun.s16 d11, q13, #7 + + vst1.u8 {d6, d7, d8}, [lr]! ;store result + vst1.u8 {d9, d10, d11}, [lr]! + + bne filt_blk2d_fp16x16_loop_neon + +;Second pass: 16x16 +;secondpass_filter - do first 8-columns and then second 8-columns + add r3, r12, r3, lsl #5 + sub lr, lr, #336 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + mov r3, #2 ;loop counter + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + mov r2, #16 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_sp16x16_outloop_neon + vld1.u8 {d18}, [lr], r2 ;load src data + vld1.u8 {d19}, [lr], r2 + vld1.u8 {d20}, [lr], r2 + vld1.u8 {d21}, [lr], r2 + mov r12, #4 ;loop counter + vld1.u8 {d22}, [lr], r2 + +secondpass_inner_loop_neon + vld1.u8 {d23}, [lr], r2 ;load src data + vld1.u8 {d24}, [lr], r2 + vld1.u8 {d25}, [lr], r2 + vld1.u8 {d26}, [lr], r2 + + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r12, r12, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vmov q9, q11 + vst1.u8 {d7}, [r4], r5 + vmov q10, q12 + vst1.u8 {d8}, [r4], r5 + vmov d22, d26 + vst1.u8 {d9}, [r4], r5 + + bne secondpass_inner_loop_neon + + subs r3, r3, #1 + sub lr, lr, #336 + add lr, lr, #8 + + sub r4, r4, r5, lsl #4 + add r4, r4, #8 + + bne filt_blk2d_sp16x16_outloop_neon + + add sp, sp, #336 + pop {r4-r5,pc} + +;-------------------- +firstpass_filter16x16_only + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #8 ;loop counter + sub r0, r0, #2 ;move srcptr back to (column-2) + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First Pass: output_height lines x output_width columns (16x16) +filt_blk2d_fpo16x16_loop_neon + vld1.u8 {d6, d7, d8}, [r0], r1 ;load src data + vld1.u8 {d9, d10, d11}, [r0], r1 + + pld [r0] + pld [r0, r1] + + vmull.u8 q6, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q7, d7, d0 + vmull.u8 q8, d9, d0 + vmull.u8 q9, d10, d0 + + vext.8 d20, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d21, d9, d10, #1 + vext.8 d22, d7, d8, #1 + vext.8 d23, d10, d11, #1 + vext.8 d24, d6, d7, #4 ;construct src_ptr[2] + vext.8 d25, d9, d10, #4 + vext.8 d26, d7, d8, #4 + vext.8 d27, d10, d11, #4 + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d9, d10, #5 + + vmlsl.u8 q6, d20, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q8, d21, d1 + vmlsl.u8 q7, d22, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q9, d23, d1 + vmlsl.u8 q6, d24, d4 ;-(src_ptr[2] * vp9_filter[4]) + 
vmlsl.u8 q8, d25, d4 + vmlsl.u8 q7, d26, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q9, d27, d4 + vmlal.u8 q6, d28, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q8, d29, d5 + + vext.8 d20, d7, d8, #5 + vext.8 d21, d10, d11, #5 + vext.8 d22, d6, d7, #2 ;construct src_ptr[0] + vext.8 d23, d9, d10, #2 + vext.8 d24, d7, d8, #2 + vext.8 d25, d10, d11, #2 + + vext.8 d26, d6, d7, #3 ;construct src_ptr[1] + vext.8 d27, d9, d10, #3 + vext.8 d28, d7, d8, #3 + vext.8 d29, d10, d11, #3 + + vmlal.u8 q7, d20, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q9, d21, d5 + vmlal.u8 q6, d22, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q8, d23, d2 + vmlal.u8 q7, d24, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q9, d25, d2 + + vmull.u8 q10, d26, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q11, d27, d3 + vmull.u8 q12, d28, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q15, d29, d3 + + vqadd.s16 q6, q10 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q11 + vqadd.s16 q7, q12 + vqadd.s16 q9, q15 + + subs r2, r2, #1 + + vqrshrun.s16 d6, q6, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q7, #7 + vqrshrun.s16 d8, q8, #7 + vqrshrun.s16 d9, q9, #7 + + vst1.u8 {q3}, [r4], r5 ;store result + vst1.u8 {q4}, [r4], r5 + + bne filt_blk2d_fpo16x16_loop_neon + + pop {r4-r5,pc} + +;-------------------- +secondpass_filter16x16_only +;Second pass: 16x16 + add r3, r12, r3, lsl #5 + sub r0, r0, r1, lsl #1 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + mov r3, #2 ;loop counter + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_spo16x16_outloop_neon + vld1.u8 {d18}, [r0], r1 ;load src data + vld1.u8 {d19}, [r0], r1 + vld1.u8 {d20}, [r0], r1 + vld1.u8 {d21}, [r0], r1 + mov r12, #4 ;loop counter + vld1.u8 {d22}, [r0], r1 + +secondpass_only_inner_loop_neon + vld1.u8 {d23}, [r0], r1 ;load src data + vld1.u8 {d24}, [r0], r1 + vld1.u8 {d25}, [r0], r1 + vld1.u8 {d26}, [r0], r1 + + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r12, r12, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vmov q9, q11 + vst1.u8 {d7}, [r4], r5 + vmov q10, q12 + vst1.u8 {d8}, [r4], r5 + vmov d22, d26 + vst1.u8 {d9}, [r4], r5 + + bne secondpass_only_inner_loop_neon + + subs r3, r3, #1 + sub r0, r0, r1, lsl #4 + sub r0, r0, r1, lsl #2 + sub r0, r0, r1 + add r0, r0, #8 + + sub r4, r4, r5, lsl #4 + add r4, r4, #8 + + bne filt_blk2d_spo16x16_outloop_neon + + pop {r4-r5,pc} + + ENDP + +;----------------- + END diff --git 
a/vp9/common/arm/neon/vp9_sixtappredict4x4_neon.asm b/vp9/common/arm/neon/vp9_sixtappredict4x4_neon.asm new file mode 100644 index 000000000..5966b642f --- /dev/null +++ b/vp9/common/arm/neon/vp9_sixtappredict4x4_neon.asm @@ -0,0 +1,422 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +filter4_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(lr) int dst_pitch + +|vp8_sixtap_predict_neon| PROC + push {r4, lr} + + adr r12, filter4_coeff + ldr r4, [sp, #8] ;load parameters from stack + ldr lr, [sp, #12] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter4x4_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter4x4_only + + vabs.s32 q12, q14 ;get abs(filer_parameters) + vabs.s32 q13, q15 + + sub r0, r0, #2 ;go back 2 columns of src data + sub r0, r0, r1, lsl #1 ;go back 2 lines of src data + +;First pass: output_height lines x output_width columns (9x4) + vld1.u8 {q3}, [r0], r1 ;load first 4-line src data + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5]) + vmull.u8 q8, d20, d5 + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0]) + vmlal.u8 q8, d10, d0 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q8, d20, d1 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q8, d10, d4 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2]) + 
vmlal.u8 q8, d20, d2 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q10, d10, d3 + + vld1.u8 {q3}, [r0], r1 ;load rest 5-line src data + vld1.u8 {q4}, [r0], r1 + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + + vld1.u8 {q5}, [r0], r1 + vld1.u8 {q6}, [r0], r1 + + vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d28, q8, #7 + + ;First Pass on rest 5-line data + vld1.u8 {q11}, [r0], r1 + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vext.8 d31, d22, d23, #5 ;construct src_ptr[3] + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5]) + vmull.u8 q8, d20, d5 + vmull.u8 q12, d31, d5 ;(src_ptr[3] * vp9_filter[5]) + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0]) + vmlal.u8 q8, d10, d0 + vmlal.u8 q12, d22, d0 ;(src_ptr[-2] * vp9_filter[0]) + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vext.8 d31, d22, d23, #1 ;construct src_ptr[-1] + + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q8, d20, d1 + vmlsl.u8 q12, d31, d1 ;-(src_ptr[-1] * vp9_filter[1]) + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vext.8 d31, d22, d23, #4 ;construct src_ptr[2] + + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q8, d10, d4 + vmlsl.u8 q12, d31, d4 ;-(src_ptr[2] * vp9_filter[4]) + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vext.8 d31, d22, d23, #2 ;construct src_ptr[0] + + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q8, d20, d2 + vmlal.u8 q12, d31, d2 ;(src_ptr[0] * vp9_filter[2]) + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vext.8 d31, d22, d23, #3 ;construct src_ptr[1] + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q10, d10, d3 + vmull.u8 q11, d31, d3 ;(src_ptr[1] * vp9_filter[3]) + + add r3, r12, r3, lsl #5 + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + vqadd.s16 q12, q11 + + vext.8 d23, d27, d28, #4 + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + + vqrshrun.s16 d29, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d30, q8, #7 + vqrshrun.s16 d31, q12, #7 + +;Second pass: 4x4 + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vext.8 d24, d28, d29, #4 + vext.8 d25, d29, d30, #4 + vext.8 d26, d30, d31, #4 + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + + vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q4, d28, d0 + + vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5]) + vmull.u8 q6, d26, d5 + + vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q4, d30, d4 + + vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) + 
vmlsl.u8 q6, d24, d1 + + vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q4, d29, d2 + + vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3]) + vmlal.u8 q6, d25, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q6, q4 + + vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d4, q6, #7 + + vst1.32 {d3[0]}, [r4] ;store result + vst1.32 {d3[1]}, [r0] + vst1.32 {d4[0]}, [r1] + vst1.32 {d4[1]}, [r2] + + pop {r4, pc} + + +;--------------------- +firstpass_filter4x4_only + vabs.s32 q12, q14 ;get abs(filer_parameters) + vabs.s32 q13, q15 + + sub r0, r0, #2 ;go back 2 columns of src data + +;First pass: output_height lines x output_width columns (4x4) + vld1.u8 {q3}, [r0], r1 ;load first 4-line src data + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + + vext.8 d18, d6, d7, #5 ;construct src_ptr[3] + vext.8 d19, d8, d9, #5 + vext.8 d20, d10, d11, #5 + vext.8 d21, d12, d13, #5 + + vswp d7, d8 ;discard 2nd half data after src_ptr[3] is done + vswp d11, d12 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[3]) + vzip.32 d20, d21 + vmull.u8 q7, d18, d5 ;(src_ptr[3] * vp9_filter[5]) + vmull.u8 q8, d20, d5 + + vmov q4, q3 ;keep original src data in q4 q6 + vmov q6, q5 + + vzip.32 d6, d7 ;construct src_ptr[-2], and put 2-line data together + vzip.32 d10, d11 + vshr.u64 q9, q4, #8 ;construct src_ptr[-1] + vshr.u64 q10, q6, #8 + vmlal.u8 q7, d6, d0 ;+(src_ptr[-2] * vp9_filter[0]) + vmlal.u8 q8, d10, d0 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[-1]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #32 ;construct src_ptr[2] + vshr.u64 q5, q6, #32 + vmlsl.u8 q7, d18, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q8, d20, d1 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[2]) + vzip.32 d10, d11 + vshr.u64 q9, q4, #16 ;construct src_ptr[0] + vshr.u64 q10, q6, #16 + vmlsl.u8 q7, d6, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q8, d10, d4 + + vzip.32 d18, d19 ;put 2-line data in 1 register (src_ptr[0]) + vzip.32 d20, d21 + vshr.u64 q3, q4, #24 ;construct src_ptr[1] + vshr.u64 q5, q6, #24 + vmlal.u8 q7, d18, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q8, d20, d2 + + vzip.32 d6, d7 ;put 2-line data in 1 register (src_ptr[1]) + vzip.32 d10, d11 + vmull.u8 q9, d6, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q10, d10, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q7, q9 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q10 + + vqrshrun.s16 d27, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d28, q8, #7 + + vst1.32 {d27[0]}, [r4] ;store result + vst1.32 {d27[1]}, [r0] + vst1.32 {d28[0]}, [r1] + vst1.32 {d28[1]}, [r2] + + pop {r4, pc} + + +;--------------------- +secondpass_filter4x4_only + sub r0, r0, r1, lsl #1 + add r3, r12, r3, lsl #5 + + vld1.32 {d27[0]}, [r0], r1 ;load src data + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.32 {d27[1]}, [r0], r1 + vabs.s32 q7, q5 + vld1.32 {d28[0]}, [r0], r1 + vabs.s32 q8, q6 + vld1.32 {d28[1]}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.32 {d29[0]}, [r0], r1 + vdup.8 d1, d14[4] + vld1.32 {d29[1]}, [r0], r1 + vdup.8 d2, d15[0] + vld1.32 {d30[0]}, [r0], r1 + vdup.8 d3, d15[4] + vld1.32 {d30[1]}, [r0], r1 + vdup.8 d4, d16[0] + vld1.32 {d31[0]}, [r0], r1 + vdup.8 d5, d16[4] + + vext.8 d23, d27, d28, 
#4 + vext.8 d24, d28, d29, #4 + vext.8 d25, d29, d30, #4 + vext.8 d26, d30, d31, #4 + + vmull.u8 q3, d27, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q4, d28, d0 + + vmull.u8 q5, d25, d5 ;(src_ptr[3] * vp9_filter[5]) + vmull.u8 q6, d26, d5 + + vmlsl.u8 q3, d29, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q4, d30, d4 + + vmlsl.u8 q5, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q6, d24, d1 + + vmlal.u8 q3, d28, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q4, d29, d2 + + vmlal.u8 q5, d24, d3 ;(src_ptr[1] * vp9_filter[3]) + vmlal.u8 q6, d25, d3 + + add r0, r4, lr + add r1, r0, lr + add r2, r1, lr + + vqadd.s16 q5, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q6, q4 + + vqrshrun.s16 d3, q5, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d4, q6, #7 + + vst1.32 {d3[0]}, [r4] ;store result + vst1.32 {d3[1]}, [r0] + vst1.32 {d4[0]}, [r1] + vst1.32 {d4[1]}, [r2] + + pop {r4, pc} + + ENDP + +;----------------- + + END diff --git a/vp9/common/arm/neon/vp9_sixtappredict8x4_neon.asm b/vp9/common/arm/neon/vp9_sixtappredict8x4_neon.asm new file mode 100644 index 000000000..9ce1e3bbd --- /dev/null +++ b/vp9/common/arm/neon/vp9_sixtappredict8x4_neon.asm @@ -0,0 +1,473 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_sixtap_predict8x4_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +filter8_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; r4 unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_sixtap_predict8x4_neon| PROC + push {r4-r5, lr} + + adr r12, filter8_coeff + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter8x4_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter8x4_only + + sub sp, sp, #32 ;reserve space on stack for temporary storage + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + mov lr, sp + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + +;First pass: output_height lines x output_width columns (9x8) + vld1.u8 {q3}, [r0], r1 ;load src data + vdup.8 d3, d25[4] + vld1.u8 {q4}, [r0], r1 + vdup.8 d4, d26[0] + vld1.u8 {q5}, [r0], r1 + vdup.8 d5, d26[4] + vld1.u8 {q6}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, 
d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vld1.u8 {q3}, [r0], r1 ;load src data + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vld1.u8 {q4}, [r0], r1 + vst1.u8 {d22}, [lr]! ;store result + vld1.u8 {q5}, [r0], r1 + vst1.u8 {d23}, [lr]! + vld1.u8 {q6}, [r0], r1 + vst1.u8 {d24}, [lr]! + vld1.u8 {q7}, [r0], r1 + vst1.u8 {d25}, [lr]! + + ;first_pass filtering on the rest 5-line data + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + vmull.u8 q11, d12, d0 + vmull.u8 q12, d14, d0 + + vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d28, d8, d9, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d12, d13, #1 + vext.8 d31, d14, d15, #1 + + vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q9, d28, d1 + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q11, d30, d1 + vmlsl.u8 q12, d31, d1 + + vext.8 d27, d6, d7, #4 ;construct src_ptr[2] + vext.8 d28, d8, d9, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d12, d13, #4 + vext.8 d31, d14, d15, #4 + + vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q9, d28, d4 + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q11, d30, d4 + vmlsl.u8 q12, d31, d4 + + vext.8 d27, d6, d7, #2 ;construct src_ptr[0] + vext.8 d28, d8, d9, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d12, d13, #2 + vext.8 d31, d14, d15, #2 + + vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q9, d28, d2 + vmlal.u8 q10, d29, d2 + vmlal.u8 q11, d30, d2 + vmlal.u8 q12, d31, d2 + + vext.8 d27, d6, d7, #5 ;construct src_ptr[3] + vext.8 d28, d8, d9, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d12, d13, #5 + vext.8 d31, d14, d15, #5 + + vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q9, d28, d5 + vmlal.u8 q10, d29, d5 + vmlal.u8 q11, d30, d5 + vmlal.u8 q12, d31, d5 + + vext.8 d27, d6, d7, #3 ;construct src_ptr[1] + vext.8 d28, d8, d9, #3 + vext.8 d29, d10, d11, #3 + vext.8 d30, d12, d13, #3 + vext.8 d31, d14, d15, #3 + + vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q4, d28, d3 + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + vmull.u8 q7, d31, d3 + + vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q9, q4 + vqadd.s16 q10, q5 + vqadd.s16 q11, q6 + vqadd.s16 q12, q7 + + vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d27, q9, #7 + vqrshrun.s16 d28, q10, #7 + vqrshrun.s16 d29, q11, #7 
;load intermediate data from stack + vqrshrun.s16 d30, q12, #7 + +;Second pass: 8x4 +;secondpass_filter + add r3, r12, r3, lsl #5 + sub lr, lr, #32 + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.u8 {q11}, [lr]! + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vld1.u8 {q12}, [lr]! + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + + vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q4, d23, d0 + vmull.u8 q5, d24, d0 + vmull.u8 q6, d25, d0 + + vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q4, d24, d1 + vmlsl.u8 q5, d25, d1 + vmlsl.u8 q6, d26, d1 + + vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q4, d27, d4 + vmlsl.u8 q5, d28, d4 + vmlsl.u8 q6, d29, d4 + + vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q4, d25, d2 + vmlal.u8 q5, d26, d2 + vmlal.u8 q6, d27, d2 + + vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q4, d28, d5 + vmlal.u8 q5, d29, d5 + vmlal.u8 q6, d30, d5 + + vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q8, d26, d3 + vmull.u8 q9, d27, d3 + vmull.u8 q10, d28, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vst1.u8 {d7}, [r4], r5 + vst1.u8 {d8}, [r4], r5 + vst1.u8 {d9}, [r4], r5 + + add sp, sp, #32 + pop {r4-r5,pc} + +;-------------------- +firstpass_filter8x4_only + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + vld1.u8 {q3}, [r0], r1 ;load src data + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vld1.u8 {q4}, [r0], r1 + vdup.8 d1, d24[4] + vld1.u8 {q5}, [r0], r1 + vdup.8 d2, d25[0] + vld1.u8 {q6}, [r0], r1 + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First pass: output_height lines x output_width columns (4x8) + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + vqadd.s16 q7, q3 
;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [r4], r5 ;store result + vst1.u8 {d23}, [r4], r5 + vst1.u8 {d24}, [r4], r5 + vst1.u8 {d25}, [r4], r5 + + pop {r4-r5,pc} + +;--------------------- +secondpass_filter8x4_only +;Second pass: 8x4 + add r3, r12, r3, lsl #5 + sub r0, r0, r1, lsl #1 + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vld1.u8 {d22}, [r0], r1 + vld1.u8 {d23}, [r0], r1 + vld1.u8 {d24}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.u8 {d25}, [r0], r1 + vdup.8 d1, d14[4] + vld1.u8 {d26}, [r0], r1 + vdup.8 d2, d15[0] + vld1.u8 {d27}, [r0], r1 + vdup.8 d3, d15[4] + vld1.u8 {d28}, [r0], r1 + vdup.8 d4, d16[0] + vld1.u8 {d29}, [r0], r1 + vdup.8 d5, d16[4] + vld1.u8 {d30}, [r0], r1 + + vmull.u8 q3, d22, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q4, d23, d0 + vmull.u8 q5, d24, d0 + vmull.u8 q6, d25, d0 + + vmlsl.u8 q3, d23, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q4, d24, d1 + vmlsl.u8 q5, d25, d1 + vmlsl.u8 q6, d26, d1 + + vmlsl.u8 q3, d26, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q4, d27, d4 + vmlsl.u8 q5, d28, d4 + vmlsl.u8 q6, d29, d4 + + vmlal.u8 q3, d24, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q4, d25, d2 + vmlal.u8 q5, d26, d2 + vmlal.u8 q6, d27, d2 + + vmlal.u8 q3, d27, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q4, d28, d5 + vmlal.u8 q5, d29, d5 + vmlal.u8 q6, d30, d5 + + vmull.u8 q7, d25, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q8, d26, d3 + vmull.u8 q9, d27, d3 + vmull.u8 q10, d28, d3 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vst1.u8 {d6}, [r4], r5 ;store result + vst1.u8 {d7}, [r4], r5 + vst1.u8 {d8}, [r4], r5 + vst1.u8 {d9}, [r4], r5 + + pop {r4-r5,pc} + + ENDP + +;----------------- + + END diff --git a/vp9/common/arm/neon/vp9_sixtappredict8x8_neon.asm b/vp9/common/arm/neon/vp9_sixtappredict8x8_neon.asm new file mode 100644 index 000000000..5ff16616d --- /dev/null +++ b/vp9/common/arm/neon/vp9_sixtappredict8x8_neon.asm @@ -0,0 +1,524 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + EXPORT |vp8_sixtap_predict8x8_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +filter8_coeff + DCD 0, 0, 128, 0, 0, 0, 0, 0 + DCD 0, -6, 123, 12, -1, 0, 0, 0 + DCD 2, -11, 108, 36, -8, 1, 0, 0 + DCD 0, -9, 93, 50, -6, 0, 0, 0 + DCD 3, -16, 77, 77, -16, 3, 0, 0 + DCD 0, -6, 50, 93, -9, 0, 0, 0 + DCD 1, -8, 36, 108, -11, 2, 0, 0 + DCD 0, -1, 12, 123, -6, 0, 0, 0 + +; r0 unsigned char *src_ptr, +; r1 int src_pixels_per_line, +; r2 int xoffset, +; r3 int yoffset, +; stack(r4) unsigned char *dst_ptr, +; stack(r5) int dst_pitch + +|vp8_sixtap_predict8x8_neon| PROC + push {r4-r5, lr} + + adr r12, filter8_coeff + + ldr r4, [sp, #12] ;load parameters from stack + ldr r5, [sp, #16] ;load parameters from stack + + cmp r2, #0 ;skip first_pass filter if xoffset=0 + beq secondpass_filter8x8_only + + add r2, r12, r2, lsl #5 ;calculate filter location + + cmp r3, #0 ;skip second_pass filter if yoffset=0 + + vld1.s32 {q14, q15}, [r2] ;load first_pass filter + + beq firstpass_filter8x8_only + + sub sp, sp, #64 ;reserve space on stack for temporary storage + mov lr, sp + + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #2 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + sub r0, r0, r1, lsl #1 + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + +;First pass: output_height lines x output_width columns (13x8) + vld1.u8 {q3}, [r0], r1 ;load src data + vdup.8 d3, d25[4] + vld1.u8 {q4}, [r0], r1 + vdup.8 d4, d26[0] + vld1.u8 {q5}, [r0], r1 + vdup.8 d5, d26[4] + vld1.u8 {q6}, [r0], r1 + +filt_blk2d_fp8x8_loop_neon + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + + subs r2, r2, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vld1.u8 {q3}, [r0], r1 ;load src data + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [lr]! ;store result + vld1.u8 {q4}, [r0], r1 + vst1.u8 {d23}, [lr]! + vld1.u8 {q5}, [r0], r1 + vst1.u8 {d24}, [lr]! 
+ vld1.u8 {q6}, [r0], r1 + vst1.u8 {d25}, [lr]! + + bne filt_blk2d_fp8x8_loop_neon + + ;first_pass filtering on the rest 5-line data + ;vld1.u8 {q3}, [r0], r1 ;load src data + ;vld1.u8 {q4}, [r0], r1 + ;vld1.u8 {q5}, [r0], r1 + ;vld1.u8 {q6}, [r0], r1 + vld1.u8 {q7}, [r0], r1 + + vmull.u8 q8, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q9, d8, d0 + vmull.u8 q10, d10, d0 + vmull.u8 q11, d12, d0 + vmull.u8 q12, d14, d0 + + vext.8 d27, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d28, d8, d9, #1 + vext.8 d29, d10, d11, #1 + vext.8 d30, d12, d13, #1 + vext.8 d31, d14, d15, #1 + + vmlsl.u8 q8, d27, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q9, d28, d1 + vmlsl.u8 q10, d29, d1 + vmlsl.u8 q11, d30, d1 + vmlsl.u8 q12, d31, d1 + + vext.8 d27, d6, d7, #4 ;construct src_ptr[2] + vext.8 d28, d8, d9, #4 + vext.8 d29, d10, d11, #4 + vext.8 d30, d12, d13, #4 + vext.8 d31, d14, d15, #4 + + vmlsl.u8 q8, d27, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q9, d28, d4 + vmlsl.u8 q10, d29, d4 + vmlsl.u8 q11, d30, d4 + vmlsl.u8 q12, d31, d4 + + vext.8 d27, d6, d7, #2 ;construct src_ptr[0] + vext.8 d28, d8, d9, #2 + vext.8 d29, d10, d11, #2 + vext.8 d30, d12, d13, #2 + vext.8 d31, d14, d15, #2 + + vmlal.u8 q8, d27, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q9, d28, d2 + vmlal.u8 q10, d29, d2 + vmlal.u8 q11, d30, d2 + vmlal.u8 q12, d31, d2 + + vext.8 d27, d6, d7, #5 ;construct src_ptr[3] + vext.8 d28, d8, d9, #5 + vext.8 d29, d10, d11, #5 + vext.8 d30, d12, d13, #5 + vext.8 d31, d14, d15, #5 + + vmlal.u8 q8, d27, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q9, d28, d5 + vmlal.u8 q10, d29, d5 + vmlal.u8 q11, d30, d5 + vmlal.u8 q12, d31, d5 + + vext.8 d27, d6, d7, #3 ;construct src_ptr[1] + vext.8 d28, d8, d9, #3 + vext.8 d29, d10, d11, #3 + vext.8 d30, d12, d13, #3 + vext.8 d31, d14, d15, #3 + + vmull.u8 q3, d27, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q4, d28, d3 + vmull.u8 q5, d29, d3 + vmull.u8 q6, d30, d3 + vmull.u8 q7, d31, d3 + + vqadd.s16 q8, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q9, q4 + vqadd.s16 q10, q5 + vqadd.s16 q11, q6 + vqadd.s16 q12, q7 + + add r3, r12, r3, lsl #5 + + vqrshrun.s16 d26, q8, #7 ;shift/round/saturate to u8 + sub lr, lr, #64 + vqrshrun.s16 d27, q9, #7 + vld1.u8 {q9}, [lr]! ;load intermediate data from stack + vqrshrun.s16 d28, q10, #7 + vld1.u8 {q10}, [lr]! + + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + + vqrshrun.s16 d29, q11, #7 + vld1.u8 {q11}, [lr]! + + vabs.s32 q7, q5 + vabs.s32 q8, q6 + + vqrshrun.s16 d30, q12, #7 + vld1.u8 {q12}, [lr]! 
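
Up to this point vp8_sixtap_predict8x8_neon has run the first of its two separable passes: thirteen source rows (eight output rows plus the five extra rows a 6-tap kernel spans) are filtered horizontally, narrowed back to bytes with vqrshrun.s16 #7, i.e. (x + 64) >> 7 saturated to [0, 255], and staged: eight rows go to the 64-byte stack scratch area and the last five stay in registers. The second pass below applies the vertical taps to those bytes. For orientation, here is the same arithmetic in scalar C. This is a sketch only: round_q7, sixtap_pass and sixtap_predict8x8_c_sketch are illustrative names, not functions in this patch, the taps stand for the signed Q7 rows of filter8_coeff (which the assembly splits into vmlal/vmlsl on their absolute values), and the sketch uses a wide accumulator where the assembly relies on saturating 16-bit vqadd.

#include <stdint.h>

/* (x + 64) >> 7, clamped to [0, 255]: the vqrshrun.s16 #7 step. */
static uint8_t round_q7(int32_t x) {
  int32_t v = (x + 64) >> 7;
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* One 1-D 6-tap pass; `step` is 1 for the horizontal pass and the row
   stride for the vertical pass, so the taps cover src[-2*step .. 3*step]. */
static void sixtap_pass(const uint8_t *src, int stride, int step,
                        uint8_t *dst, int dst_stride,
                        int rows, int cols, const int16_t taps[6]) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      const uint8_t *p = src + r * stride + c - 2 * step;
      int32_t sum = 0;
      for (int k = 0; k < 6; ++k)
        sum += taps[k] * p[k * step];
      dst[r * dst_stride + c] = round_q7(sum);
    }
  }
}

/* 8x8 two-pass driver: 13 rows filtered horizontally into a temp block,
   then 8 rows filtered vertically out of it. */
static void sixtap_predict8x8_c_sketch(const uint8_t *src, int src_stride,
                                       const int16_t hf[6],
                                       const int16_t vf[6],
                                       uint8_t *dst, int dst_stride) {
  uint8_t temp[13 * 8];
  sixtap_pass(src - 2 * src_stride, src_stride, 1, temp, 8, 13, 8, hf);
  sixtap_pass(temp + 2 * 8, 8, 8, dst, dst_stride, 8, 8, vf);
}
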
+ +;Second pass: 8x8 + mov r3, #2 ;loop counter + + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vdup.8 d1, d14[4] + vdup.8 d2, d15[0] + vdup.8 d3, d15[4] + vdup.8 d4, d16[0] + vdup.8 d5, d16[4] + +filt_blk2d_sp8x8_loop_neon + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r3, r3, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vmov q9, q11 + vst1.u8 {d6}, [r4], r5 ;store result + vmov q10, q12 + vst1.u8 {d7}, [r4], r5 + vmov q11, q13 + vst1.u8 {d8}, [r4], r5 + vmov q12, q14 + vst1.u8 {d9}, [r4], r5 + vmov d26, d30 + + bne filt_blk2d_sp8x8_loop_neon + + add sp, sp, #64 + pop {r4-r5,pc} + +;--------------------- +firstpass_filter8x8_only + ;add r2, r12, r2, lsl #5 ;calculate filter location + ;vld1.s32 {q14, q15}, [r2] ;load first_pass filter + vabs.s32 q12, q14 + vabs.s32 q13, q15 + + mov r2, #2 ;loop counter + sub r0, r0, #2 ;move srcptr back to (line-2) and (column-2) + + vdup.8 d0, d24[0] ;first_pass filter (d0-d5) + vdup.8 d1, d24[4] + vdup.8 d2, d25[0] + vdup.8 d3, d25[4] + vdup.8 d4, d26[0] + vdup.8 d5, d26[4] + +;First pass: output_height lines x output_width columns (8x8) +filt_blk2d_fpo8x8_loop_neon + vld1.u8 {q3}, [r0], r1 ;load src data + vld1.u8 {q4}, [r0], r1 + vld1.u8 {q5}, [r0], r1 + vld1.u8 {q6}, [r0], r1 + + pld [r0] + pld [r0, r1] + pld [r0, r1, lsl #1] + + vmull.u8 q7, d6, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q8, d8, d0 + vmull.u8 q9, d10, d0 + vmull.u8 q10, d12, d0 + + vext.8 d28, d6, d7, #1 ;construct src_ptr[-1] + vext.8 d29, d8, d9, #1 + vext.8 d30, d10, d11, #1 + vext.8 d31, d12, d13, #1 + + vmlsl.u8 q7, d28, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q8, d29, d1 + vmlsl.u8 q9, d30, d1 + vmlsl.u8 q10, d31, d1 + + vext.8 d28, d6, d7, #4 ;construct src_ptr[2] + vext.8 d29, d8, d9, #4 + vext.8 d30, d10, d11, #4 + vext.8 d31, d12, d13, #4 + + vmlsl.u8 q7, d28, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q8, d29, d4 + vmlsl.u8 q9, d30, d4 + vmlsl.u8 q10, d31, d4 + + vext.8 d28, d6, d7, #2 ;construct src_ptr[0] + vext.8 d29, d8, d9, #2 + vext.8 d30, d10, d11, #2 + vext.8 d31, d12, d13, #2 + + vmlal.u8 q7, d28, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q8, d29, d2 + vmlal.u8 q9, d30, d2 + vmlal.u8 q10, d31, d2 + + vext.8 d28, d6, d7, #5 ;construct src_ptr[3] + vext.8 d29, d8, d9, #5 + vext.8 d30, d10, d11, #5 + vext.8 d31, d12, d13, #5 + + vmlal.u8 q7, d28, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q8, d29, d5 + vmlal.u8 q9, d30, d5 + vmlal.u8 q10, d31, d5 + + vext.8 d28, d6, d7, #3 ;construct src_ptr[1] + vext.8 d29, d8, d9, #3 + vext.8 d30, d10, d11, #3 + vext.8 d31, d12, d13, #3 + + vmull.u8 q3, d28, d3 ;(src_ptr[1] * vp9_filter[3]) + 
vmull.u8 q4, d29, d3 + vmull.u8 q5, d30, d3 + vmull.u8 q6, d31, d3 + ; + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + subs r2, r2, #1 + + vqrshrun.s16 d22, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d23, q8, #7 + vqrshrun.s16 d24, q9, #7 + vqrshrun.s16 d25, q10, #7 + + vst1.u8 {d22}, [r4], r5 ;store result + vst1.u8 {d23}, [r4], r5 + vst1.u8 {d24}, [r4], r5 + vst1.u8 {d25}, [r4], r5 + + bne filt_blk2d_fpo8x8_loop_neon + + pop {r4-r5,pc} + +;--------------------- +secondpass_filter8x8_only + sub r0, r0, r1, lsl #1 + add r3, r12, r3, lsl #5 + + vld1.u8 {d18}, [r0], r1 ;load src data + vld1.s32 {q5, q6}, [r3] ;load second_pass filter + vld1.u8 {d19}, [r0], r1 + vabs.s32 q7, q5 + vld1.u8 {d20}, [r0], r1 + vabs.s32 q8, q6 + vld1.u8 {d21}, [r0], r1 + mov r3, #2 ;loop counter + vld1.u8 {d22}, [r0], r1 + vdup.8 d0, d14[0] ;second_pass filter parameters (d0-d5) + vld1.u8 {d23}, [r0], r1 + vdup.8 d1, d14[4] + vld1.u8 {d24}, [r0], r1 + vdup.8 d2, d15[0] + vld1.u8 {d25}, [r0], r1 + vdup.8 d3, d15[4] + vld1.u8 {d26}, [r0], r1 + vdup.8 d4, d16[0] + vld1.u8 {d27}, [r0], r1 + vdup.8 d5, d16[4] + vld1.u8 {d28}, [r0], r1 + vld1.u8 {d29}, [r0], r1 + vld1.u8 {d30}, [r0], r1 + +;Second pass: 8x8 +filt_blk2d_spo8x8_loop_neon + vmull.u8 q3, d18, d0 ;(src_ptr[-2] * vp9_filter[0]) + vmull.u8 q4, d19, d0 + vmull.u8 q5, d20, d0 + vmull.u8 q6, d21, d0 + + vmlsl.u8 q3, d19, d1 ;-(src_ptr[-1] * vp9_filter[1]) + vmlsl.u8 q4, d20, d1 + vmlsl.u8 q5, d21, d1 + vmlsl.u8 q6, d22, d1 + + vmlsl.u8 q3, d22, d4 ;-(src_ptr[2] * vp9_filter[4]) + vmlsl.u8 q4, d23, d4 + vmlsl.u8 q5, d24, d4 + vmlsl.u8 q6, d25, d4 + + vmlal.u8 q3, d20, d2 ;(src_ptr[0] * vp9_filter[2]) + vmlal.u8 q4, d21, d2 + vmlal.u8 q5, d22, d2 + vmlal.u8 q6, d23, d2 + + vmlal.u8 q3, d23, d5 ;(src_ptr[3] * vp9_filter[5]) + vmlal.u8 q4, d24, d5 + vmlal.u8 q5, d25, d5 + vmlal.u8 q6, d26, d5 + + vmull.u8 q7, d21, d3 ;(src_ptr[1] * vp9_filter[3]) + vmull.u8 q8, d22, d3 + vmull.u8 q9, d23, d3 + vmull.u8 q10, d24, d3 + + subs r3, r3, #1 + + vqadd.s16 q7, q3 ;sum of all (src_data*filter_parameters) + vqadd.s16 q8, q4 + vqadd.s16 q9, q5 + vqadd.s16 q10, q6 + + vqrshrun.s16 d6, q7, #7 ;shift/round/saturate to u8 + vqrshrun.s16 d7, q8, #7 + vqrshrun.s16 d8, q9, #7 + vqrshrun.s16 d9, q10, #7 + + vmov q9, q11 + vst1.u8 {d6}, [r4], r5 ;store result + vmov q10, q12 + vst1.u8 {d7}, [r4], r5 + vmov q11, q13 + vst1.u8 {d8}, [r4], r5 + vmov q12, q14 + vst1.u8 {d9}, [r4], r5 + vmov d26, d30 + + bne filt_blk2d_spo8x8_loop_neon + + pop {r4-r5,pc} + + ENDP + +;----------------- + + END diff --git a/vp9/common/arm/recon_arm.h b/vp9/common/arm/recon_arm.h deleted file mode 100644 index 1e402951c..000000000 --- a/vp9/common/arm/recon_arm.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef RECON_ARM_H -#define RECON_ARM_H - -#if HAVE_ARMV6 -extern prototype_recon_block(vp9_recon_b_armv6); -extern prototype_recon_block(vp9_recon2b_armv6); -extern prototype_recon_block(vp9_recon4b_armv6); - -extern prototype_copy_block(vp9_copy_mem8x8_v6); -extern prototype_copy_block(vp9_copy_mem8x4_v6); -extern prototype_copy_block(vp9_copy_mem16x16_v6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_recon_recon -#define vp8_recon_recon vp9_recon_b_armv6 - -#undef vp8_recon_recon2 -#define vp8_recon_recon2 vp9_recon2b_armv6 - -#undef vp8_recon_recon4 -#define vp8_recon_recon4 vp9_recon4b_armv6 - -#undef vp8_recon_copy8x8 -#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6 - -#undef vp8_recon_copy8x4 -#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6 - -#undef vp8_recon_copy16x16 -#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6 -#endif -#endif - -#if HAVE_ARMV7 -extern prototype_recon_block(vp9_recon_b_neon); -extern prototype_recon_block(vp9_recon2b_neon); -extern prototype_recon_block(vp9_recon4b_neon); - -extern prototype_copy_block(vp9_copy_mem8x8_neon); -extern prototype_copy_block(vp9_copy_mem8x4_neon); -extern prototype_copy_block(vp9_copy_mem16x16_neon); - -extern prototype_recon_macroblock(vp9_recon_mb_neon); - -extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon); -extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp8_recon_recon -#define vp8_recon_recon vp9_recon_b_neon - -#undef vp8_recon_recon2 -#define vp8_recon_recon2 vp9_recon2b_neon - -#undef vp8_recon_recon4 -#define vp8_recon_recon4 vp9_recon4b_neon - -#undef vp8_recon_copy8x8 -#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon - -#undef vp8_recon_copy8x4 -#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon - -#undef vp8_recon_copy16x16 -#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon - -#undef vp8_recon_recon_mb -#define vp8_recon_recon_mb vp9_recon_mb_neon - -#undef vp9_recon_build_intra_predictors_mby -#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon - -#undef vp9_recon_build_intra_predictors_mby_s -#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon - -#endif -#endif - -#endif diff --git a/vp9/common/arm/reconintra_arm.c b/vp9/common/arm/reconintra_arm.c deleted file mode 100644 index 590254680..000000000 --- a/vp9/common/arm/reconintra_arm.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vpx_ports/config.h" -#include "vp9/common/blockd.h" -#include "vp9/common/reconintra.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/recon.h" - -#if HAVE_ARMV7 -extern void vp9_build_intra_predictors_mby_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) { - unsigned char *y_buffer = xd->dst.y_buffer; - unsigned char *ypred_ptr = xd->predictor; - int y_stride = xd->dst.y_stride; - int mode = xd->mode_info_context->mbmi.mode; - int Up = xd->up_available; - int Left = xd->left_available; - - vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, - y_stride, mode, Up, Left); -} -#endif - - -#if HAVE_ARMV7 -extern void vp9_build_intra_predictors_mby_s_neon_func( - unsigned char *y_buffer, - unsigned char *ypred_ptr, - int y_stride, - int mode, - int Up, - int Left); - -void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) { - unsigned char *y_buffer = xd->dst.y_buffer; - unsigned char *ypred_ptr = xd->predictor; - int y_stride = xd->dst.y_stride; - int mode = xd->mode_info_context->mbmi.mode; - int Up = xd->up_available; - int Left = xd->left_available; - - vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, - y_stride, mode, Up, Left); -} - -#endif diff --git a/vp9/common/arm/subpixel_arm.h b/vp9/common/arm/subpixel_arm.h deleted file mode 100644 index b4f9f54f3..000000000 --- a/vp9/common/arm/subpixel_arm.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#ifndef SUBPIXEL_ARM_H -#define SUBPIXEL_ARM_H - -#if HAVE_ARMV6 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6); -extern prototype_subpixel_predict(vp9_sixtap_predict_armv6); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6); -extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6); -extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6 - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6 - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6 - -#undef vp9_subpix_bilinear8x4 -#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6 - -#undef vp9_subpix_bilinear4x4 -#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6 -#endif -#endif - -#if HAVE_ARMV7 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon); -extern prototype_subpixel_predict(vp9_sixtap_predict_neon); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon); -extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon); -extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon - -#undef vp9_subpix_bilinear8x4 -#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon - -#undef vp9_subpix_bilinear4x4 -#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon -#endif -#endif - -#endif diff --git a/vp9/common/arm/vp9_arm_systemdependent.c b/vp9/common/arm/vp9_arm_systemdependent.c new file mode 100644 index 000000000..a6319a4c5 --- /dev/null +++ b/vp9/common/arm/vp9_arm_systemdependent.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/arm.h"
+#include "vp9/common/vp9_pragmas.h"
+#include "vp9/common/vp9_subpixel.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/recon.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+  int flags = arm_cpu_caps();
+  rtcd->flags = flags;
+
+  /* Override default functions with fastest ones for this CPU. */
+#if HAVE_ARMV5TE
+  if (flags & HAS_EDSP) {
+  }
+#endif
+
+// The commented functions need to be re-written for vpx.
+#if HAVE_ARMV6
+  if (flags & HAS_MEDIA) {
+    rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_armv6;
+    rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_armv6;
+    rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_armv6;
+    rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_armv6;
+
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_armv6;
+    rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_armv6;
+    rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_armv6;
+    rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_armv6;
+
+    // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_v6;
+    // rtcd->idct.idct16 = vp9_short_idct4x4llm_v6_dual;
+    // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_v6;
+    // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_v6;
+
+    rtcd->recon.copy16x16 = vp9_copy_mem16x16_v6;
+    rtcd->recon.copy8x8 = vp9_copy_mem8x8_v6;
+    rtcd->recon.copy8x4 = vp9_copy_mem8x4_v6;
+    rtcd->recon.recon = vp9_recon_b_armv6;
+    rtcd->recon.recon2 = vp9_recon2b_armv6;
+    rtcd->recon.recon4 = vp9_recon4b_armv6;
+  }
+#endif
+
+#if HAVE_ARMV7
+  if (flags & HAS_NEON) {
+    rtcd->subpix.sixtap16x16 = vp9_sixtap_predict16x16_neon;
+    rtcd->subpix.sixtap8x8 = vp9_sixtap_predict8x8_neon;
+    rtcd->subpix.sixtap8x4 = vp9_sixtap_predict8x4_neon;
+    rtcd->subpix.sixtap4x4 = vp9_sixtap_predict_neon;
+
+    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_neon;
+    rtcd->subpix.bilinear8x8 = vp9_bilinear_predict8x8_neon;
+    rtcd->subpix.bilinear8x4 = vp9_bilinear_predict8x4_neon;
+    rtcd->subpix.bilinear4x4 = vp9_bilinear_predict4x4_neon;
+
+    // rtcd->idct.idct1 = vp9_short_idct4x4llm_1_neon;
+    // rtcd->idct.idct16 = vp9_short_idct4x4llm_neon;
+    // rtcd->idct.iwalsh1 = vp9_short_inv_walsh4x4_1_neon;
+    // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_neon;
+
+    rtcd->recon.copy16x16 = vp9_copy_mem16x16_neon;
+    rtcd->recon.copy8x8 = vp9_copy_mem8x8_neon;
+    rtcd->recon.copy8x4 = vp9_copy_mem8x4_neon;
+    rtcd->recon.recon = vp9_recon_b_neon;
+    rtcd->recon.recon2 = vp9_recon2b_neon;
+    rtcd->recon.recon4 = vp9_recon4b_neon;
+    rtcd->recon.recon_mb = vp9_recon_mb_neon;
+    rtcd->recon.build_intra_predictors_mby =
+      vp9_build_intra_predictors_mby_neon;
+    rtcd->recon.build_intra_predictors_mby_s =
+      vp9_build_intra_predictors_mby_s_neon;
+  }
+#endif
+
+#endif
+}
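
vp9_arch_arm_common_init above is the runtime half of the dispatch scheme: arm_cpu_caps() is sampled once, and each capability hit (HAS_MEDIA, HAS_NEON) overwrites entries in the RTCD function-pointer table, while builds without CONFIG_RUNTIME_CPU_DETECT bind the same vp9_*_armv6/_neon names at compile time through the #define blocks in the accompanying headers. Stripped to its skeleton, the pattern looks like this (a minimal sketch; the flag bits, table layout and copy16x16_c fallback are illustrative, not libvpx definitions):

#include <stdint.h>

typedef void (*copy_fn)(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride);

/* Illustrative stand-ins for the CPU-capability bits and one RTCD slot. */
enum { HAS_MEDIA_BIT = 1 << 0, HAS_NEON_BIT = 1 << 1 };
struct rtcd_table { int flags; copy_fn copy16x16; };

static void copy16x16_c(const uint8_t *src, int src_stride,
                        uint8_t *dst, int dst_stride) {
  for (int r = 0; r < 16; ++r)
    for (int c = 0; c < 16; ++c)
      dst[r * dst_stride + c] = src[r * src_stride + c];
}

static void arch_init_sketch(struct rtcd_table *rtcd, int flags) {
  rtcd->flags = flags;
  rtcd->copy16x16 = copy16x16_c;  /* portable default */
  if (flags & HAS_MEDIA_BIT) {
    /* rtcd->copy16x16 = copy16x16_armv6;  media-extension override */
  }
  if (flags & HAS_NEON_BIT) {
    /* rtcd->copy16x16 = copy16x16_neon;  NEON wins if both bits are set */
  }
}

Ordering is the point: a later, more capable override clobbers an earlier one, which is why the NEON branch follows the ARMv6 branch in the real initializer.
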
diff --git a/vp9/common/arm/vp9_bilinearfilter_arm.c b/vp9/common/arm/vp9_bilinearfilter_arm.c
new file mode 100644
index 000000000..409e271b1
--- /dev/null
+++ b/vp9/common/arm/vp9_bilinearfilter_arm.c
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_subpixel.h"
+#include "vp9_bilinearfilter_arm.h"
+
+void vp9_filter_block2d_bil_armv6
+(
+  unsigned char *src_ptr,
+  unsigned char *dst_ptr,
+  unsigned int src_pitch,
+  unsigned int dst_pitch,
+  const short *HFilter,
+  const short *VFilter,
+  int Width,
+  int Height
+) {
+  unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */
+
+  /* First filter 1-D horizontally... */
+  vp9_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
+
+  /* then 1-D vertically... */
+  vp9_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
+}
+
+
+void vp9_bilinear_predict4x4_armv6
+(
+  unsigned char *src_ptr,
+  int src_pixels_per_line,
+  int xoffset,
+  int yoffset,
+  unsigned char *dst_ptr,
+  int dst_pitch
+) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
+}
+
+void vp9_bilinear_predict8x8_armv6
+(
+  unsigned char *src_ptr,
+  int src_pixels_per_line,
+  int xoffset,
+  int yoffset,
+  unsigned char *dst_ptr,
+  int dst_pitch
+) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
+}
+
+void vp9_bilinear_predict8x4_armv6
+(
+  unsigned char *src_ptr,
+  int src_pixels_per_line,
+  int xoffset,
+  int yoffset,
+  unsigned char *dst_ptr,
+  int dst_pitch
+) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
+}
+
+void vp9_bilinear_predict16x16_armv6
+(
+  unsigned char *src_ptr,
+  int src_pixels_per_line,
+  int xoffset,
+  int yoffset,
+  unsigned char *dst_ptr,
+  int dst_pitch
+) {
+  const short *HFilter;
+  const short *VFilter;
+
+  HFilter = vp8_bilinear_filters[xoffset];
+  VFilter = vp8_bilinear_filters[yoffset];
+
+  vp9_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
+}
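
The wrapper above is the bilinear counterpart of the two-pass scheme used by the six-tap predictors: the horizontal pass writes Height + 1 rows of 16-bit intermediates into FData (the extra row gives the vertical 2-tap a row below its last output), and the vertical pass consumes them. A scalar sketch of what the two armv6 passes compute, assuming each pass rounds with (x + 64) >> 7 as the generic C filter does; vp8_bilinear_filters supplies 2-tap Q7 pairs summing to 128, and the function names here are illustrative:

#include <stdint.h>

/* Horizontal 2-tap pass: `height` rows (Height + 1), 16-bit output. */
static void bil_first_pass_sketch(const uint8_t *src, uint16_t *dst,
                                  unsigned src_pitch, unsigned height,
                                  unsigned width, const int16_t f[2]) {
  for (unsigned r = 0; r < height; ++r)
    for (unsigned c = 0; c < width; ++c)
      dst[r * width + c] = (uint16_t)
          ((src[r * src_pitch + c] * f[0] +
            src[r * src_pitch + c + 1] * f[1] + 64) >> 7);
}

/* Vertical 2-tap pass over the 16-bit intermediate rows. */
static void bil_second_pass_sketch(const uint16_t *src, uint8_t *dst,
                                   int dst_pitch, unsigned height,
                                   unsigned width, const int16_t f[2]) {
  for (unsigned r = 0; r < height; ++r)
    for (unsigned c = 0; c < width; ++c)
      dst[r * dst_pitch + c] = (uint8_t)
          ((src[r * width + c] * f[0] +
            src[(r + 1) * width + c] * f[1] + 64) >> 7);
}

/* 8x8 equivalent of the vp9_filter_block2d_bil_armv6 call sequence. */
static void bil_predict8x8_sketch(const uint8_t *src, unsigned src_pitch,
                                  const int16_t hf[2], const int16_t vf[2],
                                  uint8_t *dst, int dst_pitch) {
  uint16_t temp[(8 + 1) * 8];
  bil_first_pass_sketch(src, temp, src_pitch, 8 + 1, 8, hf);
  bil_second_pass_sketch(temp, dst, dst_pitch, 8, 8, vf);
}
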
diff --git a/vp9/common/arm/vp9_bilinearfilter_arm.h b/vp9/common/arm/vp9_bilinearfilter_arm.h
new file mode 100644
index 000000000..b6d9cfc2d
--- /dev/null
+++ b/vp9/common/arm/vp9_bilinearfilter_arm.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BILINEARFILTER_ARM_H
+#define BILINEARFILTER_ARM_H
+
+extern void vp9_filter_block2d_bil_first_pass_armv6
+(
+  const unsigned char *src_ptr,
+  unsigned short *dst_ptr,
+  unsigned int src_pitch,
+  unsigned int height,
+  unsigned int width,
+  const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_bil_second_pass_armv6
+(
+  const unsigned short *src_ptr,
+  unsigned char *dst_ptr,
+  int dst_pitch,
+  unsigned int height,
+  unsigned int width,
+  const short *vp9_filter
+);
+
+#endif /* BILINEARFILTER_ARM_H */
diff --git a/vp9/common/arm/vp9_filter_arm.c b/vp9/common/arm/vp9_filter_arm.c
new file mode 100644
index 000000000..f55273c33
--- /dev/null
+++ b/vp9/common/arm/vp9_filter_arm.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <math.h>
+#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_subpixel.h"
+#include "vpx_ports/mem.h"
+
+extern void vp9_filter_block2d_first_pass_armv6
+(
+  unsigned char *src_ptr,
+  short *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int output_width,
+  unsigned int output_height,
+  const short *vp9_filter
+);
+
+// 8x8
+extern void vp9_filter_block2d_first_pass_8x8_armv6
+(
+  unsigned char *src_ptr,
+  short *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int output_width,
+  unsigned int output_height,
+  const short *vp9_filter
+);
+
+// 16x16
+extern void vp9_filter_block2d_first_pass_16x16_armv6
+(
+  unsigned char *src_ptr,
+  short *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int output_width,
+  unsigned int output_height,
+  const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_second_pass_armv6
+(
+  short *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int output_pitch,
+  unsigned int cnt,
+  const short *vp9_filter
+);
+
+extern void vp9_filter4_block2d_second_pass_armv6
+(
+  short *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int output_pitch,
+  unsigned int cnt,
+  const short *vp9_filter
+);
+
+extern void vp9_filter_block2d_first_pass_only_armv6
+(
+  unsigned char *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int cnt,
+  unsigned int output_pitch,
+  const short *vp9_filter
+);
+
+
+extern void vp9_filter_block2d_second_pass_only_armv6
+(
+  unsigned char *src_ptr,
+  unsigned char *output_ptr,
+  unsigned int src_pixels_per_line,
+  unsigned int cnt,
+  unsigned int output_pitch,
+  const short *vp9_filter
+);
+
+#if HAVE_ARMV6
+void vp9_sixtap_predict_armv6
+(
+  unsigned char *src_ptr,
+  int src_pixels_per_line,
+  int xoffset,
+  int yoffset,
+  unsigned char *dst_ptr,
+  int dst_pitch
+) {
+  const short *HFilter;
+  const short *VFilter;
+  DECLARE_ALIGNED_ARRAY(4, short, FData, 12 * 4); /* Temp data buffer used in filtering */
+
+
+  HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
+  VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
+
+  /* Vfilter is null.
First pass only */ + if (xoffset && !yoffset) { + /*vp9_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, src_pixels_per_line, 4, 4, HFilter ); + vp9_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, VFilter );*/ + + vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter); + } + /* Hfilter is null. Second pass only */ + else if (!xoffset && yoffset) { + vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter); + } else { + /* Vfilter is a 4 tap filter */ + if (yoffset & 0x1) { + vp9_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter); + vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); + } + /* Vfilter is 6 tap filter */ + else { + vp9_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter); + vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter); + } + } +} + +void vp9_sixtap_predict8x8_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) { + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED_ARRAY(4, short, FData, 16 * 8); /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + if (xoffset && !yoffset) { + vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter); + } + /* Hfilter is null. Second pass only */ + else if (!xoffset && yoffset) { + vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter); + } else { + if (yoffset & 0x1) { + vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter); + vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); + } else { + vp9_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter); + vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter); + } + } +} + + +void vp9_sixtap_predict16x16_armv6 +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) { + const short *HFilter; + const short *VFilter; + DECLARE_ALIGNED_ARRAY(4, short, FData, 24 * 16); /* Temp data buffer used in filtering */ + + HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ + VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ + + if (xoffset && !yoffset) { + vp9_filter_block2d_first_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter); + } + /* Hfilter is null. 
Second pass only */ + else if (!xoffset && yoffset) { + vp9_filter_block2d_second_pass_only_armv6(src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter); + } else { + if (yoffset & 0x1) { + vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter); + vp9_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); + } else { + vp9_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter); + vp9_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter); + } + } + +} +#endif diff --git a/vp9/common/arm/vp9_idct_arm.h b/vp9/common/arm/vp9_idct_arm.h new file mode 100644 index 000000000..2fc4cf7fc --- /dev/null +++ b/vp9/common/arm/vp9_idct_arm.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef IDCT_ARM_H +#define IDCT_ARM_H + +#if HAVE_ARMV6 +extern prototype_idct(vp9_short_idct4x4llm_1_v6); +extern prototype_idct(vp9_short_idct4x4llm_v6_dual); +extern prototype_idct_scalar_add(vp9_dc_only_idct_add_v6); +extern prototype_second_order(vp9_short_inv_walsh4x4_1_v6); +extern prototype_second_order(vp9_short_inv_walsh4x4_v6); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_idct_idct1 +#define vp9_idct_idct1 vp9_short_idct4x4llm_1_v6 + +#undef vp9_idct_idct16 +#define vp9_idct_idct16 vp9_short_idct4x4llm_v6_dual + +#undef vp9_idct_idct1_scalar_add +#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_v6 + +#undef vp8_idct_iwalsh1 +#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_v6 + +#undef vp8_idct_iwalsh16 +#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_v6 +#endif +#endif + +#if HAVE_ARMV7 +extern prototype_idct(vp9_short_idct4x4llm_1_neon); +extern prototype_idct(vp9_short_idct4x4llm_neon); +extern prototype_idct_scalar_add(vp9_dc_only_idct_add_neon); +extern prototype_second_order(vp9_short_inv_walsh4x4_1_neon); +extern prototype_second_order(vp9_short_inv_walsh4x4_neon); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_idct_idct1 +#define vp9_idct_idct1 vp9_short_idct4x4llm_1_neon + +#undef vp9_idct_idct16 +#define vp9_idct_idct16 vp9_short_idct4x4llm_neon + +#undef vp9_idct_idct1_scalar_add +#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_neon + +#undef vp8_idct_iwalsh1 +#define vp8_idct_iwalsh1 vp9_short_inv_walsh4x4_1_neon + +#undef vp8_idct_iwalsh16 +#define vp8_idct_iwalsh16 vp9_short_inv_walsh4x4_neon +#endif +#endif + +#endif diff --git a/vp9/common/arm/vp9_loopfilter_arm.c b/vp9/common/arm/vp9_loopfilter_arm.c new file mode 100644 index 000000000..b61f1a86b --- /dev/null +++ b/vp9/common/arm/vp9_loopfilter_arm.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vpx_config.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" + +#if HAVE_ARMV6 +extern prototype_loopfilter(vp9_loop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp9_loop_filter_vertical_edge_armv6); +extern prototype_loopfilter(vp9_mbloop_filter_horizontal_edge_armv6); +extern prototype_loopfilter(vp9_mbloop_filter_vertical_edge_armv6); +#endif + +#if HAVE_ARMV7 +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, unsigned char thresh, + unsigned char *v); + +extern loopfilter_y_neon vp9_loop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp9_loop_filter_vertical_edge_y_neon; +extern loopfilter_y_neon vp9_mbloop_filter_horizontal_edge_y_neon; +extern loopfilter_y_neon vp9_mbloop_filter_vertical_edge_y_neon; + +extern loopfilter_uv_neon vp9_loop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp9_loop_filter_vertical_edge_uv_neon; +extern loopfilter_uv_neon vp9_mbloop_filter_horizontal_edge_uv_neon; +extern loopfilter_uv_neon vp9_mbloop_filter_vertical_edge_uv_neon; +#endif + +#if HAVE_ARMV6 +/*ARMV6 loopfilter functions*/ +/* Horizontal MB filtering */ +void vp9_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + vp9_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp9_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + vp9_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp9_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + vp9_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp9_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit); + vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit); + vp9_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit); +} + +/* Vertical B Filtering */ +void 
vp9_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + vp9_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp9_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); + vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); + vp9_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); +} +#endif + +#if HAVE_ARMV7 +/* NEON loopfilter functions */ +/* Horizontal MB filtering */ +void vp9_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + vp9_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp9_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); +} + +/* Vertical MB Filtering */ +void vp9_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + unsigned char mblim = *lfi->mblim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp9_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr); + + if (u_ptr) + vp9_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr); +} + +/* Horizontal B Filtering */ +void vp9_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr); + vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr); + vp9_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr); + + if (u_ptr) + vp9_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride); +} + +/* Vertical B Filtering */ +void vp9_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + unsigned char blim = *lfi->blim; + unsigned char lim = *lfi->lim; + unsigned char hev_thr = *lfi->hev_thr; + + vp9_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr); + vp9_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr); + vp9_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr); + + if (u_ptr) + vp9_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); +} +#endif diff --git a/vp9/common/arm/vp9_loopfilter_arm.h b/vp9/common/arm/vp9_loopfilter_arm.h new file mode 100644 index 000000000..de6b7ffbc --- 
/dev/null +++ b/vp9/common/arm/vp9_loopfilter_arm.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef LOOPFILTER_ARM_H +#define LOOPFILTER_ARM_H + +#include "vpx_config.h" + +#if HAVE_ARMV6 +extern prototype_loopfilter_block(vp9_loop_filter_mbv_armv6); +extern prototype_loopfilter_block(vp9_loop_filter_bv_armv6); +extern prototype_loopfilter_block(vp9_loop_filter_mbh_armv6); +extern prototype_loopfilter_block(vp9_loop_filter_bh_armv6); +extern prototype_simple_loopfilter(vp9_loop_filter_bvs_armv6); +extern prototype_simple_loopfilter(vp9_loop_filter_bhs_armv6); +extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_armv6); +extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_armv6); + +#endif /* HAVE_ARMV6 */ + +#if HAVE_ARMV7 +extern prototype_loopfilter_block(vp9_loop_filter_mbv_neon); +extern prototype_loopfilter_block(vp9_loop_filter_bv_neon); +extern prototype_loopfilter_block(vp9_loop_filter_mbh_neon); +extern prototype_loopfilter_block(vp9_loop_filter_bh_neon); +extern prototype_simple_loopfilter(vp9_loop_filter_mbvs_neon); +extern prototype_simple_loopfilter(vp9_loop_filter_bvs_neon); +extern prototype_simple_loopfilter(vp9_loop_filter_mbhs_neon); +extern prototype_simple_loopfilter(vp9_loop_filter_bhs_neon); + +#endif /* HAVE_ARMV7 */ + +#endif /* LOOPFILTER_ARM_H */ diff --git a/vp9/common/arm/vp9_recon_arm.h b/vp9/common/arm/vp9_recon_arm.h new file mode 100644 index 000000000..1e402951c --- /dev/null +++ b/vp9/common/arm/vp9_recon_arm.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef RECON_ARM_H +#define RECON_ARM_H + +#if HAVE_ARMV6 +extern prototype_recon_block(vp9_recon_b_armv6); +extern prototype_recon_block(vp9_recon2b_armv6); +extern prototype_recon_block(vp9_recon4b_armv6); + +extern prototype_copy_block(vp9_copy_mem8x8_v6); +extern prototype_copy_block(vp9_copy_mem8x4_v6); +extern prototype_copy_block(vp9_copy_mem16x16_v6); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_recon_recon +#define vp8_recon_recon vp9_recon_b_armv6 + +#undef vp8_recon_recon2 +#define vp8_recon_recon2 vp9_recon2b_armv6 + +#undef vp8_recon_recon4 +#define vp8_recon_recon4 vp9_recon4b_armv6 + +#undef vp8_recon_copy8x8 +#define vp8_recon_copy8x8 vp9_copy_mem8x8_v6 + +#undef vp8_recon_copy8x4 +#define vp8_recon_copy8x4 vp9_copy_mem8x4_v6 + +#undef vp8_recon_copy16x16 +#define vp8_recon_copy16x16 vp9_copy_mem16x16_v6 +#endif +#endif + +#if HAVE_ARMV7 +extern prototype_recon_block(vp9_recon_b_neon); +extern prototype_recon_block(vp9_recon2b_neon); +extern prototype_recon_block(vp9_recon4b_neon); + +extern prototype_copy_block(vp9_copy_mem8x8_neon); +extern prototype_copy_block(vp9_copy_mem8x4_neon); +extern prototype_copy_block(vp9_copy_mem16x16_neon); + +extern prototype_recon_macroblock(vp9_recon_mb_neon); + +extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_neon); +extern prototype_build_intra_predictors(vp9_build_intra_predictors_mby_s_neon); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_recon_recon +#define vp8_recon_recon vp9_recon_b_neon + +#undef vp8_recon_recon2 +#define vp8_recon_recon2 vp9_recon2b_neon + +#undef vp8_recon_recon4 +#define vp8_recon_recon4 vp9_recon4b_neon + +#undef vp8_recon_copy8x8 +#define vp8_recon_copy8x8 vp9_copy_mem8x8_neon + +#undef vp8_recon_copy8x4 +#define vp8_recon_copy8x4 vp9_copy_mem8x4_neon + +#undef vp8_recon_copy16x16 +#define vp8_recon_copy16x16 vp9_copy_mem16x16_neon + +#undef vp8_recon_recon_mb +#define vp8_recon_recon_mb vp9_recon_mb_neon + +#undef vp9_recon_build_intra_predictors_mby +#define vp9_recon_build_intra_predictors_mby vp9_build_intra_predictors_mby_neon + +#undef vp9_recon_build_intra_predictors_mby_s +#define vp9_recon_build_intra_predictors_mby_s vp9_build_intra_predictors_mby_s_neon + +#endif +#endif + +#endif diff --git a/vp9/common/arm/vp9_reconintra_arm.c b/vp9/common/arm/vp9_reconintra_arm.c new file mode 100644 index 000000000..5720828c7 --- /dev/null +++ b/vp9/common/arm/vp9_reconintra_arm.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_reconintra.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/recon.h" + +#if HAVE_ARMV7 +extern void vp9_build_intra_predictors_mby_neon_func( + unsigned char *y_buffer, + unsigned char *ypred_ptr, + int y_stride, + int mode, + int Up, + int Left); + +void vp9_build_intra_predictors_mby_neon(MACROBLOCKD *xd) { + unsigned char *y_buffer = xd->dst.y_buffer; + unsigned char *ypred_ptr = xd->predictor; + int y_stride = xd->dst.y_stride; + int mode = xd->mode_info_context->mbmi.mode; + int Up = xd->up_available; + int Left = xd->left_available; + + vp9_build_intra_predictors_mby_neon_func(y_buffer, ypred_ptr, + y_stride, mode, Up, Left); +} +#endif + + +#if HAVE_ARMV7 +extern void vp9_build_intra_predictors_mby_s_neon_func( + unsigned char *y_buffer, + unsigned char *ypred_ptr, + int y_stride, + int mode, + int Up, + int Left); + +void vp9_build_intra_predictors_mby_s_neon(MACROBLOCKD *xd) { + unsigned char *y_buffer = xd->dst.y_buffer; + unsigned char *ypred_ptr = xd->predictor; + int y_stride = xd->dst.y_stride; + int mode = xd->mode_info_context->mbmi.mode; + int Up = xd->up_available; + int Left = xd->left_available; + + vp9_build_intra_predictors_mby_s_neon_func(y_buffer, ypred_ptr, + y_stride, mode, Up, Left); +} + +#endif diff --git a/vp9/common/arm/vp9_subpixel_arm.h b/vp9/common/arm/vp9_subpixel_arm.h new file mode 100644 index 000000000..b4f9f54f3 --- /dev/null +++ b/vp9/common/arm/vp9_subpixel_arm.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef SUBPIXEL_ARM_H +#define SUBPIXEL_ARM_H + +#if HAVE_ARMV6 +extern prototype_subpixel_predict(vp9_sixtap_predict16x16_armv6); +extern prototype_subpixel_predict(vp9_sixtap_predict8x8_armv6); +extern prototype_subpixel_predict(vp9_sixtap_predict8x4_armv6); +extern prototype_subpixel_predict(vp9_sixtap_predict_armv6); +extern prototype_subpixel_predict(vp9_bilinear_predict16x16_armv6); +extern prototype_subpixel_predict(vp9_bilinear_predict8x8_armv6); +extern prototype_subpixel_predict(vp9_bilinear_predict8x4_armv6); +extern prototype_subpixel_predict(vp9_bilinear_predict4x4_armv6); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_subpix_sixtap16x16 +#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_armv6 + +#undef vp9_subpix_sixtap8x8 +#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_armv6 + +#undef vp9_subpix_sixtap8x4 +#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_armv6 + +#undef vp9_subpix_sixtap4x4 +#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_armv6 + +#undef vp9_subpix_bilinear16x16 +#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_armv6 + +#undef vp9_subpix_bilinear8x8 +#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_armv6 + +#undef vp9_subpix_bilinear8x4 +#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_armv6 + +#undef vp9_subpix_bilinear4x4 +#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_armv6 +#endif +#endif + +#if HAVE_ARMV7 +extern prototype_subpixel_predict(vp9_sixtap_predict16x16_neon); +extern prototype_subpixel_predict(vp9_sixtap_predict8x8_neon); +extern prototype_subpixel_predict(vp9_sixtap_predict8x4_neon); +extern prototype_subpixel_predict(vp9_sixtap_predict_neon); +extern prototype_subpixel_predict(vp9_bilinear_predict16x16_neon); +extern prototype_subpixel_predict(vp9_bilinear_predict8x8_neon); +extern prototype_subpixel_predict(vp9_bilinear_predict8x4_neon); +extern prototype_subpixel_predict(vp9_bilinear_predict4x4_neon); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_subpix_sixtap16x16 +#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_neon + +#undef vp9_subpix_sixtap8x8 +#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_neon + +#undef vp9_subpix_sixtap8x4 +#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_neon + +#undef vp9_subpix_sixtap4x4 +#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_neon + +#undef vp9_subpix_bilinear16x16 +#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_neon + +#undef vp9_subpix_bilinear8x8 +#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_neon + +#undef vp9_subpix_bilinear8x4 +#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_neon + +#undef vp9_subpix_bilinear4x4 +#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_neon +#endif +#endif + +#endif diff --git a/vp9/common/asm_com_offsets.c b/vp9/common/asm_com_offsets.c deleted file mode 100644 index 07d3e333a..000000000 --- a/vp9/common/asm_com_offsets.c +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vpx_config.h" -#include "vpx/vpx_codec.h" -#include "vpx_ports/asm_offsets.h" -#include "vpx_scale/yv12config.h" - -BEGIN - -/* vpx_scale */ -DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width)); -DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height)); -DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride)); -DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width)); -DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height)); -DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride)); -DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer)); -DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer)); -DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer)); -DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border)); -DEFINE(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS); - -END - -/* add asserts for any offset that is not supported by assembly code */ -/* add asserts for any size that is not supported by assembly code */ - -#if HAVE_ARMV7 -/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */ -ct_assert(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS == 32) -#endif diff --git a/vp9/common/blockd.c b/vp9/common/blockd.c deleted file mode 100644 index 140500ec4..000000000 --- a/vp9/common/blockd.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "blockd.h" -#include "vpx_mem/vpx_mem.h" - - -const unsigned char vp9_block2left[25] = { - 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; -const unsigned char vp9_block2above[25] = { - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8 -}; - -const unsigned char vp9_block2left_8x8[25] = { - 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8 -}; -const unsigned char vp9_block2above_8x8[25] = { - 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8 -}; - diff --git a/vp9/common/blockd.h b/vp9/common/blockd.h deleted file mode 100644 index 409c7b8f8..000000000 --- a/vp9/common/blockd.h +++ /dev/null @@ -1,568 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
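The deleted asm_com_offsets.c above exists because assembly sources cannot evaluate offsetof(): the file is compiled once and its DEFINE() markers are scraped by the build into an include that names each structure offset. Conceptually (mechanism simplified, EQU value illustrative):

/* Scraped output usable from .asm files:
 *   yv12_buffer_config_y_stride EQU <offsetof(YV12_BUFFER_CONFIG, y_stride)>
 * letting ARM code load the stride without hard-coded numbers, e.g.
 *   ldr r2, [r0, #yv12_buffer_config_y_stride]  ; r0 = YV12_BUFFER_CONFIG *
 * The ct_assert() above pins VP9BORDERINPIXELS to 32 for the same reason:
 * the NEON border-extension code assumes that exact layout. */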
- */ - - -#ifndef __INC_BLOCKD_H -#define __INC_BLOCKD_H - -void vpx_log(const char *format, ...); - -#include "vpx_ports/config.h" -#include "vpx_scale/yv12config.h" -#include "mv.h" -#include "treecoder.h" -#include "subpixel.h" -#include "vpx_ports/mem.h" -#include "common.h" - -#define TRUE 1 -#define FALSE 0 - -// #define MODE_STATS - -/*#define DCPRED 1*/ -#define DCPREDSIMTHRESH 0 -#define DCPREDCNTTHRESH 3 - -#define MB_FEATURE_TREE_PROBS 3 -#define PREDICTION_PROBS 3 - -#define MBSKIP_CONTEXTS 3 - -#define MAX_MB_SEGMENTS 4 - -#define MAX_REF_LF_DELTAS 4 -#define MAX_MODE_LF_DELTAS 4 - -/* Segment Feature Masks */ -#define SEGMENT_DELTADATA 0 -#define SEGMENT_ABSDATA 1 -#define MAX_MV_REFS 9 - -typedef struct { - int r, c; -} POS; - -typedef enum PlaneType { - PLANE_TYPE_Y_NO_DC = 0, - PLANE_TYPE_Y2, - PLANE_TYPE_UV, - PLANE_TYPE_Y_WITH_DC, -} PLANE_TYPE; - -typedef char ENTROPY_CONTEXT; -typedef struct { - ENTROPY_CONTEXT y1[4]; - ENTROPY_CONTEXT u[2]; - ENTROPY_CONTEXT v[2]; - ENTROPY_CONTEXT y2; -} ENTROPY_CONTEXT_PLANES; - -extern const unsigned char vp9_block2left[25]; -extern const unsigned char vp9_block2above[25]; -extern const unsigned char vp9_block2left_8x8[25]; -extern const unsigned char vp9_block2above_8x8[25]; - -#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \ - Dest = ((A)!=0) + ((B)!=0); - -typedef enum { - KEY_FRAME = 0, - INTER_FRAME = 1 -} FRAME_TYPE; - -typedef enum -{ - SIXTAP = 0, - BILINEAR = 1, - EIGHTTAP = 2, - EIGHTTAP_SHARP = 3, - SWITCHABLE /* should be the last one */ -} INTERPOLATIONFILTERTYPE; - -typedef enum -{ - DC_PRED, /* average of above and left pixels */ - V_PRED, /* vertical prediction */ - H_PRED, /* horizontal prediction */ - D45_PRED, /* Directional 45 deg prediction [anti-clockwise from 0 deg hor] */ - D135_PRED, /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */ - D117_PRED, /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */ - D153_PRED, /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */ - D27_PRED, /* Directional 22 deg prediction [anti-clockwise from 0 deg hor] */ - D63_PRED, /* Directional 67 deg prediction [anti-clockwise from 0 deg hor] */ - TM_PRED, /* Truemotion prediction */ - I8X8_PRED, /* 8x8 based prediction, each 8x8 has its own prediction mode */ - B_PRED, /* block based prediction, each block has its own prediction mode */ - - NEARESTMV, - NEARMV, - ZEROMV, - NEWMV, - SPLITMV, - - MB_MODE_COUNT -} MB_PREDICTION_MODE; - -// Segment level features. -typedef enum { - SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... - SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... - SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame - SEG_LVL_MODE = 3, // Optional Segment mode - SEG_LVL_EOB = 4, // EOB end stop marker. - SEG_LVL_TRANSFORM = 5, // Block transform size. - SEG_LVL_MAX = 6 // Number of MB level features supported - -} SEG_LVL_FEATURES; - -// Segment level features. 
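VP9_COMBINEENTROPYCONTEXTS above collapses the two neighbouring coefficient contexts into 0, 1 or 2 according to how many neighbours carried a non-zero coefficient. A worked example (sketch, not part of the patch):

static int combine_contexts_example(void) {
  ENTROPY_CONTEXT above = 1, left = 0;
  int ctx;
  VP9_COMBINEENTROPYCONTEXTS(ctx, above, left);
  return ctx;  /* 1: exactly one neighbour had a coefficient */
}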
-typedef enum { - TX_4X4, // 4x4 dct transform - TX_8X8, // 8x8 dct transform - TX_16X16, // 16x16 dct transform - TX_SIZE_MAX // Number of different transforms available -} TX_SIZE; - -typedef enum { - DCT_DCT = 0, // DCT in both horizontal and vertical - ADST_DCT = 1, // ADST in vertical, DCT in horizontal - DCT_ADST = 2, // DCT in vertical, ADST in horizontal - ADST_ADST = 3 // ADST in both directions -} TX_TYPE; - -#define VP9_YMODES (B_PRED + 1) -#define VP9_UV_MODES (TM_PRED + 1) -#define VP9_I8X8_MODES (TM_PRED + 1) -#define VP9_I32X32_MODES (TM_PRED + 1) - -#define VP9_MVREFS (1 + SPLITMV - NEARESTMV) - -#if CONFIG_LOSSLESS -#define WHT_UPSCALE_FACTOR 3 -#define Y2_WHT_UPSCALE_FACTOR 2 -#endif - -typedef enum { - B_DC_PRED, /* average of above and left pixels */ - B_TM_PRED, - - B_VE_PRED, /* vertical prediction */ - B_HE_PRED, /* horizontal prediction */ - - B_LD_PRED, - B_RD_PRED, - - B_VR_PRED, - B_VL_PRED, - B_HD_PRED, - B_HU_PRED, -#if CONFIG_NEWBINTRAMODES - B_CONTEXT_PRED, -#endif - - LEFT4X4, - ABOVE4X4, - ZERO4X4, - NEW4X4, - - B_MODE_COUNT -} B_PREDICTION_MODE; - -#define VP9_BINTRAMODES (LEFT4X4) -#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4) - -#if CONFIG_NEWBINTRAMODES -/* The number of B_PRED intra modes that are replaced by B_CONTEXT_PRED */ -#define CONTEXT_PRED_REPLACEMENTS 0 -#define VP9_KF_BINTRAMODES (VP9_BINTRAMODES - 1) -#define VP9_NKF_BINTRAMODES (VP9_BINTRAMODES - CONTEXT_PRED_REPLACEMENTS) -#else -#define VP9_KF_BINTRAMODES (VP9_BINTRAMODES) /* 10 */ -#define VP9_NKF_BINTRAMODES (VP9_BINTRAMODES) /* 10 */ -#endif - -typedef enum { - PARTITIONING_16X8 = 0, - PARTITIONING_8X16, - PARTITIONING_8X8, - PARTITIONING_4X4, - NB_PARTITIONINGS, -} SPLITMV_PARTITIONING_TYPE; - -/* For keyframes, intra block modes are predicted by the (already decoded) - modes for the Y blocks to the left and above us; for interframes, there - is a single probability table. 
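   As a simplified sketch of the key-frame case (the real lookup lives in
   the decoder and uses vp9_kf_bmode_prob from entropymode.c; the index
   handling here is illustrative):

     above = above_block_mode(mi, i);      (typically B_DC_PRED off the edge)
     left  = left_block_mode(mi, i);
     prob  = vp9_kf_bmode_prob[above][left];

   Inter frames instead read every sub-block mode from the one shared
   probability table.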
- */
-
-union b_mode_info {
-  struct {
-    B_PREDICTION_MODE first;
-    TX_TYPE tx_type;
-#if CONFIG_COMP_INTRA_PRED
-    B_PREDICTION_MODE second;
-#endif
-#if CONFIG_NEWBINTRAMODES
-    B_PREDICTION_MODE context;
-#endif
-  } as_mode;
-  struct {
-    int_mv first;
-    int_mv second;
-  } as_mv;
-};
-
-typedef enum {
-  NONE = -1,
-  INTRA_FRAME = 0,
-  LAST_FRAME = 1,
-  GOLDEN_FRAME = 2,
-  ALTREF_FRAME = 3,
-  MAX_REF_FRAMES = 4
-} MV_REFERENCE_FRAME;
-
-typedef struct {
-  MB_PREDICTION_MODE mode, uv_mode;
-#if CONFIG_COMP_INTRA_PRED
-  MB_PREDICTION_MODE second_mode, second_uv_mode;
-#endif
-#if CONFIG_COMP_INTERINTRA_PRED
-  MB_PREDICTION_MODE interintra_mode, interintra_uv_mode;
-#endif
-  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
-  TX_SIZE txfm_size;
-  int_mv mv[2]; // for each reference frame used
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
-  int_mv best_mv, best_second_mv;
-#if CONFIG_NEW_MVREF
-  int best_index, best_second_index;
-#endif
-
-  int mb_mode_context[MAX_REF_FRAMES];
-
-  SPLITMV_PARTITIONING_TYPE partitioning;
-  unsigned char mb_skip_coeff; /* does this mb have coefficients at all; 1=no coefficients, 0=tokens need to be decoded */
-  unsigned char need_to_clamp_mvs;
-  unsigned char need_to_clamp_secondmv;
-  unsigned char segment_id; /* Which set of segmentation parameters should be used for this MB */
-
-  // Flags used for prediction status of various bitstream signals
-  unsigned char seg_id_predicted;
-  unsigned char ref_predicted;
-
-  // Indicates if the mb is part of the image (1) vs border (0)
-  // This can be useful in determining whether the MB provides
-  // a valid predictor
-  unsigned char mb_in_image;
-
-#if CONFIG_PRED_FILTER
-  // Flag to turn the prediction signal filter on (1) / off (0) at the MB level
-  unsigned int pred_filter_enabled;
-#endif
-  INTERPOLATIONFILTERTYPE interp_filter;
-
-#if CONFIG_SUPERBLOCKS
-  // FIXME need a SB array of 4 MB_MODE_INFOs that
-  // only needs one encoded_as_sb.
-  unsigned char encoded_as_sb;
-#endif
-} MB_MODE_INFO;
-
-typedef struct {
-  MB_MODE_INFO mbmi;
-  union b_mode_info bmi[16];
-} MODE_INFO;
-
-typedef struct blockd {
-  short *qcoeff;
-  short *dqcoeff;
-  unsigned char *predictor;
-  short *diff;
-  short *dequant;
-
-  /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
-  unsigned char **base_pre;
-  unsigned char **base_second_pre;
-  int pre;
-  int pre_stride;
-
-  unsigned char **base_dst;
-  int dst;
-  int dst_stride;
-
-  int eob;
-
-  union b_mode_info bmi;
-} BLOCKD;
-
-typedef struct macroblockd {
-  DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
-  DECLARE_ALIGNED(16, unsigned char, predictor[384]);
-  DECLARE_ALIGNED(16, short, qcoeff[400]);
-  DECLARE_ALIGNED(16, short, dqcoeff[400]);
-  DECLARE_ALIGNED(16, unsigned short, eobs[25]);
-
-  /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
-  BLOCKD block[25];
-  int fullpixel_mask;
-
-  YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
-  struct {
-    uint8_t *y_buffer, *u_buffer, *v_buffer;
-  } second_pre;
-  YV12_BUFFER_CONFIG dst;
-
-  MODE_INFO *prev_mode_info_context;
-  MODE_INFO *mode_info_context;
-  int mode_info_stride;
-
-  FRAME_TYPE frame_type;
-
-  int up_available;
-  int left_available;
-
-  /* Y,U,V,Y2 */
-  ENTROPY_CONTEXT_PLANES *above_context;
-  ENTROPY_CONTEXT_PLANES *left_context;
-
-  /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
-  unsigned char segmentation_enabled;
-
-  /* 0 (do not update) 1 (update) the macroblock segmentation map. */
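/* Aside (illustrative, not part of the patch): union b_mode_info above
 * overlays intra and inter per-4x4 state, e.g.
 *   bmi.as_mode.first = B_DC_PRED;         intra sub-mode
 *   bmi.as_mv.first.as_mv.row = 4;         inter motion vector component
 * so the 16-entry bmi[] array in MODE_INFO serves both block types. */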
-  unsigned char update_mb_segmentation_map;
-
-  /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
-  unsigned char update_mb_segmentation_data;
-
-  /* 0 (feature data is delta coded per segment) 1 (feature data is an absolute value). */
-  unsigned char mb_segment_abs_delta;
-
-  /* Per-frame flags that define which MB-level features (such as quantizer or
-     loop filter level) are enabled, and, when enabled, the probabilities used
-     to decode the per-MB flags in MB_MODE_INFO. */
-
-  // Probability Tree used to code Segment number
-  vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
-
-#if CONFIG_NEW_MVREF
-  vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
-#endif
-
-  // Segment features
-  signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
-  unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
-
-  /* Mode-based loop filter adjustment */
-  unsigned char mode_ref_lf_delta_enabled;
-  unsigned char mode_ref_lf_delta_update;
-
-  /* Delta values have the range +/- MAX_LOOP_FILTER */
-  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
-  signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
-  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
-  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
-
-  /* Distance of MB away from frame edges */
-  int mb_to_left_edge;
-  int mb_to_right_edge;
-  int mb_to_top_edge;
-  int mb_to_bottom_edge;
-
-  unsigned int frames_since_golden;
-  unsigned int frames_till_alt_ref_frame;
-
-  /* Inverse transform function pointers. */
-  void (*inv_xform4x4_1_x8)(short *input, short *output, int pitch);
-  void (*inv_xform4x4_x8)(short *input, short *output, int pitch);
-  void (*inv_walsh4x4_1)(short *in, short *out);
-  void (*inv_walsh4x4_lossless)(short *in, short *out);
-
-
-  vp9_subpix_fn_t subpixel_predict;
-  vp9_subpix_fn_t subpixel_predict8x4;
-  vp9_subpix_fn_t subpixel_predict8x8;
-  vp9_subpix_fn_t subpixel_predict16x16;
-  vp9_subpix_fn_t subpixel_predict_avg;
-  vp9_subpix_fn_t subpixel_predict_avg8x4;
-  vp9_subpix_fn_t subpixel_predict_avg8x8;
-  vp9_subpix_fn_t subpixel_predict_avg16x16;
-  int allow_high_precision_mv;
-
-  int corrupted;
-
-#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
-  /* This is an intermediate buffer currently used in sub-pixel motion search
-   * to keep a copy of the reference area. This buffer can be used for other
-   * purposes.
- */ - DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]); -#endif - -#if CONFIG_RUNTIME_CPU_DETECT - struct VP9_COMMON_RTCD *rtcd; -#endif - - int mb_index; // Index of the MB in the SB (0..3) - int q_index; - -} MACROBLOCKD; - -#define ACTIVE_HT 110 // quantization stepsize threshold - -#define ACTIVE_HT8 300 - -#define ACTIVE_HT16 300 - -// convert MB_PREDICTION_MODE to B_PREDICTION_MODE -static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { - B_PREDICTION_MODE b_mode; - switch (mode) { - case DC_PRED: - b_mode = B_DC_PRED; - break; - case V_PRED: - b_mode = B_VE_PRED; - break; - case H_PRED: - b_mode = B_HE_PRED; - break; - case TM_PRED: - b_mode = B_TM_PRED; - break; - case D45_PRED: - b_mode = B_LD_PRED; - break; - case D135_PRED: - b_mode = B_RD_PRED; - break; - case D117_PRED: - b_mode = B_VR_PRED; - break; - case D153_PRED: - b_mode = B_HD_PRED; - break; - case D27_PRED: - b_mode = B_HU_PRED; - break; - case D63_PRED: - b_mode = B_VL_PRED; - break; - default : - // for debug purpose, to be removed after full testing - assert(0); - break; - } - return b_mode; -} - -// transform mapping -static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) { - // map transform type - TX_TYPE tx_type; - switch (bmode) { - case B_TM_PRED : - case B_RD_PRED : - tx_type = ADST_ADST; - break; - - case B_VE_PRED : - case B_VR_PRED : - tx_type = ADST_DCT; - break; - - case B_HE_PRED : - case B_HD_PRED : - case B_HU_PRED : - tx_type = DCT_ADST; - break; - -#if CONFIG_NEWBINTRAMODES - case B_CONTEXT_PRED: - assert(0); - break; -#endif - - default : - tx_type = DCT_DCT; - break; - } - return tx_type; -} - -static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { - TX_TYPE tx_type = DCT_DCT; - if (xd->mode_info_context->mbmi.mode == B_PRED && - xd->q_index < ACTIVE_HT) { - tx_type = txfm_map( -#if CONFIG_NEWBINTRAMODES - b->bmi.as_mode.first == B_CONTEXT_PRED ? b->bmi.as_mode.context : -#endif - b->bmi.as_mode.first); - } - return tx_type; -} - -static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) { - TX_TYPE tx_type = DCT_DCT; - if (xd->mode_info_context->mbmi.mode == I8X8_PRED && - xd->q_index < ACTIVE_HT8) { - // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged - // or the relationship otherwise modified to address this type conversion. 
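/* Summary of the mapping txfm_map() above implements: vertical-ish intra
 * modes (B_VE_PRED, B_VR_PRED) take ADST_DCT, i.e. ADST down the columns,
 * where the residual grows away from the predictor, and DCT across the rows;
 * horizontal modes (B_HE_PRED, B_HD_PRED, B_HU_PRED) take the transposed
 * DCT_ADST; B_TM_PRED and B_RD_PRED take ADST_ADST; all else is DCT_DCT. */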
-    tx_type = txfm_map(pred_mode_conv(
-                         (MB_PREDICTION_MODE)b->bmi.as_mode.first));
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
-#if CONFIG_SUPERBLOCKS
-      !xd->mode_info_context->mbmi.encoded_as_sb &&
-#endif
-      xd->q_index < ACTIVE_HT16) {
-    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
-  }
-  return tx_type;
-}
-
-static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
-  TX_TYPE tx_type = DCT_DCT;
-  int ib = (int)(b - xd->block);
-  if (ib >= 16)
-    return tx_type;
-  if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
-    tx_type = get_tx_type_16x16(xd, b);
-  }
-  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
-    ib = (ib & 8) + ((ib & 4) >> 1);
-    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
-  }
-  if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
-    tx_type = get_tx_type_4x4(xd, b);
-  }
-  return tx_type;
-}
-
-extern void vp9_build_block_doffsets(MACROBLOCKD *xd);
-extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);
-
-static void update_blockd_bmi(MACROBLOCKD *xd) {
-  int i;
-  int is_4x4;
-  is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
-           (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||
-           (xd->mode_info_context->mbmi.mode == B_PRED);
-
-  if (is_4x4) {
-    for (i = 0; i < 16; i++) {
-      xd->block[i].bmi = xd->mode_info_context->bmi[i];
-    }
-  }
-}
-#endif /* __INC_BLOCKD_H */
diff --git a/vp9/common/coefupdateprobs.h b/vp9/common/coefupdateprobs.h
deleted file mode 100644
index 185bc6d84..000000000
--- a/vp9/common/coefupdateprobs.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/* Update probabilities for the nodes in the token entropy tree.
-   Generated file included by entropy.c */
-#define COEF_UPDATE_PROB 252
-#define COEF_UPDATE_PROB_8X8 252
-#define COEF_UPDATE_PROB_16X16 252
diff --git a/vp9/common/common.h b/vp9/common/common.h
deleted file mode 100644
index bd34bf9b1..000000000
--- a/vp9/common/common.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef common_h
-#define common_h 1
-
-#include <assert.h>
-#include "vpx_config.h"
-/* Interface header for common constant data structures and lookup tables */
-
-#include "vpx_mem/vpx_mem.h"
-
-#include "common_types.h"
-
-/* Only need this for fixed-size arrays, for structs just assign. */
-
-#define vp9_copy( Dest, Src) { \
-    assert( sizeof( Dest) == sizeof( Src)); \
-    vpx_memcpy( Dest, Src, sizeof( Src)); \
-  }
-
-/* Use this for variably-sized arrays.
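   For example (sketch, assuming matching element types):

     int a[4], b[4];
     vp9_copy(a, b);            whole fixed-size array, sizes asserted
     vp9_copy_array(a, b, 4);   first N elements, element sizes asserted

   Both expand to an assert() plus vpx_memcpy().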
*/ - -#define vp9_copy_array( Dest, Src, N) { \ - assert( sizeof( *Dest) == sizeof( *Src)); \ - vpx_memcpy( Dest, Src, N * sizeof( *Src)); \ - } - -#define vp9_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest)); - -#define vp9_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest)); - -#endif /* common_h */ diff --git a/vp9/common/common_types.h b/vp9/common/common_types.h deleted file mode 100644 index 4e6248697..000000000 --- a/vp9/common/common_types.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_COMMON_TYPES -#define __INC_COMMON_TYPES - -#define TRUE 1 -#define FALSE 0 - -#endif diff --git a/vp9/common/context.c b/vp9/common/context.c deleted file mode 100644 index 181a27f3e..000000000 --- a/vp9/common/context.c +++ /dev/null @@ -1,397 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "entropy.h" - -/* *** GENERATED FILE: DO NOT EDIT *** */ - -#if 0 -int Contexts[vp8_coef_counter_dimen]; - -const int default_contexts[vp8_coef_counter_dimen] = { - { - // Block Type ( 0 ) - { - // Coeff Band ( 0 ) - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - }, - { - // Coeff Band ( 1 ) - {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593}, - {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987}, - {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104}, - }, - { - // Coeff Band ( 2 ) - {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0}, - {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294}, - {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879}, - }, - { - // Coeff Band ( 3 ) - {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0}, - {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302}, - { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611}, - }, - { - // Coeff Band ( 4 ) - {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0}, - {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073}, - { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50}, - }, - { - // Coeff Band ( 5 ) - {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0}, - {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362}, - { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190}, - }, - { - // Coeff Band ( 6 ) - {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0}, - {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164}, - { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345}, - }, - { - // Coeff Band ( 7 ) - { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319}, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8}, - }, - }, - { - // Block Type ( 1 ) - { - // Coeff Band ( 0 ) - {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289}, - {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914}, - {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620}, - }, - { - // Coeff Band ( 1 ) - {12419, 8420, 452, 62, 9, 1, 0, 0, 0, 
0, 0, 0}, - {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988}, - {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136}, - }, - { - // Coeff Band ( 2 ) - {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0}, - {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980}, - {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429}, - }, - { - // Coeff Band ( 3 ) - {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0}, - {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820}, - {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679}, - }, - { - // Coeff Band ( 4 ) - {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0}, - {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127}, - { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101}, - }, - { - // Coeff Band ( 5 ) - {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0}, - {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157}, - { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198}, - }, - { - // Coeff Band ( 6 ) - {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0}, - {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195}, - { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507}, - }, - { - // Coeff Band ( 7 ) - { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641}, - { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30}, - }, - }, - { - // Block Type ( 2 ) - { - // Coeff Band ( 0 ) - { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798}, - {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837}, - {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122}, - }, - { - // Coeff Band ( 1 ) - {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0}, - {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063}, - {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047}, - }, - { - // Coeff Band ( 2 ) - { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0}, - { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404}, - { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236}, - }, - { - // Coeff Band ( 3 ) - { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157}, - { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300}, - }, - { - // Coeff Band ( 4 ) - { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427}, - { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7}, - }, - { - // Coeff Band ( 5 ) - { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652}, - { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30}, - }, - { - // Coeff Band ( 6 ) - { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517}, - { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3}, - }, - { - // Coeff Band ( 7 ) - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - }, - }, - { - // Block Type ( 3 ) - { - // Coeff Band ( 0 ) - {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694}, - {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572}, - {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284}, - }, - { - // Coeff Band ( 1 ) - {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0}, - {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280}, - {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460}, - }, - { - // Coeff Band ( 2 ) - {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0}, - {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539}, - {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138}, - }, - { - // Coeff Band ( 3 ) - {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0}, - {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181}, - {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267}, - }, - { - // 
Coeff Band ( 4 ) - {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0}, - {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401}, - {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268}, - }, - { - // Coeff Band ( 5 ) - {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0}, - {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811}, - {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527}, - }, - { - // Coeff Band ( 6 ) - {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0}, - {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954}, - {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979}, - }, - { - // Coeff Band ( 7 ) - { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459}, - { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13}, - }, - }, -}; - -// Update probabilities for the nodes in the token entropy tree. -const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = { - { - { - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, - {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, - {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, }, - {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, }, - {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - }, - { - { - {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, }, - {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, }, - }, - { - {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - 
}, - { - {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, - {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - }, - { - { - {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, }, - {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, }, - {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, }, - }, - { - {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - }, - { - { - {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, }, - {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, - {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, - {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, - {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, - {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, - {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, - {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - { - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, - }, - }, -}; -#endif diff --git a/vp9/common/debugmodes.c 
b/vp9/common/debugmodes.c
deleted file mode 100644
index 5cd84c51b..000000000
--- a/vp9/common/debugmodes.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdio.h>
-#include "blockd.h"
-
-void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
-                                        int frame) {
-  int mb_row;
-  int mb_col;
-  int mb_index = 0;
-  FILE *mvs = fopen("mvs.stt", "a");
-
-  /* print out the macroblock Y modes */
-  mb_index = 0;
-  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
-
-      mb_index++;
-    }
-
-    fprintf(mvs, "\n");
-    mb_index++;
-  }
-
-  fprintf(mvs, "\n");
-
-  mb_index = 0;
-  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
-
-      mb_index++;
-    }
-
-    fprintf(mvs, "\n");
-    mb_index++;
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the macroblock UV modes */
-  mb_index = 0;
-  fprintf(mvs, "UV Modes for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-
-      fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
-
-      mb_index++;
-    }
-
-    mb_index++;
-    fprintf(mvs, "\n");
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the block modes */
-  mb_index = 0;
-  fprintf(mvs, "Mbs for Frame %d\n", frame);
-  {
-    int b_row;
-
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
-
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-
-        if (mi[mb_index].mbmi.mode == B_PRED) {
-          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
-#if CONFIG_COMP_INTRA_PRED
-          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
-#endif
-        } else
-          fprintf(mvs, "xx ");
-
-      }
-
-      fprintf(mvs, "\n");
-    }
-  }
-  fprintf(mvs, "\n");
-
-  /* print out the macroblock mvs */
-  mb_index = 0;
-  fprintf(mvs, "MVs for Frame %d\n", frame);
-
-  for (mb_row = 0; mb_row < rows; mb_row++) {
-    for (mb_col = 0; mb_col < cols; mb_col++) {
-      fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
-              mi[mb_index].mbmi.mv[0].as_mv.col / 2);
-
-      mb_index++;
-    }
-
-    mb_index++;
-    fprintf(mvs, "\n");
-  }
-
-  fprintf(mvs, "\n");
-
-  /* print out the block mvs */
-  mb_index = 0;
-  fprintf(mvs, "MVs for Frame %d\n", frame);
-  {
-    int b_row;
-
-    for (b_row = 0; b_row < 4 * rows; b_row++) {
-      int b_col;
-      int bindex;
-
-      for (b_col = 0; b_col < 4 * cols; b_col++) {
-        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
-        bindex = (b_row & 3) * 4 + (b_col & 3);
-        fprintf(mvs, "%3d:%-3d ",
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
-                mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);

-      }
-
-      fprintf(mvs, "\n");
-    }
-  }
-  fprintf(mvs, "\n");
-
-  fclose(mvs);
-}
diff --git a/vp9/common/default_coef_probs.h b/vp9/common/default_coef_probs.h
deleted file mode 100644
index 4caf8818d..000000000
--- a/vp9/common/default_coef_probs.h
+++ /dev/null
@@ -1,1377 +0,0 @@
-/*
- * Copyright (c) 2010 The
WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. -*/ - - -/*Generated file, included by entropy.c*/ - - -static const vp9_prob default_coef_probs [BLOCK_TYPES] - [COEF_BANDS] - [PREV_COEF_CONTEXTS] - [ENTROPY_NODES] = { - { - /* Block Type ( 0 ) */ - { - /* Coeff Band ( 0 )*/ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 1 )*/ - { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, - { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, - { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }, - { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 }, - }, - { - /* Coeff Band ( 2 )*/ - { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, - { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, - { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }, - { 64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 }, - }, - { - /* Coeff Band ( 3 )*/ - { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, - { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, - { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }, - { 64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 4 )*/ - { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, - { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, - { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }, - { 28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 }, - }, - { - /* Coeff Band ( 5 )*/ - { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, - { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 }, - { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }, - { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 6 )*/ - { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, - { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, - { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }, - { 64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 7 )*/ - { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - } - }, - { - /* Block Type ( 1 ) */ - { - /* Coeff Band ( 0 )*/ - { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, - { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, - { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }, - { 48, 32, 146, 208, 149, 167, 221, 162, 255, 223, 128 }, - }, - { - /* Coeff Band ( 1 )*/ - { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, - { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, - { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }, - { 66, 90, 181, 242, 176, 190, 249, 202, 255, 255, 128 }, - }, - { - /* Coeff Band ( 2 )*/ - { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, - { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, - { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }, - { 18, 80, 
163, 242, 170, 187, 247, 210, 255, 255, 128 }, - }, - { - /* Coeff Band ( 3 )*/ - { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, - { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, - { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }, - { 36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 }, - }, - { - /* Coeff Band ( 4 )*/ - { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, - { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, - { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }, - { 18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 }, - }, - { - /* Coeff Band ( 5 )*/ - { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, - { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, - { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }, - { 28, 70, 181, 251, 193, 211, 255, 205, 128, 128, 128 }, - }, - { - /* Coeff Band ( 6 )*/ - { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 }, - { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, - { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }, - { 40, 90, 188, 251, 195, 217, 255, 224, 128, 128, 128 }, - }, - { - /* Coeff Band ( 7 )*/ - { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 }, - { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }, - { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }, - } - }, - { - /* Block Type ( 2 ) */ - { - /* Coeff Band ( 0 )*/ - { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 }, - { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, - { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }, - { 64, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }, - }, - { - /* Coeff Band ( 1 )*/ - { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, - { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, - { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }, - { 140, 70, 195, 248, 188, 195, 255, 255, 128, 128, 128 }, - }, - { - /* Coeff Band ( 2 )*/ - { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, - { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, - { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }, - { 60, 40, 190, 239, 201, 218, 255, 228, 128, 128, 128 }, - }, - { - /* Coeff Band ( 3 )*/ - { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, - { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 4 )*/ - { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, - { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 5 )*/ - { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 6 )*/ - { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 48, 85, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 7 )*/ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - } - }, - { - /* Block Type ( 3 ) */ - { - /* Coeff Band ( 0 )*/ - { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, - { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, - { 63, 48, 138, 219, 151, 178, 240, 170, 255, 216, 128 }, - { 54, 40, 138, 219, 151, 178, 240, 170, 255, 216, 128 }, - }, - { - /* Coeff Band ( 1 )*/ - { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, - { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, - { 44, 84, 162, 232, 172, 180, 245, 178, 255, 255, 128 }, - { 32, 70, 162, 232, 172, 180, 245, 178, 255, 255, 128 }, - }, - { - /* Coeff Band ( 2 )*/ - { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, - { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, - { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }, - { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }, - }, - { - /* Coeff Band ( 3 )*/ - { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 }, - { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, - { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }, - { 26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 }, - }, - { - /* Coeff Band ( 4 )*/ - { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, - { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, - { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }, - { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }, - }, - { - /* Coeff Band ( 5 )*/ - { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, - { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, - { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }, - { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }, - }, - { - /* Coeff Band ( 6 )*/ - { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, - { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, - { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }, - { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }, - }, - { - /* Coeff Band ( 7 )*/ - { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - } - } -}; - -static const vp9_prob default_hybrid_coef_probs [BLOCK_TYPES] - [COEF_BANDS] - [PREV_COEF_CONTEXTS] - [ENTROPY_NODES] = { - { - /* Block Type ( 0 ) */ - { - /* Coeff Band ( 0 )*/ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 1 )*/ - { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, - { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, - { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }, - { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 }, - }, - { - /* Coeff Band ( 2 )*/ - { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, - { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, - { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }, - { 64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 }, - }, - { - /* Coeff Band ( 3 )*/ - { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, - { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, - { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }, - { 64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 4 )*/ - { 1, 
101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, - { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, - { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }, - { 28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 }, - }, - { - /* Coeff Band ( 5 )*/ - { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, - { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 }, - { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }, - { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 6 )*/ - { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, - { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, - { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }, - { 64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 7 )*/ - { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - } - }, - { - /* Block Type ( 1 ) */ - { - /* Coeff Band ( 0 )*/ - { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, - { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, - { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }, - { 48, 32, 146, 208, 149, 167, 221, 162, 255, 223, 128 }, - }, - { - /* Coeff Band ( 1 )*/ - { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, - { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, - { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }, - { 66, 90, 181, 242, 176, 190, 249, 202, 255, 255, 128 }, - }, - { - /* Coeff Band ( 2 )*/ - { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, - { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, - { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }, - { 18, 80, 163, 242, 170, 187, 247, 210, 255, 255, 128 }, - }, - { - /* Coeff Band ( 3 )*/ - { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, - { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, - { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }, - { 36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 }, - }, - { - /* Coeff Band ( 4 )*/ - { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, - { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, - { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }, - { 18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 }, - }, - { - /* Coeff Band ( 5 )*/ - { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, - { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, - { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }, - { 28, 70, 181, 251, 193, 211, 255, 205, 128, 128, 128 }, - }, - { - /* Coeff Band ( 6 )*/ - { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 }, - { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, - { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }, - { 40, 90, 188, 251, 195, 217, 255, 224, 128, 128, 128 }, - }, - { - /* Coeff Band ( 7 )*/ - { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 }, - { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }, - { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }, - } - }, - { - /* Block Type ( 2 ) */ - { - /* Coeff Band ( 0 )*/ - { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 }, - { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, - { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }, - { 64, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }, - }, - 
{ - /* Coeff Band ( 1 )*/ - { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, - { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, - { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }, - { 140, 70, 195, 248, 188, 195, 255, 255, 128, 128, 128 }, - }, - { - /* Coeff Band ( 2 )*/ - { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, - { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, - { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }, - { 60, 40, 190, 239, 201, 218, 255, 228, 128, 128, 128 }, - }, - { - /* Coeff Band ( 3 )*/ - { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, - { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 4 )*/ - { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, - { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 5 )*/ - { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 6 )*/ - { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, - { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 48, 85, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - }, - { - /* Coeff Band ( 7 )*/ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, - } - }, - { - /* Block Type ( 3 ) */ - { - /* Coeff Band ( 0 )*/ - { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, - { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, - { 63, 48, 138, 219, 151, 178, 240, 170, 255, 216, 128 }, - { 54, 40, 138, 219, 151, 178, 240, 170, 255, 216, 128 }, - }, - { - /* Coeff Band ( 1 )*/ - { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, - { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, - { 44, 84, 162, 232, 172, 180, 245, 178, 255, 255, 128 }, - { 32, 70, 162, 232, 172, 180, 245, 178, 255, 255, 128 }, - }, - { - /* Coeff Band ( 2 )*/ - { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, - { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, - { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }, - { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }, - }, - { - /* Coeff Band ( 3 )*/ - { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 }, - { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, - { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }, - { 26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 }, - }, - { - /* Coeff Band ( 4 )*/ - { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, - { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, - { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }, - { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }, - }, - { - /* Coeff Band ( 5 )*/ - { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, - { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, - { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }, - { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 
128 }, - }, - { - /* Coeff Band ( 6 )*/ - { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, - { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, - { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }, - { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }, - }, - { - /* Coeff Band ( 7 )*/ - { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, - } - } -}; - -static const vp9_prob -default_coef_probs_8x8[BLOCK_TYPES_8X8] -[COEF_BANDS] -[PREV_COEF_CONTEXTS] -[ENTROPY_NODES] = { - { - /* block Type 0 */ - { - /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 1 */ - { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128}, - { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128}, - { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}, - { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128} - }, - { - /* Coeff Band 2 */ - { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128}, - { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128}, - { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}, - { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128} - }, - { - /* Coeff Band 3 */ - { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128}, - { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128}, - { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}, - { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128} - }, - { - /* Coeff Band 4 */ - { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128}, - { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128}, - { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}, - { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128} - }, - { - /* Coeff Band 5 */ - { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128}, - { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} - }, - { - /* Coeff Band 6 */ - { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128}, - { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} - }, - { - /* Coeff Band 7 */ - { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128}, - { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128}, - { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}, - { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128} - } - }, - { - /* block Type 1 */ - { - /* Coeff Band 0 */ - { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128}, - { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128}, - { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128} - }, - { - /* Coeff Band 1 */ - { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128}, - { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128}, - { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}, - { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128} - }, - { - /* Coeff Band 2 */ - { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128}, - { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128}, - { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}, - { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 
128} - }, - { - /* Coeff Band 3 */ - { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128}, - { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128}, - { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}, - { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - } - }, - { - /* block Type 2 */ - { - /* Coeff Band 0 */ - { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128}, - { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128}, - { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}, - { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128} - }, - { - /* Coeff Band 1 */ - { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128}, - { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128}, - { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}, - { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128} - }, - { - /* Coeff Band 2 */ - { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128}, - { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128}, - { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}, - { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128} - }, - { - /* Coeff Band 3 */ - { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128}, - { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128}, - { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}, - { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128} - }, - { - /* Coeff Band 4 */ - { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128}, - { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128}, - { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}, - { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128} - }, - { - /* Coeff Band 5 */ - { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128}, - { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128}, - { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}, - { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128} - }, - { - /* Coeff Band 6 */ - { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128}, - { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128}, - { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}, - { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128} - }, - { - /* Coeff Band 7 */ - { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128}, - { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128}, - { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}, - { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128} - } - }, - { /* block Type 3 */ - { /* Coeff Band 0 */ - { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255}, - 
{ 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255}, - { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128}, - { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128}, - { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128}, - { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128} - }, - { /* Coeff Band 2 */ - { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128}, - { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128}, - { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128}, - { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128} - }, - { /* Coeff Band 3 */ - { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128}, - { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128}, - { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128}, - { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128} - }, - { /* Coeff Band 4 */ - { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128}, - { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128}, - { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128}, - { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128} - }, - { /* Coeff Band 5 */ - { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128}, - { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128}, - { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128}, - { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128} - }, - { /* Coeff Band 6 */ - { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128}, - { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128}, - { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128}, - { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 7 */ - { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128}, - { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128}, - { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - } - } -}; - -static const vp9_prob -default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8] - [COEF_BANDS] - [PREV_COEF_CONTEXTS] - [ENTROPY_NODES] = { - { - /* block Type 0 */ - { - /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 1 */ - { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128}, - { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128}, - { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}, - { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128} - }, - { - /* Coeff Band 2 */ - { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128}, - { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128}, - { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}, - { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128} - }, - { - /* Coeff Band 3 */ - { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128}, - { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128}, - { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}, - { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128} - }, - { - /* Coeff Band 4 */ - { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128}, - { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128}, - { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}, - { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128} - }, - { - /* Coeff Band 5 */ - { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128}, - { 6, 
154, 191, 255, 218, 240, 255, 255, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} - }, - { - /* Coeff Band 6 */ - { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128}, - { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} - }, - { - /* Coeff Band 7 */ - { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128}, - { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128}, - { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}, - { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128} - } - }, - { - /* block Type 1 */ - { - /* Coeff Band 0 */ - { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128}, - { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128}, - { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128} - }, - { - /* Coeff Band 1 */ - { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128}, - { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128}, - { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}, - { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128} - }, - { - /* Coeff Band 2 */ - { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128}, - { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128}, - { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}, - { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 3 */ - { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128}, - { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128}, - { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}, - { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 4 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 5 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 6 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { - /* Coeff Band 7 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - } - }, - { - /* block Type 2 */ - { - /* Coeff Band 0 */ - { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128}, - { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128}, - { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}, - { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128} - }, - { - /* Coeff Band 1 */ - { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128}, - { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128}, - { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}, - { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128} - }, - { - /* Coeff Band 2 */ - { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128}, - { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128}, - { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}, - { 1, 75, 115, 
234, 142, 173, 255, 225, 255, 255, 128} - }, - { - /* Coeff Band 3 */ - { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128}, - { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128}, - { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}, - { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128} - }, - { - /* Coeff Band 4 */ - { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128}, - { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128}, - { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}, - { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128} - }, - { - /* Coeff Band 5 */ - { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128}, - { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128}, - { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}, - { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128} - }, - { - /* Coeff Band 6 */ - { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128}, - { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128}, - { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}, - { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128} - }, - { - /* Coeff Band 7 */ - { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128}, - { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128}, - { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}, - { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128} - } - }, - { /* block Type 3 */ - { /* Coeff Band 0 */ - { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255}, - { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255}, - { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128}, - { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128}, - { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128}, - { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128} - }, - { /* Coeff Band 2 */ - { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128}, - { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128}, - { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128}, - { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128} - }, - { /* Coeff Band 3 */ - { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128}, - { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128}, - { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128}, - { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128} - }, - { /* Coeff Band 4 */ - { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128}, - { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128}, - { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128}, - { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128} - }, - { /* Coeff Band 5 */ - { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128}, - { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128}, - { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128}, - { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128} - }, - { /* Coeff Band 6 */ - { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128}, - { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128}, - { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128}, - { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 7 */ - { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128}, - { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128}, - { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - } - } -}; - -static const vp9_prob - default_coef_probs_16x16[BLOCK_TYPES_16X16] - [COEF_BANDS] - [PREV_COEF_CONTEXTS] - 
[ENTROPY_NODES] = { - { /* block Type 0 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128}, - { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128}, - { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}, - { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128} - }, - { /* Coeff Band 2 */ - { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128}, - { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128}, - { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}, - { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 3 */ - { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128}, - { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128}, - { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}, - { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 4 */ - { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128}, - { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128}, - { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}, - { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128} - }, - { /* Coeff Band 5 */ - { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128}, - { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} - }, - { /* Coeff Band 6 */ - { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128}, - { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} - }, - { /* Coeff Band 7 */ - { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128}, - { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128}, - { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}, - { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128} - } - }, - { /* block Type 1 */ - { /* Coeff Band 0 */ - { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128}, - { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255}, - { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128}, - { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128}, - { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128}, - { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128} - }, - { /* Coeff Band 2 */ - { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128}, - { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128}, - { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128}, - { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128} - }, - { /* Coeff Band 3 */ - { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128}, - { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128}, - { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128}, - { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128} - }, - { /* Coeff Band 4 */ - { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128}, - { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128}, - { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128}, - { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 5 */ - { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128}, - { 95, 133, 228, 254, 218, 
215, 255, 229, 128, 128, 128}, - { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128}, - { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128} - }, - { /* Coeff Band 6 */ - { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128}, - { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128}, - { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128}, - { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128} - }, - { /* Coeff Band 7 */ - { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128}, - { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128}, - { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128}, - { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128} - } - }, - { /* block Type 2 */ - { /* Coeff Band 0 */ - { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128}, - { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255}, - { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128}, - { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128}, - { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128}, - { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128} - }, - { /* Coeff Band 2 */ - { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128}, - { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128}, - { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128}, - { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128} - }, - { /* Coeff Band 3 */ - { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128}, - { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128}, - { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128}, - { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128} - }, - { /* Coeff Band 4 */ - { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128}, - { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128}, - { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128}, - { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 5 */ - { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128}, - { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128}, - { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128}, - { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128} - }, - { /* Coeff Band 6 */ - { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128}, - { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128}, - { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128}, - { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128} - }, - { /* Coeff Band 7 */ - { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128}, - { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128}, - { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128}, - { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128} - } - }, - { /* block Type 3 */ - { /* Coeff Band 0 */ - { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184}, - { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200}, - { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128}, - { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128}, - { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255}, - { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255} - }, - { /* Coeff Band 2 */ - { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128}, - { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255}, - { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255}, - { 2, 35, 61, 157, 113, 149, 208, 142, 
255, 217, 255} - }, - { /* Coeff Band 3 */ - { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128}, - { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128}, - { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128}, - { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255} - }, - { /* Coeff Band 4 */ - { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128}, - { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128}, - { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128}, - { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128} - }, - { /* Coeff Band 5 */ - { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128}, - { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128}, - { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128}, - { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128} - }, - { /* Coeff Band 6 */ - { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128}, - { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128}, - { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128}, - { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255} - }, - { /* Coeff Band 7 */ - { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128}, - { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128}, - { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128}, - { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128} - } - } -}; - -static const vp9_prob - default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16] - [COEF_BANDS] - [PREV_COEF_CONTEXTS] - [ENTROPY_NODES] = { - { /* block Type 0 */ - { /* Coeff Band 0 */ - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128}, - { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128}, - { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}, - { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128} - }, - { /* Coeff Band 2 */ - { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128}, - { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128}, - { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}, - { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 3 */ - { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128}, - { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128}, - { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}, - { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 4 */ - { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128}, - { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128}, - { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}, - { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128} - }, - { /* Coeff Band 5 */ - { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128}, - { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} - }, - { /* Coeff Band 6 */ - { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128}, - { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, - { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} - }, - { /* Coeff Band 7 */ - { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128}, - { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128}, - { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}, - { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128} 
- } - }, - { /* block Type 1 */ - { /* Coeff Band 0 */ - { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128}, - { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255}, - { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128}, - { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128}, - { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128}, - { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128} - }, - { /* Coeff Band 2 */ - { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128}, - { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128}, - { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128}, - { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128} - }, - { /* Coeff Band 3 */ - { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128}, - { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128}, - { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128}, - { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128} - }, - { /* Coeff Band 4 */ - { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128}, - { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128}, - { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128}, - { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 5 */ - { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128}, - { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128}, - { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128}, - { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128} - }, - { /* Coeff Band 6 */ - { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128}, - { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128}, - { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128}, - { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128} - }, - { /* Coeff Band 7 */ - { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128}, - { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128}, - { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128}, - { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128} - } - }, - { /* block Type 2 */ - { /* Coeff Band 0 */ - { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128}, - { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255}, - { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128}, - { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128}, - { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128}, - { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128} - }, - { /* Coeff Band 2 */ - { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128}, - { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128}, - { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128}, - { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128} - }, - { /* Coeff Band 3 */ - { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128}, - { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128}, - { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128}, - { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128} - }, - { /* Coeff Band 4 */ - { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128}, - { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128}, - { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128}, - { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128} - }, - { /* Coeff Band 5 */ - { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128}, - { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 
128}, - { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128}, - { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128} - }, - { /* Coeff Band 6 */ - { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128}, - { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128}, - { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128}, - { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128} - }, - { /* Coeff Band 7 */ - { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128}, - { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128}, - { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128}, - { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128} - } - }, - { /* block Type 3 */ - { /* Coeff Band 0 */ - { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184}, - { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200}, - { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247}, - { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} - }, - { /* Coeff Band 1 */ - { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128}, - { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128}, - { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255}, - { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255} - }, - { /* Coeff Band 2 */ - { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128}, - { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255}, - { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255}, - { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255} - }, - { /* Coeff Band 3 */ - { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128}, - { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128}, - { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128}, - { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255} - }, - { /* Coeff Band 4 */ - { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128}, - { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128}, - { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128}, - { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128} - }, - { /* Coeff Band 5 */ - { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128}, - { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128}, - { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128}, - { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128} - }, - { /* Coeff Band 6 */ - { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128}, - { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128}, - { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128}, - { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255} - }, - { /* Coeff Band 7 */ - { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128}, - { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128}, - { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128}, - { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128} - } - } -}; diff --git a/vp9/common/entropy.c b/vp9/common/entropy.c deleted file mode 100644 index a4ecae09f..000000000 --- a/vp9/common/entropy.c +++ /dev/null @@ -1,447 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include &lt;stdio.h&gt; - -#include "entropy.h" -#include "string.h" -#include "blockd.h" -#include "onyxc_int.h" -#include "entropymode.h" -#include "vpx_mem/vpx_mem.h" - -#define uchar unsigned char /* typedefs can clash */ -#define uint unsigned int - -typedef const uchar cuchar; -typedef const uint cuint; - -typedef vp9_prob Prob; - -#include "coefupdateprobs.h" - -const int vp9_i8x8_block[4] = {0, 2, 8, 10}; - -DECLARE_ALIGNED(16, const unsigned char, vp9_norm[256]) = { - 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]) = { - 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7 -}; - -DECLARE_ALIGNED(16, cuchar, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = { - 0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0 -}; - -DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]) = { - 0, 1, 4, 8, - 5, 2, 3, 6, - 9, 12, 13, 10, - 7, 11, 14, 15, -}; - -DECLARE_ALIGNED(16, const int, vp9_col_scan[16]) = { - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 -}; -DECLARE_ALIGNED(16, const int, vp9_row_scan[16]) = { - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, 14, 15 -}; - - -DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = { 0, 1, 2, 3, 5, 4, 4, 5, - 5, 3, 6, 3, 5, 4, 6, 6, - 6, 5, 5, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7 - }; -DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = { - 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, - 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, - 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, - 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, -}; - -// Table can be optimized.
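[Editorial illustration, not part of the patch: the two 4x4 tables above interact as follows during tokenization. The c-th token of a block reads the coefficient at raster position vp9_default_zig_zag1d[c], while its probability band is vp9_coef_bands[c], indexed by the scan index c rather than by raster position. A minimal standalone C sketch, with the table values copied from the declarations above:

    #include <stdio.h>

    /* Local copies of vp9_default_zig_zag1d and vp9_coef_bands above. */
    static const int zigzag_4x4[16] = {
      0, 1, 4, 8,
      5, 2, 3, 6,
      9, 12, 13, 10,
      7, 11, 14, 15,
    };
    static const int coef_bands_4x4[16] = {
      0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
    };

    int main(void) {
      int c;
      for (c = 0; c < 16; ++c) {
        /* Token c codes the coefficient at raster position zigzag_4x4[c]
           under the probabilities selected by band coef_bands_4x4[c]. */
        printf("scan %2d -> raster %2d, band %d\n",
               c, zigzag_4x4[c], coef_bands_4x4[c]);
      }
      return 0;
    }

The band is a coarsening of scan position, which is why COEF_BANDS is only 8 even though a 4x4 block has 16 coefficients.]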
-DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = { - 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6, - 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, -}; -DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = { - 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5, - 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, 37, 52, - 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, 9, 24, 39, - 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, 85, 70, 55, 40, - 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 192, 177, - 162, 147, 132, 117, 102, 87, 72, 57, 42, 27, 12, 13, 28, 43, 58, 73, - 88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134, - 119, 104, 89, 74, 59, 44, 29, 14, 15, 30, 45, 60, 75, 90, 105, 120, - 135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136, - 121, 106, 91, 76, 61, 46, 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, - 182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, - 78, 63, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, - 215, 200, 185, 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, - 216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, - 203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235, - 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255, -}; - - -/* Array indices are identical to previously-existing CONTEXT_NODE indices */ - -const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ -{ - -DCT_EOB_TOKEN, 2, /* 0 = EOB */ - -ZERO_TOKEN, 4, /* 1 = ZERO */ - -ONE_TOKEN, 6, /* 2 = ONE */ - 8, 12, /* 3 = LOW_VAL */ - -TWO_TOKEN, 10, /* 4 = TWO */ - -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ - 14, 16, /* 6 = HIGH_LOW */ - -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ - 18, 20, /* 8 = CAT_THREEFOUR */ - -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ - -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ -}; - -struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - -/* Trees for extra bits. 
Probabilities are constant and - do not depend on previously encoded bits */ - -static const Prob Pcat1[] = { 159}; -static const Prob Pcat2[] = { 165, 145}; -static const Prob Pcat3[] = { 173, 148, 140}; -static const Prob Pcat4[] = { 176, 155, 140, 135}; -static const Prob Pcat5[] = { 180, 157, 141, 134, 130}; -static const Prob Pcat6[] = -{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129}; - -static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26]; - -static void init_bit_tree(vp9_tree_index *p, int n) { - int i = 0; - - while (++i < n) { - p[0] = p[1] = i << 1; - p += 2; - } - - p[0] = p[1] = 0; -} - -static void init_bit_trees() { - init_bit_tree(cat1, 1); - init_bit_tree(cat2, 2); - init_bit_tree(cat3, 3); - init_bit_tree(cat4, 4); - init_bit_tree(cat5, 5); - init_bit_tree(cat6, 13); -} - -vp9_extra_bit_struct vp9_extra_bits[12] = { - { 0, 0, 0, 0}, - { 0, 0, 0, 1}, - { 0, 0, 0, 2}, - { 0, 0, 0, 3}, - { 0, 0, 0, 4}, - { cat1, Pcat1, 1, 5}, - { cat2, Pcat2, 2, 7}, - { cat3, Pcat3, 3, 11}, - { cat4, Pcat4, 4, 19}, - { cat5, Pcat5, 5, 35}, - { cat6, Pcat6, 13, 67}, - { 0, 0, 0, 0} -}; - -#include "default_coef_probs.h" - -void vp9_default_coef_probs(VP9_COMMON *pc) { - vpx_memcpy(pc->fc.coef_probs, default_coef_probs, - sizeof(pc->fc.coef_probs)); - vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs, - sizeof(pc->fc.hybrid_coef_probs)); - - vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8, - sizeof(pc->fc.coef_probs_8x8)); - vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8, - sizeof(pc->fc.hybrid_coef_probs_8x8)); - - vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16, - sizeof(pc->fc.coef_probs_16x16)); - vpx_memcpy(pc->fc.hybrid_coef_probs_16x16, - default_hybrid_coef_probs_16x16, - sizeof(pc->fc.hybrid_coef_probs_16x16)); -} - -void vp9_coef_tree_initialize() { - init_bit_trees(); - vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); -} - -// #define COEF_COUNT_TESTING - -#define COEF_COUNT_SAT 24 -#define COEF_MAX_UPDATE_FACTOR 112 -#define COEF_COUNT_SAT_KEY 24 -#define COEF_MAX_UPDATE_FACTOR_KEY 112 -#define COEF_COUNT_SAT_AFTER_KEY 24 -#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 - -void vp9_adapt_coef_probs(VP9_COMMON *cm) { - int t, i, j, k, count; - unsigned int branch_ct[ENTROPY_NODES][2]; - vp9_prob coef_probs[ENTROPY_NODES]; - int update_factor; /* denominator 256 */ - int factor; - int count_sat; - - // printf("Frame type: %d\n", cm->frame_type); - if (cm->frame_type == KEY_FRAME) { - update_factor = COEF_MAX_UPDATE_FACTOR_KEY; - count_sat = COEF_COUNT_SAT_KEY; - } else if (cm->last_frame_type == KEY_FRAME) { - update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */ - count_sat = COEF_COUNT_SAT_AFTER_KEY; - } else { - update_factor = COEF_MAX_UPDATE_FACTOR; - count_sat = COEF_COUNT_SAT; - } - -#ifdef COEF_COUNT_TESTING - { - printf("static const unsigned int\ncoef_counts" - "[BLOCK_TYPES] [COEF_BANDS]" - "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n"); - for (i = 0; i < BLOCK_TYPES; ++i) { - printf(" {\n"); - for (j = 0; j < COEF_BANDS; ++j) { - printf(" {\n"); - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - printf(" {"); - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - printf("%d, ", cm->fc.coef_counts[i][j][k][t]); - printf("},\n"); - } - printf(" },\n"); - } - printf(" },\n"); - } - printf("};\n"); - printf("static const unsigned int\ncoef_counts_8x8" - "[BLOCK_TYPES_8X8] [COEF_BANDS]" - "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n"); - for (i = 0; i < 
BLOCK_TYPES_8X8; ++i) { - printf(" {\n"); - for (j = 0; j < COEF_BANDS; ++j) { - printf(" {\n"); - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - printf(" {"); - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]); - printf("},\n"); - } - printf(" },\n"); - } - printf(" },\n"); - } - printf("};\n"); - printf("static const unsigned int\nhybrid_coef_counts" - "[BLOCK_TYPES] [COEF_BANDS]" - "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n"); - for (i = 0; i < BLOCK_TYPES; ++i) { - printf(" {\n"); - for (j = 0; j < COEF_BANDS; ++j) { - printf(" {\n"); - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - printf(" {"); - for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]); - printf("},\n"); - } - printf(" },\n"); - } - printf(" },\n"); - } - printf("};\n"); - } -#endif - - for (i = 0; i < BLOCK_TYPES; ++i) - for (j = 0; j < COEF_BANDS; ++j) - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; - vp9_tree_probs_from_distribution( - MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, - coef_probs, branch_ct, cm->fc.coef_counts [i][j][k], - 256, 1); - for (t = 0; t < ENTROPY_NODES; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > count_sat ? count_sat : count; - factor = (update_factor * count / count_sat); - prob = ((int)cm->fc.pre_coef_probs[i][j][k][t] * (256 - factor) + - (int)coef_probs[t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.coef_probs[i][j][k][t] = 1; - else if (prob > 255) cm->fc.coef_probs[i][j][k][t] = 255; - else cm->fc.coef_probs[i][j][k][t] = prob; - } - } - - for (i = 0; i < BLOCK_TYPES; ++i) - for (j = 0; j < COEF_BANDS; ++j) - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; - vp9_tree_probs_from_distribution( - MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, - coef_probs, branch_ct, cm->fc.hybrid_coef_counts [i][j][k], - 256, 1); - for (t = 0; t < ENTROPY_NODES; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > count_sat ? count_sat : count; - factor = (update_factor * count / count_sat); - prob = ((int)cm->fc.pre_hybrid_coef_probs[i][j][k][t] * (256 - factor) + - (int)coef_probs[t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.hybrid_coef_probs[i][j][k][t] = 1; - else if (prob > 255) cm->fc.hybrid_coef_probs[i][j][k][t] = 255; - else cm->fc.hybrid_coef_probs[i][j][k][t] = prob; - } - } - - for (i = 0; i < BLOCK_TYPES_8X8; ++i) - for (j = 0; j < COEF_BANDS; ++j) - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; - vp9_tree_probs_from_distribution( - MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, - coef_probs, branch_ct, cm->fc.coef_counts_8x8 [i][j][k], - 256, 1); - for (t = 0; t < ENTROPY_NODES; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > count_sat ? 
count_sat : count; - factor = (update_factor * count / count_sat); - prob = ((int)cm->fc.pre_coef_probs_8x8[i][j][k][t] * (256 - factor) + - (int)coef_probs[t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.coef_probs_8x8[i][j][k][t] = 1; - else if (prob > 255) cm->fc.coef_probs_8x8[i][j][k][t] = 255; - else cm->fc.coef_probs_8x8[i][j][k][t] = prob; - } - } - - for (i = 0; i < BLOCK_TYPES_8X8; ++i) - for (j = 0; j < COEF_BANDS; ++j) - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; - vp9_tree_probs_from_distribution( - MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, - coef_probs, branch_ct, cm->fc.hybrid_coef_counts_8x8 [i][j][k], - 256, 1); - for (t = 0; t < ENTROPY_NODES; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > count_sat ? count_sat : count; - factor = (update_factor * count / count_sat); - prob = ((int)cm->fc.pre_hybrid_coef_probs_8x8[i][j][k][t] * - (256 - factor) + - (int)coef_probs[t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 1; - else if (prob > 255) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 255; - else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob; - } - } - - for (i = 0; i < BLOCK_TYPES_16X16; ++i) - for (j = 0; j < COEF_BANDS; ++j) - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; - vp9_tree_probs_from_distribution( - MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, - coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1); - for (t = 0; t < ENTROPY_NODES; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > count_sat ? count_sat : count; - factor = (update_factor * count / count_sat); - prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] * - (256 - factor) + - (int)coef_probs[t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1; - else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255; - else cm->fc.coef_probs_16x16[i][j][k][t] = prob; - } - } - - for (i = 0; i < BLOCK_TYPES_16X16; ++i) - for (j = 0; j < COEF_BANDS; ++j) - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) - continue; - vp9_tree_probs_from_distribution( - MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, - coef_probs, branch_ct, cm->fc.hybrid_coef_counts_16x16[i][j][k], 256, 1); - for (t = 0; t < ENTROPY_NODES; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > count_sat ? count_sat : count; - factor = (update_factor * count / count_sat); - prob = ((int)cm->fc.pre_hybrid_coef_probs_16x16[i][j][k][t] * (256 - factor) + - (int)coef_probs[t] * factor + 128) >> 8; - if (prob <= 0) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 1; - else if (prob > 255) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 255; - else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob; - } - } -} diff --git a/vp9/common/entropy.h b/vp9/common/entropy.h deleted file mode 100644 index fa24a058e..000000000 --- a/vp9/common/entropy.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_ENTROPY_H -#define __INC_ENTROPY_H - -#include "treecoder.h" -#include "blockd.h" -#include "common.h" -#include "coefupdateprobs.h" - -extern const int vp9_i8x8_block[4]; - -/* Coefficient token alphabet */ - -#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ -#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ -#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ -#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ -#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ -#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ -#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ -#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ -#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ -#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ -#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 13+1 */ -#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ -#define MAX_ENTROPY_TOKENS 12 -#define ENTROPY_NODES 11 -#define EOSB_TOKEN 127 /* Not signalled, encoder only */ - -#define INTER_MODE_CONTEXTS 7 - -extern const vp9_tree_index vp9_coef_tree[]; - -extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS]; - -typedef struct { - vp9_tree_p tree; - const vp9_prob *prob; - int Len; - int base_val; -} vp9_extra_bit_struct; - -extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */ - -#define PROB_UPDATE_BASELINE_COST 7 - -#define MAX_PROB 255 -#define DCT_MAX_VALUE 8192 - -/* Coefficients are predicted via a 3-dimensional probability table. */ - -/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */ -#define BLOCK_TYPES 4 - -#define BLOCK_TYPES_8X8 4 - -#define BLOCK_TYPES_16X16 4 - -/* Middle dimension is a coarsening of the coefficient's - position within the 4x4 DCT. */ - -#define COEF_BANDS 8 -extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]); -extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]); -extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]); - -/* Inside dimension is 3-valued measure of nearby complexity, that is, - the extent to which nearby coefficients are nonzero. For the first - coefficient (DC, unless block type is 0), we look at the (already encoded) - blocks above and to the left of the current block. The context index is - then the number (0,1,or 2) of these blocks having nonzero coefficients. - After decoding a coefficient, the measure is roughly the size of the - most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1). - Note that the intuitive meaning of this measure changes as coefficients - are decoded, e.g., prior to the first token, a zero means that my neighbors - are empty while, after the first token, because of the use of end-of-block, - a zero means we just decoded a zero and hence guarantees that a non-zero - coefficient will appear later in this block. However, this shift - in meaning is perfectly OK because our context depends also on the - coefficient band (and since zigzag positions 0, 1, and 2 are in - distinct bands). 
*/ - -/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ -#define PREV_COEF_CONTEXTS 4 - -#define SUBEXP_PARAM 4 /* Subexponential code parameter */ -#define MODULUS_PARAM 13 /* Modulus parameter */ - -extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]); - -struct VP9Common; -void vp9_default_coef_probs(struct VP9Common *); -extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]); - -extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]); -extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]); - -extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]); -void vp9_coef_tree_initialize(void); - -extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]); -void vp9_adapt_coef_probs(struct VP9Common *); - -#endif diff --git a/vp9/common/entropymode.c b/vp9/common/entropymode.c deleted file mode 100644 index 5d3336916..000000000 --- a/vp9/common/entropymode.c +++ /dev/null @@ -1,713 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "onyxc_int.h" -#include "modecont.h" -#include "vpx_mem/vpx_mem.h" - - -static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = { - /* DC V H D45 135 117 153 D27 D63 TM i8x8 BPRED */ - {12, 6, 5, 5, 5, 5, 5, 5, 5, 2, 22, 200}, - {25, 13, 13, 7, 7, 7, 7, 7, 7, 6, 27, 160}, - {31, 17, 18, 8, 8, 8, 8, 8, 8, 9, 26, 139}, - {40, 22, 23, 8, 8, 8, 8, 8, 8, 12, 27, 116}, - {53, 26, 28, 8, 8, 8, 8, 8, 8, 13, 26, 94}, - {68, 33, 35, 8, 8, 8, 8, 8, 8, 17, 20, 68}, - {78, 38, 38, 8, 8, 8, 8, 8, 8, 19, 16, 52}, - {89, 42, 42, 8, 8, 8, 8, 8, 8, 21, 12, 34}, -}; - -static const unsigned int y_mode_cts [VP9_YMODES] = { - /* DC V H D45 135 117 153 D27 D63 TM i8x8 BPRED */ - 98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70 -}; - -static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = { - /* DC V H D45 135 117 153 D27 D63 TM */ - { 200, 15, 15, 10, 10, 10, 10, 10, 10, 6}, /* DC */ - { 130, 75, 10, 10, 10, 10, 10, 10, 10, 6}, /* V */ - { 130, 10, 75, 10, 10, 10, 10, 10, 10, 6}, /* H */ - { 130, 15, 10, 75, 10, 10, 10, 10, 10, 6}, /* D45 */ - { 150, 15, 10, 10, 75, 10, 10, 10, 10, 6}, /* D135 */ - { 150, 15, 10, 10, 10, 75, 10, 10, 10, 6}, /* D117 */ - { 150, 15, 10, 10, 10, 10, 75, 10, 10, 6}, /* D153 */ - { 150, 15, 10, 10, 10, 10, 10, 75, 10, 6}, /* D27 */ - { 150, 15, 10, 10, 10, 10, 10, 10, 75, 6}, /* D63 */ - { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */ - { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */ - { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */ -}; - -static const unsigned int i8x8_mode_cts [VP9_I8X8_MODES] = { - /* DC V H D45 135 117 153 D27 D63 TM */ - 73, 49, 61, 30, 30, 30, 30, 30, 30, 13 -}; - -static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = { - // DC V H D45 135 117 153 D27 D63 TM - { 160, 24, 24, 20, 20, 20, 20, 20, 20, 8}, /* DC */ - { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */ - { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */ - { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */ - { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */ - { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */ - { 102, 33, 20, 20, 20, 20, 64, 20, 20, 
14}, /* D153 */ - { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */ - { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */ - { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */ - { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */ - { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */ -}; - -static const unsigned int bmode_cts[VP9_NKF_BINTRAMODES] = { -#if CONFIG_NEWBINTRAMODES -#if CONTEXT_PRED_REPLACEMENTS == 6 - /* DC TM VE HE CONTEXT */ - 43891, 17694, 10036, 3920, 20000 -#elif CONTEXT_PRED_REPLACEMENTS == 4 - /* DC TM VE HE LD RD CONTEXT */ - 43891, 17694, 10036, 3920, 3363, 2546, 14000 -#elif CONTEXT_PRED_REPLACEMENTS == 0 - /* DC TM VE HE LD RD VR VL HD HU CONTEXT */ - 43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723, 50000 -#endif -#else - /* DC TM VE HE LD RD VR VL HD HU */ - 43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723 -#endif -}; - -typedef enum { - SUBMVREF_NORMAL, - SUBMVREF_LEFT_ZED, - SUBMVREF_ABOVE_ZED, - SUBMVREF_LEFT_ABOVE_SAME, - SUBMVREF_LEFT_ABOVE_ZED -} sumvfref_t; - -int vp9_mv_cont(const int_mv *l, const int_mv *a) { - int lez = (l->as_int == 0); - int aez = (a->as_int == 0); - int lea = (l->as_int == a->as_int); - - if (lea && lez) - return SUBMVREF_LEFT_ABOVE_ZED; - - if (lea) - return SUBMVREF_LEFT_ABOVE_SAME; - - if (aez) - return SUBMVREF_ABOVE_ZED; - - if (lez) - return SUBMVREF_LEFT_ZED; - - return SUBMVREF_NORMAL; -} - -const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25}; - -const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = { - { 147, 136, 18 }, - { 106, 145, 1 }, - { 179, 121, 1 }, - { 223, 1, 34 }, - { 208, 1, 1 } -}; - -vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = { - { - 0, 0, 0, 0, - 0, 0, 0, 0, - 1, 1, 1, 1, - 1, 1, 1, 1, - }, { - 0, 0, 1, 1, - 0, 0, 1, 1, - 0, 0, 1, 1, - 0, 0, 1, 1, - }, { - 0, 0, 1, 1, - 0, 0, 1, 1, - 2, 2, 3, 3, - 2, 2, 3, 3, - }, { - 0, 1, 2, 3, - 4, 5, 6, 7, - 8, 9, 10, 11, - 12, 13, 14, 15, - }, -}; - -const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16}; - -const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150}; - -/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. 
*/ - -const vp9_tree_index vp9_kf_bmode_tree[VP9_KF_BINTRAMODES * 2 - 2] = { - -B_DC_PRED, 2, /* 0 = DC_NODE */ - -B_TM_PRED, 4, /* 1 = TM_NODE */ - -B_VE_PRED, 6, /* 2 = VE_NODE */ - 8, 12, /* 3 = COM_NODE */ - -B_HE_PRED, 10, /* 4 = HE_NODE */ - -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ - -B_LD_PRED, 14, /* 6 = LD_NODE */ - -B_VL_PRED, 16, /* 7 = VL_NODE */ - -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */ -}; - -const vp9_tree_index vp9_bmode_tree[VP9_NKF_BINTRAMODES * 2 - 2] = { -#if CONFIG_NEWBINTRAMODES -#if CONTEXT_PRED_REPLACEMENTS == 6 - -B_DC_PRED, 2, - -B_TM_PRED, 4, - 6, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS), - -B_VE_PRED, -B_HE_PRED -#elif CONTEXT_PRED_REPLACEMENTS == 4 - -B_DC_PRED, 2, - -B_TM_PRED, 4, - 6, 8, - -B_VE_PRED, -B_HE_PRED, - 10, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS), - -B_RD_PRED, -B_LD_PRED, -#elif CONTEXT_PRED_REPLACEMENTS == 0 - -B_DC_PRED, 2, /* 0 = DC_NODE */ - -B_TM_PRED, 4, /* 1 = TM_NODE */ - -B_VE_PRED, 6, /* 2 = VE_NODE */ - 8, 12, /* 3 = COM_NODE */ - -B_HE_PRED, 10, /* 4 = HE_NODE */ - -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ - -B_LD_PRED, 14, /* 6 = LD_NODE */ - -B_VL_PRED, 16, /* 7 = VL_NODE */ - -B_HD_PRED, 18, - -B_HU_PRED, -B_CONTEXT_PRED -#endif -#else - -B_DC_PRED, 2, /* 0 = DC_NODE */ - -B_TM_PRED, 4, /* 1 = TM_NODE */ - -B_VE_PRED, 6, /* 2 = VE_NODE */ - 8, 12, /* 3 = COM_NODE */ - -B_HE_PRED, 10, /* 4 = HE_NODE */ - -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ - -B_LD_PRED, 14, /* 6 = LD_NODE */ - -B_VL_PRED, 16, /* 7 = VL_NODE */ - -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */ -#endif -}; - -/* Again, these trees use the same probability indices as their - explicitly-programmed predecessors. */ -const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = { - 2, 14, - -DC_PRED, 4, - 6, 8, - -D45_PRED, -D135_PRED, - 10, 12, - -D117_PRED, -D153_PRED, - -D27_PRED, -D63_PRED, - 16, 18, - -V_PRED, -H_PRED, - -TM_PRED, 20, - -B_PRED, -I8X8_PRED -}; - -const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = { - 2, 14, - -DC_PRED, 4, - 6, 8, - -D45_PRED, -D135_PRED, - 10, 12, - -D117_PRED, -D153_PRED, - -D27_PRED, -D63_PRED, - 16, 18, - -V_PRED, -H_PRED, - -TM_PRED, 20, - -B_PRED, -I8X8_PRED -}; - -const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = { - 2, 14, - -DC_PRED, 4, - 6, 8, - -D45_PRED, -D135_PRED, - 10, 12, - -D117_PRED, -D153_PRED, - -D27_PRED, -D63_PRED, - -V_PRED, 16, - -H_PRED, -TM_PRED -}; - -const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = { - 2, 14, - -DC_PRED, 4, - 6, 8, - -D45_PRED, -D135_PRED, - 10, 12, - -D117_PRED, -D153_PRED, - -D27_PRED, -D63_PRED, - -V_PRED, 16, - -H_PRED, -TM_PRED -}; - -const vp9_tree_index vp9_mbsplit_tree[6] = { - -PARTITIONING_4X4, 2, - -PARTITIONING_8X8, 4, - -PARTITIONING_16X8, -PARTITIONING_8X16, -}; - -const vp9_tree_index vp9_mv_ref_tree[8] = { - -ZEROMV, 2, - -NEARESTMV, 4, - -NEARMV, 6, - -NEWMV, -SPLITMV -}; - -#if CONFIG_SUPERBLOCKS -const vp9_tree_index vp9_sb_mv_ref_tree[6] = { - -ZEROMV, 2, - -NEARESTMV, 4, - -NEARMV, -NEWMV -}; -#endif - -const vp9_tree_index vp9_sub_mv_ref_tree[6] = { - -LEFT4X4, 2, - -ABOVE4X4, 4, - -ZERO4X4, -NEW4X4 -}; - -struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES]; -struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES]; -struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES]; -#if CONFIG_SUPERBLOCKS -struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES]; -struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES]; -#endif -struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES]; 
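[Editorial illustration, not part of the patch: the six nearly identical loops in vp9_adapt_coef_probs earlier in this patch all reduce to one scalar update — saturate the branch count, turn it into a blend factor out of 256, and mix the previous probability with the newly measured one. A standalone sketch of that arithmetic, assuming the inter-frame constants COEF_COUNT_SAT = 24 and COEF_MAX_UPDATE_FACTOR = 112; adapt_prob is an illustrative name, not a function from the patch:

    #include <stdio.h>

    #define COUNT_SAT 24          /* mirrors COEF_COUNT_SAT */
    #define MAX_UPDATE_FACTOR 112 /* mirrors COEF_MAX_UPDATE_FACTOR */

    static unsigned char adapt_prob(unsigned char pre_prob,
                                    unsigned char new_prob,
                                    unsigned int count) {
      unsigned int factor;
      int prob;
      if (count > COUNT_SAT) count = COUNT_SAT;       /* saturate the count */
      factor = MAX_UPDATE_FACTOR * count / COUNT_SAT; /* blend weight / 256 */
      prob = ((int)pre_prob * (256 - factor) +
              (int)new_prob * factor + 128) >> 8;     /* rounded mix */
      if (prob < 1) prob = 1;   /* probabilities must stay in [1, 255] */
      if (prob > 255) prob = 255;
      return (unsigned char)prob;
    }

    int main(void) {
      /* 12 observations is half saturation, so factor = 56 and the new
         estimate gets weight 56/256: (128*200 + 255*56 + 128) >> 8 = 156. */
      printf("%d\n", adapt_prob(128, 255, 12));
      return 0;
    }

With few observations the prior dominates; once count reaches COUNT_SAT the new estimate's weight caps at 112/256, so adaptation stays gradual even over long runs.]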
-struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES]; -struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES]; -struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS]; - -struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS]; -#if CONFIG_SUPERBLOCKS -struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS]; -#endif -struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS]; - -void vp9_init_mbmode_probs(VP9_COMMON *x) { - unsigned int bct [VP9_YMODES] [2]; /* num Ymodes > num UV modes */ - - vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings, - vp9_ymode_tree, x->fc.ymode_prob, - bct, y_mode_cts, 256, 1); -#if CONFIG_SUPERBLOCKS - vp9_tree_probs_from_distribution(VP9_I32X32_MODES, vp9_sb_ymode_encodings, - vp9_sb_ymode_tree, x->fc.sb_ymode_prob, - bct, y_mode_cts, 256, 1); -#endif - { - int i; - for (i = 0; i < 8; i++) { - vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings, - vp9_kf_ymode_tree, x->kf_ymode_prob[i], - bct, kf_y_mode_cts[i], 256, 1); -#if CONFIG_SUPERBLOCKS - vp9_tree_probs_from_distribution(VP9_I32X32_MODES, - vp9_sb_kf_ymode_encodings, - vp9_sb_kf_ymode_tree, - x->sb_kf_ymode_prob[i], bct, - kf_y_mode_cts[i], 256, 1); -#endif - } - } - { - int i; - for (i = 0; i < VP9_YMODES; i++) { - vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings, - vp9_uv_mode_tree, x->kf_uv_mode_prob[i], - bct, kf_uv_mode_cts[i], 256, 1); - vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings, - vp9_uv_mode_tree, x->fc.uv_mode_prob[i], - bct, uv_mode_cts[i], 256, 1); - } - } - - vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings, - vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob, - bct, i8x8_mode_cts, 256, 1); - - vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2, - sizeof(vp9_sub_mv_ref_prob2)); - vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs)); - vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob, - sizeof(vp9_switchable_interp_prob)); -#if CONFIG_COMP_INTERINTRA_PRED - x->fc.interintra_prob = VP9_DEF_INTERINTRA_PROB; -#endif -} - - -static void intra_bmode_probs_from_distribution( - vp9_prob p[VP9_NKF_BINTRAMODES - 1], - unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2], - const unsigned int events[VP9_NKF_BINTRAMODES]) { - vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings, - vp9_bmode_tree, p, branch_ct, events, 256, 1); -} - -void vp9_default_bmode_probs(vp9_prob p[VP9_NKF_BINTRAMODES - 1]) { - unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2]; - intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts); -} - -static void intra_kf_bmode_probs_from_distribution( - vp9_prob p[VP9_KF_BINTRAMODES - 1], - unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2], - const unsigned int events[VP9_KF_BINTRAMODES]) { - vp9_tree_probs_from_distribution(VP9_KF_BINTRAMODES, vp9_kf_bmode_encodings, - vp9_kf_bmode_tree, p, branch_ct, events, 256, 1); -} - -void vp9_kf_default_bmode_probs(vp9_prob p[VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES - 1]) { - unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2]; - int i, j; - - for (i = 0; i < VP9_KF_BINTRAMODES; ++i) { - for (j = 0; j < VP9_KF_BINTRAMODES; ++j) { - intra_kf_bmode_probs_from_distribution( - p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]); - } - } -} - -#if VP9_SWITCHABLE_FILTERS == 3 -const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { - -0, 2, - -1, -2 -}; -struct vp9_token_struct 
vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; -const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = { - EIGHTTAP, SIXTAP, EIGHTTAP_SHARP}; -const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, 2, -1}; -const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1] - [VP9_SWITCHABLE_FILTERS-1] = { - {248, 192}, { 32, 248}, { 32, 32}, {192, 160} -}; -#elif VP9_SWITCHABLE_FILTERS == 2 -const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { - -0, -1, -}; -struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; -const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1] - [VP9_SWITCHABLE_FILTERS-1] = { - {248}, - { 64}, - {192}, -}; -const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = { - EIGHTTAP, EIGHTTAP_SHARP}; -const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; //8, 8s -#endif - -void vp9_entropy_mode_init() { - vp9_tokens_from_tree(vp9_kf_bmode_encodings, vp9_kf_bmode_tree); - vp9_tokens_from_tree(vp9_bmode_encodings, vp9_bmode_tree); - vp9_tokens_from_tree(vp9_ymode_encodings, vp9_ymode_tree); - vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree); -#if CONFIG_SUPERBLOCKS - vp9_tokens_from_tree(vp9_sb_ymode_encodings, vp9_sb_ymode_tree); - vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_kf_ymode_tree); -#endif - vp9_tokens_from_tree(vp9_uv_mode_encodings, vp9_uv_mode_tree); - vp9_tokens_from_tree(vp9_i8x8_mode_encodings, vp9_i8x8_mode_tree); - vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree); - vp9_tokens_from_tree(vp9_switchable_interp_encodings, - vp9_switchable_interp_tree); - - vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array, - vp9_mv_ref_tree, NEARESTMV); -#if CONFIG_SUPERBLOCKS - vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array, - vp9_sb_mv_ref_tree, NEARESTMV); -#endif - vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array, - vp9_sub_mv_ref_tree, LEFT4X4); -} - -void vp9_init_mode_contexts(VP9_COMMON *pc) { - vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct)); - - vpx_memcpy(pc->fc.mode_context, - vp9_default_mode_contexts, - sizeof(pc->fc.mode_context)); - vpx_memcpy(pc->fc.mode_context_a, - vp9_default_mode_contexts_a, - sizeof(pc->fc.mode_context_a)); - -} - -void vp9_accum_mv_refs(VP9_COMMON *pc, - MB_PREDICTION_MODE m, - const int context) { - int (*mv_ref_ct)[4][2]; - - mv_ref_ct = pc->fc.mv_ref_ct; - - if (m == ZEROMV) { - ++mv_ref_ct[context][0][0]; - } else { - ++mv_ref_ct[context][0][1]; - if (m == NEARESTMV) { - ++mv_ref_ct[context][1][0]; - } else { - ++mv_ref_ct[context][1][1]; - if (m == NEARMV) { - ++mv_ref_ct[context][2][0]; - } else { - ++mv_ref_ct[context][2][1]; - if (m == NEWMV) { - ++mv_ref_ct[context][3][0]; - } else { - ++mv_ref_ct[context][3][1]; - } - } - } - } -} - -#define MVREF_COUNT_SAT 20 -#define MVREF_MAX_UPDATE_FACTOR 128 -void vp9_update_mode_context(VP9_COMMON *pc) { - int i, j; - int (*mv_ref_ct)[4][2]; - int (*mode_context)[4]; - - if (pc->refresh_alt_ref_frame) { - mode_context = pc->fc.mode_context_a; - } else { - mode_context = pc->fc.mode_context; - } - mv_ref_ct = pc->fc.mv_ref_ct; - - for (j = 0; j < INTER_MODE_CONTEXTS; j++) { - for (i = 0; i < 4; i++) { - int this_prob; - int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1]; - int factor; - { - this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128; - count = count > MVREF_COUNT_SAT ? 
MVREF_COUNT_SAT : count; - factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT); - this_prob = (pc->fc.vp9_mode_contexts[j][i] * (256 - factor) + - this_prob * factor + 128) >> 8; - mode_context[j][i] = clip_prob(this_prob); - } - } - } -} - -#ifdef MODE_STATS -#include "vp9/common/modecont.h" -void print_mode_contexts(VP9_COMMON *pc) { - int j, i; - printf("\n====================\n"); - for (j = 0; j < INTER_MODE_CONTEXTS; j++) { - for (i = 0; i < 4; i++) { - printf("%4d ", pc->fc.mode_context[j][i]); - } - printf("\n"); - } - printf("====================\n"); - for (j = 0; j < INTER_MODE_CONTEXTS; j++) { - for (i = 0; i < 4; i++) { - printf("%4d ", pc->fc.mode_context_a[j][i]); - } - printf("\n"); - } -} -#endif - -// #define MODE_COUNT_TESTING -#define MODE_COUNT_SAT 20 -#define MODE_MAX_UPDATE_FACTOR 144 -void vp9_adapt_mode_probs(VP9_COMMON *cm) { - int i, t, count, factor; - unsigned int branch_ct[32][2]; - vp9_prob ymode_probs[VP9_YMODES - 1]; -#if CONFIG_SUPERBLOCKS - vp9_prob sb_ymode_probs[VP9_I32X32_MODES - 1]; -#endif - vp9_prob uvmode_probs[VP9_UV_MODES - 1]; - vp9_prob bmode_probs[VP9_NKF_BINTRAMODES - 1]; - vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1]; - vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1]; - vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1]; -#if CONFIG_COMP_INTERINTRA_PRED - vp9_prob interintra_prob; -#endif -#ifdef MODE_COUNT_TESTING - printf("static const unsigned int\nymode_counts" - "[VP9_YMODES] = {\n"); - for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]); - printf("};\n"); - printf("static const unsigned int\nuv_mode_counts" - "[VP9_YMODES] [VP9_UV_MODES] = {\n"); - for (i = 0; i < VP9_YMODES; ++i) { - printf(" {"); - for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]); - printf("},\n"); - } - printf("};\n"); - printf("static const unsigned int\nbmode_counts" - "[VP9_NKF_BINTRAMODES] = {\n"); - for (t = 0; t < VP9_NKF_BINTRAMODES; ++t) - printf("%d, ", cm->fc.bmode_counts[t]); - printf("};\n"); - printf("static const unsigned int\ni8x8_mode_counts" - "[VP9_I8X8_MODES] = {\n"); - for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]); - printf("};\n"); - printf("static const unsigned int\nsub_mv_ref_counts" - "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n"); - for (i = 0; i < SUBMVREF_COUNT; ++i) { - printf(" {"); - for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]); - printf("},\n"); - } - printf("};\n"); - printf("static const unsigned int\nmbsplit_counts" - "[VP9_NUMMBSPLITS] = {\n"); - for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]); - printf("};\n"); -#if CONFIG_COMP_INTERINTRA_PRED - printf("static const unsigned int\ninterintra_counts" - "[2] = {\n"); - for (t = 0; t < 2; ++t) printf("%d, ", cm->fc.interintra_counts[t]); - printf("};\n"); -#endif -#endif - vp9_tree_probs_from_distribution( - VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree, - ymode_probs, branch_ct, cm->fc.ymode_counts, - 256, 1); - for (t = 0; t < VP9_YMODES - 1; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > MODE_COUNT_SAT ? 
MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) + - (int)ymode_probs[t] * factor + 128) >> 8; - cm->fc.ymode_prob[t] = clip_prob(prob); - } -#if CONFIG_SUPERBLOCKS - vp9_tree_probs_from_distribution(VP9_I32X32_MODES, - vp9_sb_ymode_encodings, vp9_sb_ymode_tree, - sb_ymode_probs, branch_ct, - cm->fc.sb_ymode_counts, - 256, 1); - for (t = 0; t < VP9_I32X32_MODES - 1; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - prob = ((int)cm->fc.pre_sb_ymode_prob[t] * (256 - factor) + - (int)sb_ymode_probs[t] * factor + 128) >> 8; - cm->fc.sb_ymode_prob[t] = clip_prob(prob); - } -#endif - for (i = 0; i < VP9_YMODES; ++i) { - vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings, - vp9_uv_mode_tree, uvmode_probs, branch_ct, - cm->fc.uv_mode_counts[i], 256, 1); - for (t = 0; t < VP9_UV_MODES - 1; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) + - (int)uvmode_probs[t] * factor + 128) >> 8; - cm->fc.uv_mode_prob[i][t] = clip_prob(prob); - } - } - vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings, - vp9_bmode_tree, bmode_probs, branch_ct, - cm->fc.bmode_counts, 256, 1); - for (t = 0; t < VP9_NKF_BINTRAMODES - 1; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) + - (int)bmode_probs[t] * factor + 128) >> 8; - cm->fc.bmode_prob[t] = clip_prob(prob); - } - vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings, - vp9_i8x8_mode_tree, i8x8_mode_probs, - branch_ct, cm->fc.i8x8_mode_counts, 256, 1); - for (t = 0; t < VP9_I8X8_MODES - 1; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) + - (int)i8x8_mode_probs[t] * factor + 128) >> 8; - cm->fc.i8x8_mode_prob[t] = clip_prob(prob); - } - for (i = 0; i < SUBMVREF_COUNT; ++i) { - vp9_tree_probs_from_distribution(VP9_SUBMVREFS, - vp9_sub_mv_ref_encoding_array, - vp9_sub_mv_ref_tree, sub_mv_ref_probs, - branch_ct, cm->fc.sub_mv_ref_counts[i], - 256, 1); - for (t = 0; t < VP9_SUBMVREFS - 1; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) + - (int)sub_mv_ref_probs[t] * factor + 128) >> 8; - cm->fc.sub_mv_ref_prob[i][t] = clip_prob(prob); - } - } - vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings, - vp9_mbsplit_tree, mbsplit_probs, branch_ct, - cm->fc.mbsplit_counts, 256, 1); - for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) { - int prob; - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > MODE_COUNT_SAT ? 
MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) + - (int)mbsplit_probs[t] * factor + 128) >> 8; - cm->fc.mbsplit_prob[t] = clip_prob(prob); - } -#if CONFIG_COMP_INTERINTRA_PRED - if (cm->use_interintra) { - int prob; - interintra_prob = vp9_bin_prob_from_distribution(cm->fc.interintra_counts); - count = cm->fc.interintra_counts[0] + cm->fc.interintra_counts[1]; - count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; - factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); - prob = ((int)cm->fc.pre_interintra_prob * (256 - factor) + - (int)interintra_prob * factor + 128) >> 8; - if (prob <= 0) - cm->fc.interintra_prob = 1; - else if (prob > 255) - cm->fc.interintra_prob = 255; - else - cm->fc.interintra_prob = prob; - } -#endif -} diff --git a/vp9/common/entropymode.h b/vp9/common/entropymode.h deleted file mode 100644 index 9f20ae94d..000000000 --- a/vp9/common/entropymode.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_ENTROPYMODE_H -#define __INC_ENTROPYMODE_H - -#include "blockd.h" -#include "treecoder.h" - -#define SUBMVREF_COUNT 5 -#define VP9_NUMMBSPLITS 4 -#if CONFIG_COMP_INTRA_PRED -#define DEFAULT_COMP_INTRA_PROB 32 -#endif - -#if CONFIG_COMP_INTERINTRA_PRED -#define VP9_DEF_INTERINTRA_PROB 248 -#define VP9_UPD_INTERINTRA_PROB 192 -// whether to use a separate uv mode (1) or use the same as the y mode (0) -#define SEPARATE_INTERINTRA_UV 0 -#endif - -typedef const int vp9_mbsplit[16]; - -extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS]; - -extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS]; /* # of subsets */ - -extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1]; - -extern int vp9_mv_cont(const int_mv *l, const int_mv *a); - -extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1]; - -extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1]; - -extern const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES]; - -extern const vp9_tree_index vp9_bmode_tree[]; -extern const vp9_tree_index vp9_kf_bmode_tree[]; - -extern const vp9_tree_index vp9_ymode_tree[]; -extern const vp9_tree_index vp9_kf_ymode_tree[]; -extern const vp9_tree_index vp9_uv_mode_tree[]; -#define vp9_sb_ymode_tree vp9_uv_mode_tree -#define vp9_sb_kf_ymode_tree vp9_uv_mode_tree -extern const vp9_tree_index vp9_i8x8_mode_tree[]; -extern const vp9_tree_index vp9_mbsplit_tree[]; -extern const vp9_tree_index vp9_mv_ref_tree[]; -extern const vp9_tree_index vp9_sb_mv_ref_tree[]; -extern const vp9_tree_index vp9_sub_mv_ref_tree[]; - -extern struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES]; -extern struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES]; -extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES]; -extern struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES]; -extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES]; -extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES]; -extern struct vp9_token_struct 
vp9_i8x8_mode_encodings[VP9_I8X8_MODES]; -extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES]; -extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS]; - -/* Inter mode values do not start at zero */ - -extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS]; -extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS]; -extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS]; - -void vp9_entropy_mode_init(void); - -struct VP9Common; - -void vp9_init_mbmode_probs(struct VP9Common *x); - -extern void vp9_init_mode_contexts(struct VP9Common *pc); - -extern void vp9_update_mode_context(struct VP9Common *pc); - -extern void vp9_accum_mv_refs(struct VP9Common *pc, - MB_PREDICTION_MODE m, - const int context); - -void vp9_default_bmode_probs(vp9_prob dest[VP9_NKF_BINTRAMODES - 1]); - -void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES - 1]); - -void vp9_adapt_mode_probs(struct VP9Common *); - -#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */ - -extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp - [VP9_SWITCHABLE_FILTERS]; - -extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; - -extern const vp9_tree_index vp9_switchable_interp_tree - [2 * (VP9_SWITCHABLE_FILTERS - 1)]; - -extern struct vp9_token_struct vp9_switchable_interp_encodings - [VP9_SWITCHABLE_FILTERS]; - -extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - [VP9_SWITCHABLE_FILTERS - 1]; - -#endif diff --git a/vp9/common/entropymv.c b/vp9/common/entropymv.c deleted file mode 100644 index 85c21ba2a..000000000 --- a/vp9/common/entropymv.c +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "onyxc_int.h" -#include "entropymv.h" - -//#define MV_COUNT_TESTING - -#define MV_COUNT_SAT 16 -#define MV_MAX_UPDATE_FACTOR 160 - -#if CONFIG_NEW_MVREF -/* Integer pel reference mv threshold for use of high-precision 1/8 mv */ -#define COMPANDED_MVREF_THRESH 1000000 -#else -/* Integer pel reference mv threshold for use of high-precision 1/8 mv */ -#define COMPANDED_MVREF_THRESH 8 -#endif - -/* Smooth or bias the mv-counts before prob computation */ -/* #define SMOOTH_MV_COUNTS */ - -const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = { - -MV_JOINT_ZERO, 2, - -MV_JOINT_HNZVZ, 4, - -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ -}; -struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS]; - -const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = { - -MV_CLASS_0, 2, - -MV_CLASS_1, 4, - 6, 8, - -MV_CLASS_2, -MV_CLASS_3, - 10, 12, - -MV_CLASS_4, -MV_CLASS_5, - -MV_CLASS_6, -MV_CLASS_7, -}; -struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES]; - -const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = { - -0, -1, -}; -struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE]; - -const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = { - -0, 2, - -1, 4, - -2, -3 -}; -struct vp9_token_struct vp9_mv_fp_encodings[4]; - -const nmv_context vp9_default_nmv_context = { - {32, 64, 96}, - { - { /* vert component */ - 128, /* sign */ - {224, 144, 192, 168, 192, 176, 192}, /* class */ - {216}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ - }, - { /* hor component */ - 128, /* sign */ - {216, 128, 176, 160, 176, 176, 192}, /* class */ - {208}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ - } - }, -}; - -MV_JOINT_TYPE vp9_get_mv_joint(MV mv) { - if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO; - else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ; - else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ; - else return MV_JOINT_HNZVNZ; -} - -#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0) - -MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { - MV_CLASS_TYPE c; - if (z < CLASS0_SIZE * 8) c = MV_CLASS_0; - else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1; - else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2; - else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3; - else if (z < CLASS0_SIZE * 128) c = MV_CLASS_4; - else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5; - else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6; - else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7; - else assert(0); - if (offset) - *offset = z - mv_class_base(c); - return c; -} - -int vp9_use_nmv_hp(const MV *ref) { - if ((abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && - (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH) - return 1; - else - return 0; -} - -int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { - return mv_class_base(c) + offset; -} - -static void increment_nmv_component_count(int v, - nmv_component_counts *mvcomp, - int incr, - int usehp) { - assert (v != 0); /* should not be zero */ - mvcomp->mvcount[MV_MAX + v] += incr; -} - -static void increment_nmv_component(int v, - nmv_component_counts *mvcomp, - int incr, - int usehp) { - int s, z, c, o, d, e, f; - assert (v != 0); /* should not be zero */ - s = v < 0; - mvcomp->sign[s] += incr; - z = (s ? 
-v : v) - 1; /* magnitude - 1 */ - - c = vp9_get_mv_class(z, &o); - mvcomp->classes[c] += incr; - - d = (o >> 3); /* int mv data */ - f = (o >> 1) & 3; /* fractional pel mv data */ - e = (o & 1); /* high precision mv data */ - if (c == MV_CLASS_0) { - mvcomp->class0[d] += incr; - } else { - int i, b; - b = c + CLASS0_BITS - 1; /* number of bits */ - for (i = 0; i < b; ++i) - mvcomp->bits[i][((d >> i) & 1)] += incr; - } - - /* Code the fractional pel bits */ - if (c == MV_CLASS_0) { - mvcomp->class0_fp[d][f] += incr; - } else { - mvcomp->fp[f] += incr; - } - - /* Code the high precision bit */ - if (usehp) { - if (c == MV_CLASS_0) { - mvcomp->class0_hp[e] += incr; - } else { - mvcomp->hp[e] += incr; - } - } -} - -#ifdef SMOOTH_MV_COUNTS -static void smooth_counts(nmv_component_counts *mvcomp) { - static const int flen = 3; // (filter_length + 1) / 2 - static const int fval[] = {8, 3, 1}; - static const int fvalbits = 4; - int i; - unsigned int smvcount[MV_VALS]; - vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount)); - smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1; - for (i = flen - 1; i <= MV_VALS - flen; ++i) { - int j, s = smvcount[i] * fval[0]; - for (j = 1; j < flen; ++j) - s += (smvcount[i - j] + smvcount[i + j]) * fval[j]; - mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits; - } -} -#endif - -static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { - int v; - vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount)); - for (v = 1; v <= MV_MAX; v++) { - increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); - increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); - } -} - -void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, - int usehp) { - MV_JOINT_TYPE j = vp9_get_mv_joint(*mv); - mvctx->joints[j]++; - usehp = usehp && vp9_use_nmv_hp(ref); - if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) { - increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp); - } - if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) { - increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp); - } -} - -static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp, - unsigned int ct[2]) { - int factor; - int prob; - int count = ct[0] + ct[1]; - if (count) { - count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; - factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); - prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8; - prob += !prob; - prob = (prob > 255 ? 
255 : prob); - *dest = prob; - } -} - -void vp9_counts_process(nmv_context_counts *NMVcount, int usehp) { - counts_to_context(&NMVcount->comps[0], usehp); - counts_to_context(&NMVcount->comps[1], usehp); -} - -void vp9_counts_to_nmv_context( - nmv_context_counts *NMVcount, - nmv_context *prob, - int usehp, - unsigned int (*branch_ct_joint)[2], - unsigned int (*branch_ct_sign)[2], - unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], - unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], - unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], - unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], - unsigned int (*branch_ct_fp)[4 - 1][2], - unsigned int (*branch_ct_class0_hp)[2], - unsigned int (*branch_ct_hp)[2]) { - int i, j, k; - vp9_counts_process(NMVcount, usehp); - vp9_tree_probs_from_distribution(MV_JOINTS, - vp9_mv_joint_encodings, - vp9_mv_joint_tree, - prob->joints, - branch_ct_joint, - NMVcount->joints, - 256, 1); - for (i = 0; i < 2; ++i) { - prob->comps[i].sign = - vp9_bin_prob_from_distribution(NMVcount->comps[i].sign); - branch_ct_sign[i][0] = NMVcount->comps[i].sign[0]; - branch_ct_sign[i][1] = NMVcount->comps[i].sign[1]; - vp9_tree_probs_from_distribution(MV_CLASSES, - vp9_mv_class_encodings, - vp9_mv_class_tree, - prob->comps[i].classes, - branch_ct_classes[i], - NMVcount->comps[i].classes, - 256, 1); - vp9_tree_probs_from_distribution(CLASS0_SIZE, - vp9_mv_class0_encodings, - vp9_mv_class0_tree, - prob->comps[i].class0, - branch_ct_class0[i], - NMVcount->comps[i].class0, - 256, 1); - for (j = 0; j < MV_OFFSET_BITS; ++j) { - prob->comps[i].bits[j] = vp9_bin_prob_from_distribution( - NMVcount->comps[i].bits[j]); - branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0]; - branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1]; - } - } - for (i = 0; i < 2; ++i) { - for (k = 0; k < CLASS0_SIZE; ++k) { - vp9_tree_probs_from_distribution(4, - vp9_mv_fp_encodings, - vp9_mv_fp_tree, - prob->comps[i].class0_fp[k], - branch_ct_class0_fp[i][k], - NMVcount->comps[i].class0_fp[k], - 256, 1); - } - vp9_tree_probs_from_distribution(4, - vp9_mv_fp_encodings, - vp9_mv_fp_tree, - prob->comps[i].fp, - branch_ct_fp[i], - NMVcount->comps[i].fp, - 256, 1); - } - if (usehp) { - for (i = 0; i < 2; ++i) { - prob->comps[i].class0_hp = vp9_bin_prob_from_distribution( - NMVcount->comps[i].class0_hp); - branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0]; - branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1]; - - prob->comps[i].hp = - vp9_bin_prob_from_distribution(NMVcount->comps[i].hp); - branch_ct_hp[i][0] = NMVcount->comps[i].hp[0]; - branch_ct_hp[i][1] = NMVcount->comps[i].hp[1]; - } - } -} - -void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) { - int i, j, k; - nmv_context prob; - unsigned int branch_ct_joint[MV_JOINTS - 1][2]; - unsigned int branch_ct_sign[2][2]; - unsigned int branch_ct_classes[2][MV_CLASSES - 1][2]; - unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2]; - unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2]; - unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2]; - unsigned int branch_ct_fp[2][4 - 1][2]; - unsigned int branch_ct_class0_hp[2][2]; - unsigned int branch_ct_hp[2][2]; -#ifdef MV_COUNT_TESTING - printf("joints count: "); - for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]); - printf("\n"); fflush(stdout); - printf("signs count:\n"); - for (i = 0; i < 2; ++i) - printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]); - printf("\n"); fflush(stdout); - printf("classes count:\n"); - for (i 
= 0; i < 2; ++i) { - for (j = 0; j < MV_CLASSES; ++j) - printf("%d ", cm->fc.NMVcount.comps[i].classes[j]); - printf("\n"); fflush(stdout); - } - printf("class0 count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) - printf("%d ", cm->fc.NMVcount.comps[i].class0[j]); - printf("\n"); fflush(stdout); - } - printf("bits count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < MV_OFFSET_BITS; ++j) - printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0], - cm->fc.NMVcount.comps[i].bits[j][1]); - printf("\n"); fflush(stdout); - } - printf("class0_fp count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - printf("{"); - for (k = 0; k < 4; ++k) - printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]); - printf("}, "); - } - printf("\n"); fflush(stdout); - } - printf("fp count:\n"); - for (i = 0; i < 2; ++i) { - for (j = 0; j < 4; ++j) - printf("%d ", cm->fc.NMVcount.comps[i].fp[j]); - printf("\n"); fflush(stdout); - } - if (usehp) { - printf("class0_hp count:\n"); - for (i = 0; i < 2; ++i) - printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0], - cm->fc.NMVcount.comps[i].class0_hp[1]); - printf("\n"); fflush(stdout); - printf("hp count:\n"); - for (i = 0; i < 2; ++i) - printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0], - cm->fc.NMVcount.comps[i].hp[1]); - printf("\n"); fflush(stdout); - } -#endif -#ifdef SMOOTH_MV_COUNTS - smooth_counts(&cm->fc.NMVcount.comps[0]); - smooth_counts(&cm->fc.NMVcount.comps[1]); -#endif - vp9_counts_to_nmv_context(&cm->fc.NMVcount, - &prob, - usehp, - branch_ct_joint, - branch_ct_sign, - branch_ct_classes, - branch_ct_class0, - branch_ct_bits, - branch_ct_class0_fp, - branch_ct_fp, - branch_ct_class0_hp, - branch_ct_hp); - - for (j = 0; j < MV_JOINTS - 1; ++j) { - adapt_prob(&cm->fc.nmvc.joints[j], - cm->fc.pre_nmvc.joints[j], - prob.joints[j], - branch_ct_joint[j]); - } - for (i = 0; i < 2; ++i) { - adapt_prob(&cm->fc.nmvc.comps[i].sign, - cm->fc.pre_nmvc.comps[i].sign, - prob.comps[i].sign, - branch_ct_sign[i]); - for (j = 0; j < MV_CLASSES - 1; ++j) { - adapt_prob(&cm->fc.nmvc.comps[i].classes[j], - cm->fc.pre_nmvc.comps[i].classes[j], - prob.comps[i].classes[j], - branch_ct_classes[i][j]); - } - for (j = 0; j < CLASS0_SIZE - 1; ++j) { - adapt_prob(&cm->fc.nmvc.comps[i].class0[j], - cm->fc.pre_nmvc.comps[i].class0[j], - prob.comps[i].class0[j], - branch_ct_class0[i][j]); - } - for (j = 0; j < MV_OFFSET_BITS; ++j) { - adapt_prob(&cm->fc.nmvc.comps[i].bits[j], - cm->fc.pre_nmvc.comps[i].bits[j], - prob.comps[i].bits[j], - branch_ct_bits[i][j]); - } - } - for (i = 0; i < 2; ++i) { - for (j = 0; j < CLASS0_SIZE; ++j) { - for (k = 0; k < 3; ++k) { - adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k], - cm->fc.pre_nmvc.comps[i].class0_fp[j][k], - prob.comps[i].class0_fp[j][k], - branch_ct_class0_fp[i][j][k]); - } - } - for (j = 0; j < 3; ++j) { - adapt_prob(&cm->fc.nmvc.comps[i].fp[j], - cm->fc.pre_nmvc.comps[i].fp[j], - prob.comps[i].fp[j], - branch_ct_fp[i][j]); - } - } - if (usehp) { - for (i = 0; i < 2; ++i) { - adapt_prob(&cm->fc.nmvc.comps[i].class0_hp, - cm->fc.pre_nmvc.comps[i].class0_hp, - prob.comps[i].class0_hp, - branch_ct_class0_hp[i]); - adapt_prob(&cm->fc.nmvc.comps[i].hp, - cm->fc.pre_nmvc.comps[i].hp, - prob.comps[i].hp, - branch_ct_hp[i]); - } - } -} - -void vp9_entropy_mv_init() { - vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree); - vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree); - vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree); - 
vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree); -} - -void vp9_init_mv_probs(VP9_COMMON *cm) { - vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context)); -} diff --git a/vp9/common/entropymv.h b/vp9/common/entropymv.h deleted file mode 100644 index 7a5fadff1..000000000 --- a/vp9/common/entropymv.h +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_ENTROPYMV_H -#define __INC_ENTROPYMV_H - -#include "treecoder.h" -#include "vpx_config.h" -#include "blockd.h" - -struct VP9Common; - -void vp9_entropy_mv_init(); -void vp9_init_mv_probs(struct VP9Common *cm); - -void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp); -int vp9_use_nmv_hp(const MV *ref); - -#define VP9_NMV_UPDATE_PROB 255 -//#define MV_GROUP_UPDATE - -#define LOW_PRECISION_MV_UPDATE /* Use 7 bit forward update */ - -/* Symbols for coding which components are zero jointly */ -#define MV_JOINTS 4 -typedef enum { - MV_JOINT_ZERO = 0, /* Zero vector */ - MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ - MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ - MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ -} MV_JOINT_TYPE; - -extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2]; -extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS]; - -/* Symbols for coding magnitude class of nonzero components */ -#define MV_CLASSES 8 -typedef enum { - MV_CLASS_0 = 0, /* (0, 2] integer pel */ - MV_CLASS_1 = 1, /* (2, 4] integer pel */ - MV_CLASS_2 = 2, /* (4, 8] integer pel */ - MV_CLASS_3 = 3, /* (8, 16] integer pel */ - MV_CLASS_4 = 4, /* (16, 32] integer pel */ - MV_CLASS_5 = 5, /* (32, 64] integer pel */ - MV_CLASS_6 = 6, /* (64, 128] integer pel */ - MV_CLASS_7 = 7, /* (128, 256] integer pel */ -} MV_CLASS_TYPE; - -extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2]; -extern struct vp9_token_struct vp9_mv_class_encodings [MV_CLASSES]; - -#define CLASS0_BITS 1 /* bits at integer precision for class 0 */ -#define CLASS0_SIZE (1 << CLASS0_BITS) -#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) - -#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) -#define MV_MAX ((1 << MV_MAX_BITS) - 1) -#define MV_VALS ((MV_MAX << 1) + 1) - -extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2]; -extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE]; - -extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2]; -extern struct vp9_token_struct vp9_mv_fp_encodings[4]; - -typedef struct { - vp9_prob sign; - vp9_prob classes[MV_CLASSES - 1]; - vp9_prob class0[CLASS0_SIZE - 1]; - vp9_prob bits[MV_OFFSET_BITS]; - vp9_prob class0_fp[CLASS0_SIZE][4 - 1]; - vp9_prob fp[4 - 1]; - vp9_prob class0_hp; - vp9_prob hp; -} nmv_component; - -typedef struct { - vp9_prob joints[MV_JOINTS - 1]; - nmv_component comps[2]; -} nmv_context; - -MV_JOINT_TYPE vp9_get_mv_joint(MV mv); -MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset); -int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset); - - -typedef struct { - unsigned int mvcount[MV_VALS]; - unsigned int sign[2]; - unsigned int classes[MV_CLASSES]; - unsigned int class0[CLASS0_SIZE]; - unsigned int bits[MV_OFFSET_BITS][2]; - unsigned int 
class0_fp[CLASS0_SIZE][4]; - unsigned int fp[4]; - unsigned int class0_hp[2]; - unsigned int hp[2]; -} nmv_component_counts; - -typedef struct { - unsigned int joints[MV_JOINTS]; - nmv_component_counts comps[2]; -} nmv_context_counts; - -void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, - int usehp); -extern const nmv_context vp9_default_nmv_context; -void vp9_counts_to_nmv_context( - nmv_context_counts *NMVcount, - nmv_context *prob, - int usehp, - unsigned int (*branch_ct_joint)[2], - unsigned int (*branch_ct_sign)[2], - unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], - unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], - unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], - unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], - unsigned int (*branch_ct_fp)[4 - 1][2], - unsigned int (*branch_ct_class0_hp)[2], - unsigned int (*branch_ct_hp)[2]); -void vp9_counts_process(nmv_context_counts *NMVcount, int usehp); -#endif diff --git a/vp9/common/extend.c b/vp9/common/extend.c deleted file mode 100644 index ad3b89b44..000000000 --- a/vp9/common/extend.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "extend.h" -#include "vpx_mem/vpx_mem.h" - -static void copy_and_extend_plane(unsigned char *s, /* source */ - int sp, /* source pitch */ - unsigned char *d, /* destination */ - int dp, /* destination pitch */ - int h, /* height */ - int w, /* width */ - int et, /* extend top border */ - int el, /* extend left border */ - int eb, /* extend bottom border */ - int er) { /* extend right border */ - int i; - unsigned char *src_ptr1, *src_ptr2; - unsigned char *dest_ptr1, *dest_ptr2; - int linesize; - - /* copy the left and right most columns out */ - src_ptr1 = s; - src_ptr2 = s + w - 1; - dest_ptr1 = d - el; - dest_ptr2 = d + w; - - for (i = 0; i < h; i++) { - vpx_memset(dest_ptr1, src_ptr1[0], el); - vpx_memcpy(dest_ptr1 + el, src_ptr1, w); - vpx_memset(dest_ptr2, src_ptr2[0], er); - src_ptr1 += sp; - src_ptr2 += sp; - dest_ptr1 += dp; - dest_ptr2 += dp; - } - - /* Now copy the top and bottom lines into each line of the respective - * borders - */ - src_ptr1 = d - el; - src_ptr2 = d + dp * (h - 1) - el; - dest_ptr1 = d + dp * (-et) - el; - dest_ptr2 = d + dp * (h) - el; - linesize = el + er + w; - - for (i = 0; i < et; i++) { - vpx_memcpy(dest_ptr1, src_ptr1, linesize); - dest_ptr1 += dp; - } - - for (i = 0; i < eb; i++) { - vpx_memcpy(dest_ptr2, src_ptr2, linesize); - dest_ptr2 += dp; - } -} - -void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { - int et = dst->border; - int el = dst->border; - int eb = dst->border + dst->y_height - src->y_height; - int er = dst->border + dst->y_width - src->y_width; - - copy_and_extend_plane(src->y_buffer, src->y_stride, - dst->y_buffer, dst->y_stride, - src->y_height, src->y_width, - et, el, eb, er); - - et = dst->border >> 1; - el = dst->border >> 1; - eb = (dst->border >> 1) + dst->uv_height - src->uv_height; - er = (dst->border >> 1) + dst->uv_width - src->uv_width; - - copy_and_extend_plane(src->u_buffer, src->uv_stride, - dst->u_buffer, dst->uv_stride, - src->uv_height, 
src->uv_width, - et, el, eb, er); - - copy_and_extend_plane(src->v_buffer, src->uv_stride, - dst->v_buffer, dst->uv_stride, - src->uv_height, src->uv_width, - et, el, eb, er); -} - -void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int srcy, int srcx, - int srch, int srcw) { - int et = dst->border; - int el = dst->border; - int eb = dst->border + dst->y_height - src->y_height; - int er = dst->border + dst->y_width - src->y_width; - int src_y_offset = srcy * src->y_stride + srcx; - int dst_y_offset = srcy * dst->y_stride + srcx; - int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); - int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); - - // If the side is not touching the bounder then don't extend. - if (srcy) - et = 0; - if (srcx) - el = 0; - if (srcy + srch != src->y_height) - eb = 0; - if (srcx + srcw != src->y_width) - er = 0; - - copy_and_extend_plane(src->y_buffer + src_y_offset, - src->y_stride, - dst->y_buffer + dst_y_offset, - dst->y_stride, - srch, srcw, - et, el, eb, er); - - et = (et + 1) >> 1; - el = (el + 1) >> 1; - eb = (eb + 1) >> 1; - er = (er + 1) >> 1; - srch = (srch + 1) >> 1; - srcw = (srcw + 1) >> 1; - - copy_and_extend_plane(src->u_buffer + src_uv_offset, - src->uv_stride, - dst->u_buffer + dst_uv_offset, - dst->uv_stride, - srch, srcw, - et, el, eb, er); - - copy_and_extend_plane(src->v_buffer + src_uv_offset, - src->uv_stride, - dst->v_buffer + dst_uv_offset, - dst->uv_stride, - srch, srcw, - et, el, eb, er); -} - -/* note the extension is only for the last row, for intra prediction purpose */ -void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, - unsigned char *UPtr, unsigned char *VPtr) { - int i; - - YPtr += ybf->y_stride * 14; - UPtr += ybf->uv_stride * 6; - VPtr += ybf->uv_stride * 6; - - for (i = 0; i < 4; i++) { - YPtr[i] = YPtr[-1]; - UPtr[i] = UPtr[-1]; - VPtr[i] = VPtr[-1]; - } - - YPtr += ybf->y_stride; - UPtr += ybf->uv_stride; - VPtr += ybf->uv_stride; - - for (i = 0; i < 4; i++) { - YPtr[i] = YPtr[-1]; - UPtr[i] = UPtr[-1]; - VPtr[i] = VPtr[-1]; - } -} diff --git a/vp9/common/extend.h b/vp9/common/extend.h deleted file mode 100644 index c3c590479..000000000 --- a/vp9/common/extend.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef __INC_EXTEND_H -#define __INC_EXTEND_H - -#include "vpx_scale/yv12config.h" - -void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, - unsigned char *UPtr, unsigned char *VPtr); - -void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst); - -void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int srcy, int srcx, - int srch, int srcw); - -#endif // __INC_EXTEND_H diff --git a/vp9/common/filter.c b/vp9/common/filter.c deleted file mode 100644 index 429f408c7..000000000 --- a/vp9/common/filter.c +++ /dev/null @@ -1,1159 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include <stdlib.h> -#include "filter.h" -#include "vpx_ports/mem.h" -#include "vp9_rtcd.h" - -DECLARE_ALIGNED(16, const short, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = { - { 128, 0 }, - { 120, 8 }, - { 112, 16 }, - { 104, 24 }, - { 96, 32 }, - { 88, 40 }, - { 80, 48 }, - { 72, 56 }, - { 64, 64 }, - { 56, 72 }, - { 48, 80 }, - { 40, 88 }, - { 32, 96 }, - { 24, 104 }, - { 16, 112 }, - { 8, 120 } -}; - -#define FILTER_ALPHA 0 -#define FILTER_ALPHA_SHARP 1 -DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { -#if FILTER_ALPHA == 0 - /* Lagrangian interpolation filter */ - { 0, 0, 0, 128, 0, 0, 0, 0}, - { 0, 1, -5, 126, 8, -3, 1, 0}, - { -1, 3, -10, 122, 18, -6, 2, 0}, - { -1, 4, -13, 118, 27, -9, 3, -1}, - { -1, 4, -16, 112, 37, -11, 4, -1}, - { -1, 5, -18, 105, 48, -14, 4, -1}, - { -1, 5, -19, 97, 58, -16, 5, -1}, - { -1, 6, -19, 88, 68, -18, 5, -1}, - { -1, 6, -19, 78, 78, -19, 6, -1}, - { -1, 5, -18, 68, 88, -19, 6, -1}, - { -1, 5, -16, 58, 97, -19, 5, -1}, - { -1, 4, -14, 48, 105, -18, 5, -1}, - { -1, 4, -11, 37, 112, -16, 4, -1}, - { -1, 3, -9, 27, 118, -13, 4, -1}, - { 0, 2, -6, 18, 122, -10, 3, -1}, - { 0, 1, -3, 8, 126, -5, 1, 0} -#elif FILTER_ALPHA == 50 - /* Generated using MATLAB: - * alpha = 0.5; - * b=intfilt(8,4,alpha); - * bi=round(128*b); - * ba=flipud(reshape([bi 0], 8, 8)); - * disp(num2str(ba, '%d,')) - */ - { 0, 0, 0, 128, 0, 0, 0, 0}, - { 0, 1, -5, 126, 8, -3, 1, 0}, - { 0, 2, -10, 122, 18, -6, 2, 0}, - { -1, 3, -13, 118, 27, -9, 3, 0}, - { -1, 4, -16, 112, 37, -11, 3, 0}, - { -1, 5, -17, 104, 48, -14, 4, -1}, - { -1, 5, -18, 96, 58, -16, 5, -1}, - { -1, 5, -19, 88, 68, -17, 5, -1}, - { -1, 5, -18, 78, 78, -18, 5, -1}, - { -1, 5, -17, 68, 88, -19, 5, -1}, - { -1, 5, -16, 58, 96, -18, 5, -1}, - { -1, 4, -14, 48, 104, -17, 5, -1}, - { 0, 3, -11, 37, 112, -16, 4, -1}, - { 0, 3, -9, 27, 118, -13, 3, -1}, - { 0, 2, -6, 18, 122, -10, 2, 0}, - { 0, 1, -3, 8, 126, -5, 1, 0} -#endif /* FILTER_ALPHA */ -}; - -DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = { -#if FILTER_ALPHA_SHARP == 1 - /* dct based filter */ - {0, 0, 0, 128, 0, 0, 0, 0}, - {-1, 3, -7, 127, 8, -3, 1, 0}, - {-2, 5, -13, 125, 17, -6, 3, -1}, - {-3, 7, -17, 121, 27, -10, 5, -2}, - {-4, 9, -20, 115, 37, -13, 6, -2}, - {-4, 10, -23, 108, 48, -16, 8, -3}, - {-4, 10, -24, 100, 59, -19, 9, -3}, - {-4, 11, -24, 90, 70, -21, 10, -4}, - {-4, 11, -23, 80, 80, -23, 11, -4}, - {-4, 10, -21, 70, 90, -24, 11, -4}, - {-3, 9, -19, 59, 100, -24, 10, -4}, - {-3, 8, -16, 48, 108, -23, 10, -4}, - {-2, 6, -13, 37, 115, -20, 9, -4}, - {-2, 5, -10, 27, 121, -17, 7, -3}, - {-1, 3, -6, 17, 125, -13, 5, -2}, - {0, 1, -3, 8, 127, -7, 3, -1} -#elif FILTER_ALPHA_SHARP == 75 - /* alpha = 0.75 */ - {0, 0, 0, 128, 0, 0, 0, 0}, - {-1, 2, -6, 126, 9, -3, 2, -1}, - {-1, 4, -11, 123, 18, -7, 3, -1}, - {-2, 6, -16, 119, 28, -10, 5, -2}, - {-2, 7, -19, 113, 38, -13, 6, -2}, - {-3, 8, -21, 106, 49, -16, 7, -2}, - {-3, 9, -22, 99, 59, -19, 8, -3}, - {-3, 9, -23, 90, 70, -21, 9, -3}, - {-3, 9, -22, 80, 80, -22, 9, -3}, - {-3, 9, -21, 70, 90, -23, 9, -3}, - {-3, 8, -19, 59, 99, -22, 9, -3}, - {-2, 7, -16, 49, 106, -21, 8, -3}, - {-2, 6, -13, 38, 113, -19, 7, -2}, - {-2, 5, -10, 28, 119, -16, 6, -2}, - {-1, 3, -7, 18, 123, -11, 4, -1}, - {-1, 2, -3, 9, 126, -6, 2, -1} -#endif /* FILTER_ALPHA_SHARP */ -}; -
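/* Every tap row in this file is a Q7 kernel: the taps sum to 128
 * (VP9_FILTER_WEIGHT), so applying a row is a dot product followed by
 * rounding, a right shift by VP9_FILTER_SHIFT (7) and a clamp to the 8-bit
 * pixel range -- the same normalization the pass functions below perform.
 * A minimal sketch for a single output pixel; apply_8tap() is an
 * illustrative name and the constants are written out literally:
 *
 *   static unsigned char apply_8tap(const unsigned char src[8],
 *                                   const short taps[8]) {
 *     int k, sum = 64;                    // VP9_FILTER_WEIGHT >> 1, rounding
 *     for (k = 0; k < 8; ++k)
 *       sum += src[k] * taps[k];
 *     sum >>= 7;                          // VP9_FILTER_SHIFT
 *     return (unsigned char)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
 *   }
 *
 * As a sanity check, the half-pel row { -1, 6, -19, 78, 78, -19, 6, -1 }
 * sums to 128, so a flat run of pixels passes through unchanged.
 */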
-DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = { - {0, 0, 128, 0, 0, 0}, - {1, -5, 125, 8, -2, 1}, - {1, -8, 122, 17, -5, 1}, - {2, -11, 116, 27, -8, 2}, - {3, -14, 110, 37, -10, 2}, - {3, -15, 103, 47, -12, 2}, - {3, -16, 95, 57, -14, 3}, - {3, -16, 86, 67, -15, 3}, - {3, -16, 77, 77, -16, 3}, - {3, -15, 67, 86, -16, 3}, - {3, -14, 57, 95, -16, 3}, - {2, -12, 47, 103, -15, 3}, - {2, -10, 37, 110, -14, 3}, - {2, -8, 27, 116, -11, 2}, - {1, -5, 17, 122, -8, 1}, - {1, -2, 8, 125, -5, 1} -}; - -static void filter_block2d_first_pass_6(unsigned char *src_ptr, - int *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter) { - unsigned int i, j; - int Temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - Temp = Temp >> VP9_FILTER_SHIFT; - - if (Temp < 0) - Temp = 0; - else if (Temp > 255) - Temp = 255; - - output_ptr[j] = Temp; - src_ptr++; - } - - /* Next row... */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -static void filter_block2d_second_pass_6(int *src_ptr, - unsigned char *output_ptr, - int output_pitch, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter) { - unsigned int i, j; - int Temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - /* Apply filter */ - Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - Temp = Temp >> VP9_FILTER_SHIFT; - - if (Temp < 0) - Temp = 0; - else if (Temp > 255) - Temp = 255; - - output_ptr[j] = (unsigned char)Temp; - src_ptr++; - } - - /* Start next row */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_pitch; - } -} - -/* - * The only functional difference between filter_block2d_second_pass() - * and this function is that filter_block2d_second_pass() does a sixtap - * filter on the input and stores it in the output. This function - * (filter_block2d_second_pass_avg()) does a sixtap filter on the input, - * and then averages that with the content already present in the output - * ((filter_result + dest + 1) >> 1) and stores that in the output. 
- */ -static void filter_block2d_second_pass_avg_6(int *src_ptr, - unsigned char *output_ptr, - int output_pitch, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter) { - unsigned int i, j; - int Temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - /* Apply filter */ - Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - Temp = Temp >> VP9_FILTER_SHIFT; - - if (Temp < 0) - Temp = 0; - else if (Temp > 255) - Temp = 255; - - output_ptr[j] = (unsigned char)((output_ptr[j] + Temp + 1) >> 1); - src_ptr++; - } - - /* Start next row */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_pitch; - } -} - -#define Interp_Extend 3 -static void filter_block2d_6(unsigned char *src_ptr, - unsigned char *output_ptr, - unsigned int src_pixels_per_line, - int output_pitch, - const short *HFilter, - const short *VFilter) { - int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, - 3 + Interp_Extend * 2, 4, HFilter); - - /* then filter verticaly... */ - filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, output_pitch, 4, 4, 4, 4, VFilter); -} - - -void vp9_sixtap_predict_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter); -} - -/* - * The difference between filter_block2d_6() and filter_block2d_avg_6 is - * that filter_block2d_6() does a 6-tap filter and stores it in the output - * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and - * then averages that with the content already present in the output - * ((filter_result + dest + 1) >> 1) and stores that in the output. - */ -static void filter_block2d_avg_6(unsigned char *src_ptr, - unsigned char *output_ptr, - unsigned int src_pixels_per_line, - int output_pitch, - const short *HFilter, - const short *VFilter) { - int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), - FData, src_pixels_per_line, 1, - 3 + Interp_Extend * 2, 4, HFilter); - - /* then filter verticaly... 
*/ - filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr, - output_pitch, 4, 4, 4, 4, VFilter); -} - -void vp9_sixtap_predict_avg_c -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter); -} - -void vp9_sixtap_predict8x8_c -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */ - int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, - 7 + Interp_Extend * 2, 8, HFilter); - - - /* then filter verticaly... */ - filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter); - -} - -void vp9_sixtap_predict_avg8x8_c -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */ - int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, - 7 + Interp_Extend * 2, 8, HFilter); - - /* then filter verticaly... */ - filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter); -} - -void vp9_sixtap_predict8x4_c -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */ - int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, - 3 + Interp_Extend * 2, 8, HFilter); - - - /* then filter verticaly... 
*/ - filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter); - -} - -void vp9_sixtap_predict16x16_c -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - // int FData[(15+Interp_Extend*2)*24]; /* Temp data buffer used in filtering */ - int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */ - - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, - 15 + Interp_Extend * 2, 16, HFilter); - - /* then filter verticaly... */ - filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter); - -} - -void vp9_sixtap_predict_avg16x16_c -( - unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch -) { - const short *HFilter; - const short *VFilter; - // int FData[(15+Interp_Extend*2)*24]; /* Temp data buffer used in filtering */ - int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter); - - /* then filter verticaly... */ - filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, - 16, 16, 16, 16, VFilter); -} - -typedef enum { - VPX_FILTER_4x4 = 0, - VPX_FILTER_8x8 = 1, - VPX_FILTER_8x4 = 2, - VPX_FILTER_16x16 = 3, -} filter_size_t; - -static const unsigned int filter_size_to_wh[][2] = { - {4, 4}, - {8, 8}, - {8, 4}, - {16,16}, -}; - -static const unsigned int filter_max_height = 16; -static const unsigned int filter_max_width = 16; - -static void filter_block2d_8_c(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *HFilter, - const short *VFilter, - const filter_size_t filter_size, - unsigned char *dst_ptr, - unsigned int dst_stride) { - const unsigned int output_width = filter_size_to_wh[filter_size][0]; - const unsigned int output_height = filter_size_to_wh[filter_size][1]; - - // Between passes, we use an intermediate buffer whose height is extended to - // have enough horizontally filtered values as input for the vertical pass. - // This buffer is allocated to be big enough for the largest block type we - // support. - const int kInterp_Extend = 4; - const unsigned int intermediate_height = - (kInterp_Extend - 1) + output_height + kInterp_Extend; - const unsigned int max_intermediate_height = - (kInterp_Extend - 1) + filter_max_height + kInterp_Extend; -#ifdef _MSC_VER - // MSVC does not support C99 style declaration - unsigned char intermediate_buffer[23 * 16]; -#else - unsigned char intermediate_buffer[max_intermediate_height * filter_max_width]; -#endif - const int intermediate_next_stride = 1 - intermediate_height * output_width; - - // Horizontal pass (src -> transposed intermediate). 
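// The intermediate buffer is stored transposed: the horizontally filtered
// sample for source row i, column j lands at
// intermediate_buffer[j * intermediate_height + i]. That is why the inner
// loop advances output_ptr by intermediate_height per output pixel and why
// intermediate_next_stride (1 - intermediate_height * output_width) steps
// back to the slot for the next source row. The payoff comes in the
// vertical pass below, which reads its eight taps from consecutive memory.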
- { - unsigned char *output_ptr = intermediate_buffer; - const int src_next_row_stride = src_stride - output_width; - unsigned int i, j; - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - for (i = 0; i < intermediate_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply filter... - int temp = ((int)src_ptr[0] * HFilter[0]) + - ((int)src_ptr[1] * HFilter[1]) + - ((int)src_ptr[2] * HFilter[2]) + - ((int)src_ptr[3] * HFilter[3]) + - ((int)src_ptr[4] * HFilter[4]) + - ((int)src_ptr[5] * HFilter[5]) + - ((int)src_ptr[6] * HFilter[6]) + - ((int)src_ptr[7] * HFilter[7]) + - (VP9_FILTER_WEIGHT >> 1); // Rounding - - // Normalize back to 0-255... - temp >>= VP9_FILTER_SHIFT; - if (temp < 0) { - temp = 0; - } else if (temp > 255) { - temp = 255; - } - src_ptr++; - *output_ptr = temp; - output_ptr += intermediate_height; - } - src_ptr += src_next_row_stride; - output_ptr += intermediate_next_stride; - } - } - - // Vertical pass (transposed intermediate -> dst). - { - unsigned char *src_ptr = intermediate_buffer; - const int dst_next_row_stride = dst_stride - output_width; - unsigned int i, j; - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply filter... - int temp = ((int)src_ptr[0] * VFilter[0]) + - ((int)src_ptr[1] * VFilter[1]) + - ((int)src_ptr[2] * VFilter[2]) + - ((int)src_ptr[3] * VFilter[3]) + - ((int)src_ptr[4] * VFilter[4]) + - ((int)src_ptr[5] * VFilter[5]) + - ((int)src_ptr[6] * VFilter[6]) + - ((int)src_ptr[7] * VFilter[7]) + - (VP9_FILTER_WEIGHT >> 1); // Rounding - - // Normalize back to 0-255... - temp >>= VP9_FILTER_SHIFT; - if (temp < 0) { - temp = 0; - } else if (temp > 255) { - temp = 255; - } - - src_ptr += intermediate_height; - *dst_ptr++ = (unsigned char)temp; - } - src_ptr += intermediate_next_stride; - dst_ptr += dst_next_row_stride; - } - } -} - -void vp9_filter_block2d_4x4_8_c(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *HFilter_aligned16, - const short *VFilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, - HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_4x4, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_8x4_8_c(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *HFilter_aligned16, - const short *VFilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, - HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_8x4, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_8x8_8_c(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *HFilter_aligned16, - const short *VFilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, - HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_8x8, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_16x16_8_c(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *HFilter_aligned16, - const short *VFilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, - HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_16x16, dst_ptr, dst_stride); -} - -static void block2d_average_c(unsigned char *src, - unsigned int src_stride, - unsigned char *output_ptr, - unsigned int output_stride, - const filter_size_t filter_size) { - const unsigned int output_width = filter_size_to_wh[filter_size][0]; - const unsigned int output_height = 
filter_size_to_wh[filter_size][1]; - - unsigned int i, j; - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1; - } - output_ptr += output_stride; - } -} - -#define block2d_average block2d_average_c - -void vp9_eighttap_predict_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - - HFilter = vp9_sub_pel_filters_8[xoffset]; - VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg4x4_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter = vp9_sub_pel_filters_8[xoffset]; - const short *VFilter = vp9_sub_pel_filters_8[yoffset]; - unsigned char tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - tmp, 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - -void vp9_eighttap_predict_sharp_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - - HFilter = vp9_sub_pel_filters_8s[xoffset]; - VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg4x4_sharp_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const short *VFilter = vp9_sub_pel_filters_8s[yoffset]; - unsigned char tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - tmp, 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - -void vp9_eighttap_predict8x8_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter = vp9_sub_pel_filters_8[xoffset]; - const short *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x8_sharp_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const short *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg8x8_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - unsigned char tmp[8 * 8]; - const short *HFilter = vp9_sub_pel_filters_8[xoffset]; - const short *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - tmp, 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict_avg8x8_sharp_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - unsigned char tmp[8 * 8]; - const short *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const short *VFilter = 
vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           tmp, 8);
-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict8x4_c(unsigned char *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               unsigned char *dst_ptr,
-                               int dst_pitch) {
-  const short *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_sharp_c(unsigned char *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     unsigned char *dst_ptr,
-                                     int dst_pitch) {
-  const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line,
-                           HFilter, VFilter,
-                           dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_c(unsigned char *src_ptr,
-                                 int src_pixels_per_line,
-                                 int xoffset,
-                                 int yoffset,
-                                 unsigned char *dst_ptr,
-                                 int dst_pitch) {
-  const short *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_sharp_c(unsigned char *src_ptr,
-                                       int src_pixels_per_line,
-                                       int xoffset,
-                                       int yoffset,
-                                       unsigned char *dst_ptr,
-                                       int dst_pitch) {
-  const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg16x16_c(unsigned char *src_ptr,
-                                     int src_pixels_per_line,
-                                     int xoffset,
-                                     int yoffset,
-                                     unsigned char *dst_ptr,
-                                     int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
-  const short *HFilter = vp9_sub_pel_filters_8[xoffset];
-  const short *VFilter = vp9_sub_pel_filters_8[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_sharp_c(unsigned char *src_ptr,
-                                           int src_pixels_per_line,
-                                           int xoffset,
-                                           int yoffset,
-                                           unsigned char *dst_ptr,
-                                           int dst_pitch) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16);
-  const short *HFilter = vp9_sub_pel_filters_8s[xoffset];
-  const short *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line,
-                             HFilter, VFilter,
-                             tmp, 16);
-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : UINT8  *src_ptr    : Pointer to source block.
- *                  UINT32  src_stride : Stride of source block.
- *                  UINT32  height     : Block height.
- *                  UINT32  width      : Block width.
- *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT16 *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the horizontal direction to produce the filtered output
- *                  block. Used to implement the first pass of the 2-D
- *                  separable filter.
- *
- *  SPECIAL NOTES : Produces UINT16 output to retain precision for the next
- *                  pass. The two filter taps should sum to VP9_FILTER_WEIGHT.
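- *
- *                  For example, with VP9_FILTER_WEIGHT = 128 and
- *                  VP9_FILTER_SHIFT = 7, the half-pel tap pair is {64, 64},
- *                  so each output is (64 * src[0] + 64 * src[1] + 64) >> 7,
- *                  i.e. the rounded average of the two neighbouring pixels.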
- *
- ****************************************************************************/
-static void filter_block2d_bil_first_pass(unsigned char *src_ptr,
-                                          unsigned short *dst_ptr,
-                                          unsigned int src_stride,
-                                          unsigned int height,
-                                          unsigned int width,
-                                          const short *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply bilinear filter */
-      dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
-                    ((int)src_ptr[1] * vp9_filter[1]) +
-                    (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
-      src_ptr++;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride - width;
-    dst_ptr += width;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : UINT16 *src_ptr    : Pointer to source block.
- *                  UINT32  dst_pitch  : Destination block pitch.
- *                  UINT32  height     : Block height.
- *                  UINT32  width      : Block width.
- *                  INT16  *vp9_filter : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT8  *dst_ptr    : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block
- *                  in the vertical direction to produce the filtered output
- *                  block. Used to implement the second pass of the 2-D
- *                  separable filter.
- *
- *  SPECIAL NOTES : Requires 16-bit input as produced by
- *                  filter_block2d_bil_first_pass. The two filter taps should
- *                  sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_second_pass(unsigned short *src_ptr,
-                                           unsigned char *dst_ptr,
-                                           int dst_pitch,
-                                           unsigned int height,
-                                           unsigned int width,
-                                           const short *vp9_filter) {
-  unsigned int i, j;
-  int Temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[0] * vp9_filter[0]) +
-             ((int)src_ptr[width] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned int)(Temp >> VP9_FILTER_SHIFT);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
-
-/*
- * As before for filter_block2d_second_pass_avg(), the functional difference
- * between filter_block2d_bil_second_pass() and
- * filter_block2d_bil_second_pass_avg() is that the former does a bilinear
- * filter on the input and stores the result in the output, while the latter
- * does a bilinear filter on the input, averages the resulting value with the
- * values already present in the output, and stores the result of that back
- * into the output ((filter_result + dest + 1) >> 1).
- */
-static void filter_block2d_bil_second_pass_avg(unsigned short *src_ptr,
-                                               unsigned char *dst_ptr,
-                                               int dst_pitch,
-                                               unsigned int height,
-                                               unsigned int width,
-                                               const short *vp9_filter) {
-  unsigned int i, j;
-  int Temp;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      /* Apply filter */
-      Temp = ((int)src_ptr[0] * vp9_filter[0]) +
-             ((int)src_ptr[width] * vp9_filter[1]) +
-             (VP9_FILTER_WEIGHT / 2);
-      dst_ptr[j] = (unsigned int)(((Temp >> VP9_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1);
-      src_ptr++;
-    }
-
-    /* Next row... */
-    dst_ptr += dst_pitch;
-  }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil
- *
- *  INPUTS        : UINT8  *src_ptr   : Pointer to source block.
- *                  UINT32  src_pitch : Stride of source block.
- *                  UINT32  dst_pitch : Stride of destination block.
- *                  INT16  *HFilter   : Array of 2 horizontal filter taps.
- *                  INT16  *VFilter   : Array of 2 vertical filter taps.
- *                  INT32   Width     : Block width.
- *                  INT32   Height    : Block height.
- *
- *  OUTPUTS       : UINT8  *dst_ptr   : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : 2-D filters an input block by applying a 2-tap
- *                  bi-linear filter horizontally followed by a 2-tap
- *                  bi-linear filter vertically on the result.
- *
- *  SPECIAL NOTES : The largest block size that can be handled here is 16x16.
- *
- ****************************************************************************/
-static void filter_block2d_bil(unsigned char *src_ptr,
-                               unsigned char *dst_ptr,
-                               unsigned int src_pitch,
-                               unsigned int dst_pitch,
-                               const short *HFilter,
-                               const short *VFilter,
-                               int Width,
-                               int Height) {
-
-  unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-static void filter_block2d_bil_avg(unsigned char *src_ptr,
-                                   unsigned char *dst_ptr,
-                                   unsigned int src_pitch,
-                                   unsigned int dst_pitch,
-                                   const short *HFilter,
-                                   const short *VFilter,
-                                   int Width,
-                                   int Height) {
-  unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */
-
-  /* First filter 1-D horizontally... */
-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
-  /* then 1-D vertically... */
-  filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-void vp9_bilinear_predict4x4_c(unsigned char *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               unsigned char *dst_ptr,
-                               int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict_avg4x4_c(unsigned char *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   unsigned char *dst_ptr,
-                                   int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_c(unsigned char *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               unsigned char *dst_ptr,
-                               int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-
-}
-
-void vp9_bilinear_predict_avg8x8_c(unsigned char *src_ptr,
-                                   int src_pixels_per_line,
-                                   int xoffset,
-                                   int yoffset,
-                                   unsigned char *dst_ptr,
-                                   int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
-                         dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_c(unsigned char *src_ptr,
-                               int src_pixels_per_line,
-                               int xoffset,
-                               int yoffset,
-                               unsigned char *dst_ptr,
-                               int dst_pitch) {
-  const short *HFilter;
-  const short *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  filter_block2d_bil(src_ptr,
dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); - -} - -void vp9_bilinear_predict16x16_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); -} - -void vp9_bilinear_predict_avg16x16_c(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 16, 16); -} diff --git a/vp9/common/filter.h b/vp9/common/filter.h deleted file mode 100644 index c194887dc..000000000 --- a/vp9/common/filter.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef FILTER_H -#define FILTER_H - -#include "vpx_config.h" -#include "vpx_scale/yv12config.h" - -#define BLOCK_HEIGHT_WIDTH 4 -#define VP9_FILTER_WEIGHT 128 -#define VP9_FILTER_SHIFT 7 - -#define SUBPEL_SHIFTS 16 - -extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2]; -extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]; -extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]; -extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]; - -#endif // FILTER_H diff --git a/vp9/common/findnearmv.c b/vp9/common/findnearmv.c deleted file mode 100644 index a551db810..000000000 --- a/vp9/common/findnearmv.c +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "findnearmv.h" -#include "vp9/common/sadmxn.h" -#include "vp9/common/subpelvar.h" -#include - -const unsigned char vp9_mbsplit_offset[4][16] = { - { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} -}; - -static void lower_mv_precision(int_mv *mv, int usehp) -{ - if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) { - if (mv->as_mv.row & 1) - mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); - if (mv->as_mv.col & 1) - mv->as_mv.col += (mv->as_mv.col > 0 ? 
-1 : 1);
-  }
-}
-
-vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc,
-                           vp9_prob p[4], const int context) {
-  p[0] = pc->fc.vp9_mode_contexts[context][0];
-  p[1] = pc->fc.vp9_mode_contexts[context][1];
-  p[2] = pc->fc.vp9_mode_contexts[context][2];
-  p[3] = pc->fc.vp9_mode_contexts[context][3];
-  return p;
-}
-
-#define SP(x) (((x) & 7) << 1)
-unsigned int vp9_sad3x16_c(const unsigned char *src_ptr,
-                           int src_stride,
-                           const unsigned char *ref_ptr,
-                           int ref_stride) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16);
-}
-unsigned int vp9_sad16x3_c(const unsigned char *src_ptr,
-                           int src_stride,
-                           const unsigned char *ref_ptr,
-                           int ref_stride) {
-  return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3);
-}
-
-#if CONFIG_SUBPELREFMV
-unsigned int vp9_variance2x16_c(const unsigned char *src_ptr,
-                                const int source_stride,
-                                const unsigned char *ref_ptr,
-                                const int recon_stride,
-                                unsigned int *sse) {
-  int sum;
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, sse, &sum);
-  return (*sse - (((unsigned int)sum * sum) >> 5));
-}
-
-unsigned int vp9_variance16x2_c(const unsigned char *src_ptr,
-                                const int source_stride,
-                                const unsigned char *ref_ptr,
-                                const int recon_stride,
-                                unsigned int *sse) {
-  int sum;
-  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, sse, &sum);
-  return (*sse - (((unsigned int)sum * sum) >> 5));
-}
-
-unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char *src_ptr,
-                                          const int src_pixels_per_line,
-                                          const int xoffset,
-                                          const int yoffset,
-                                          const unsigned char *dst_ptr,
-                                          const int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[16 * 3];  // Temp data buffer used in filtering
-  unsigned char temp2[2 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    src_pixels_per_line, 1, 3, 16, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter);
-
-  return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
-}
-
-unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char *src_ptr,
-                                          const int src_pixels_per_line,
-                                          const int xoffset,
-                                          const int yoffset,
-                                          const unsigned char *dst_ptr,
-                                          const int dst_pixels_per_line,
-                                          unsigned int *sse) {
-  unsigned short FData3[2 * 17];  // Temp data buffer used in filtering
-  unsigned char temp2[2 * 16];
-  const short *HFilter, *VFilter;
-
-  HFilter = vp9_bilinear_filters[xoffset];
-  VFilter = vp9_bilinear_filters[yoffset];
-
-  var_filter_block2d_bil_first_pass(src_ptr, FData3,
-                                    src_pixels_per_line, 1, 17, 2, HFilter);
-  var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter);
-
-  return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse);
-}
-#endif
-
-/* Check a list of motion vectors by SAD score, using a number of rows of
- * pixels above and a number of columns of pixels to the left, to select
- * the one with the best score to use as the ref motion vector.
- */
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           unsigned char *ref_y_buffer,
-                           int ref_y_stride,
-                           int_mv *mvlist,
-                           int_mv *best_mv,
-                           int_mv *nearest,
-                           int_mv *near) {
-  int i, j;
-  unsigned char *above_src;
-  unsigned char *left_src;
-  unsigned char *above_ref;
-  unsigned char *left_ref;
-  unsigned int score;
-  unsigned int sse;
-  unsigned int ref_scores[MAX_MV_REFS] = {0};
-  int_mv sorted_mvs[MAX_MV_REFS];
-  int zero_seen = FALSE;
-
-  // Default all to 0,0 if nothing else available
-  best_mv->as_int =
nearest->as_int = near->as_int = 0; - vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs)); - -#if CONFIG_SUBPELREFMV - above_src = xd->dst.y_buffer - xd->dst.y_stride * 2; - left_src = xd->dst.y_buffer - 2; - above_ref = ref_y_buffer - ref_y_stride * 2; - left_ref = ref_y_buffer - 2; -#else - above_src = xd->dst.y_buffer - xd->dst.y_stride * 3; - left_src = xd->dst.y_buffer - 3; - above_ref = ref_y_buffer - ref_y_stride * 3; - left_ref = ref_y_buffer - 3; -#endif - - //for(i = 0; i < MAX_MV_REFS; ++i) { - // Limit search to the predicted best 4 - for(i = 0; i < 4; ++i) { - int_mv this_mv; - int offset = 0; - int row_offset, col_offset; - - this_mv.as_int = mvlist[i].as_int; - - // If we see a 0,0 vector for a second time we have reached the end of - // the list of valid candidate vectors. - if (!this_mv.as_int && zero_seen) - break; - - zero_seen = zero_seen || !this_mv.as_int; - - clamp_mv(&this_mv, - xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); - -#if CONFIG_SUBPELREFMV - row_offset = this_mv.as_mv.row >> 3; - col_offset = this_mv.as_mv.col >> 3; - offset = ref_y_stride * row_offset + col_offset; - score = 0; - if (xd->up_available) { - vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride, - SP(this_mv.as_mv.col), - SP(this_mv.as_mv.row), - above_src, xd->dst.y_stride, &sse); - score += sse; -#if CONFIG_SUPERBLOCKS - if (xd->mode_info_context->mbmi.encoded_as_sb) { - vp9_sub_pixel_variance16x2_c(above_ref + offset + 16, - ref_y_stride, - SP(this_mv.as_mv.col), - SP(this_mv.as_mv.row), - above_src + 16, xd->dst.y_stride, &sse); - score += sse; - } -#endif - } - if (xd->left_available) { - vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride, - SP(this_mv.as_mv.col), - SP(this_mv.as_mv.row), - left_src, xd->dst.y_stride, &sse); - score += sse; -#if CONFIG_SUPERBLOCKS - if (xd->mode_info_context->mbmi.encoded_as_sb) { - vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16, - ref_y_stride, - SP(this_mv.as_mv.col), - SP(this_mv.as_mv.row), - left_src + xd->dst.y_stride * 16, - xd->dst.y_stride, &sse); - score += sse; - } -#endif - } -#else - row_offset = (this_mv.as_mv.row > 0) ? - ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3); - col_offset = (this_mv.as_mv.col > 0) ? - ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3); - offset = ref_y_stride * row_offset + col_offset; - score = 0; - if (xd->up_available) { - score += vp9_sad16x3(above_src, xd->dst.y_stride, - above_ref + offset, ref_y_stride); -#if CONFIG_SUPERBLOCKS - if (xd->mode_info_context->mbmi.encoded_as_sb) { - score += vp9_sad16x3(above_src + 16, xd->dst.y_stride, - above_ref + offset + 16, ref_y_stride); - } -#endif - } - if (xd->left_available) { - score += vp9_sad3x16(left_src, xd->dst.y_stride, - left_ref + offset, ref_y_stride); -#if CONFIG_SUPERBLOCKS - if (xd->mode_info_context->mbmi.encoded_as_sb) { - score += vp9_sad3x16(left_src + xd->dst.y_stride * 16, - xd->dst.y_stride, - left_ref + offset + ref_y_stride * 16, - ref_y_stride); - } -#endif - } -#endif - // Add the entry to our list and then resort the list on score. 
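-    // (A single insertion-sort step: the new entry starts at position i and
-    // is swapped toward the front while its score is lower than its
-    // predecessor's, keeping sorted_mvs[] ordered by ascending score, i.e.
-    // best candidate first.)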
-    ref_scores[i] = score;
-    sorted_mvs[i].as_int = this_mv.as_int;
-    j = i;
-    while (j > 0) {
-      if (ref_scores[j] < ref_scores[j-1]) {
-        ref_scores[j] = ref_scores[j-1];
-        sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
-        ref_scores[j-1] = score;
-        sorted_mvs[j-1].as_int = this_mv.as_int;
-        j--;
-      } else
-        break;
-    }
-  }
-
-  // Make sure all the candidates are properly clamped etc.
-  for (i = 0; i < 4; ++i) {
-    lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv);
-    clamp_mv2(&sorted_mvs[i], xd);
-  }
-
-  // Set the best mv to the first entry in the sorted list
-  best_mv->as_int = sorted_mvs[0].as_int;
-
-  // Provided that there are non-zero vectors available, there will not
-  // be more than one 0,0 entry in the sorted list.
-  // The best ref mv is always set to the first entry (which gave the best
-  // results). The nearest is set to the first non-zero vector if available,
-  // and near to the second non-zero vector if available.
-  // We do not use 0,0 as a nearest or near as 0,0 has its own mode.
-  if ( sorted_mvs[0].as_int ) {
-    nearest->as_int = sorted_mvs[0].as_int;
-    if ( sorted_mvs[1].as_int )
-      near->as_int = sorted_mvs[1].as_int;
-    else
-      near->as_int = sorted_mvs[2].as_int;
-  } else {
-    nearest->as_int = sorted_mvs[1].as_int;
-    near->as_int = sorted_mvs[2].as_int;
-  }
-
-  // Copy back the re-ordered mv list
-  vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs));
-}
diff --git a/vp9/common/findnearmv.h b/vp9/common/findnearmv.h
deleted file mode 100644
index 4e6418a51..000000000
--- a/vp9/common/findnearmv.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_FINDNEARMV_H
-#define __INC_FINDNEARMV_H
-
-#include "mv.h"
-#include "blockd.h"
-#include "treecoder.h"
-#include "onyxc_int.h"
-
-/* Check a list of motion vectors by SAD score, using a number of rows of
- * pixels above and a number of columns of pixels to the left, to select
- * the one with the best score to use as the ref motion vector.
- */
-void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
-                           unsigned char *ref_y_buffer,
-                           int ref_y_stride,
-                           int_mv *mvlist,
-                           int_mv *best_mv,
-                           int_mv *nearest,
-                           int_mv *near);
-
-static void mv_bias(int refmb_ref_frame_sign_bias, int refframe,
-                    int_mv *mvp, const int *ref_frame_sign_bias) {
-  MV xmv;
-  xmv = mvp->as_mv;
-
-  if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) {
-    xmv.row *= -1;
-    xmv.col *= -1;
-  }
-
-  mvp->as_mv = xmv;
-}
-
-#define LEFT_TOP_MARGIN (16 << 3)
-#define RIGHT_BOTTOM_MARGIN (16 << 3)
-
-static void clamp_mv(int_mv *mv,
-                     int mb_to_left_edge,
-                     int mb_to_right_edge,
-                     int mb_to_top_edge,
-                     int mb_to_bottom_edge) {
-  mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
-                  mb_to_left_edge : mv->as_mv.col;
-  mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
-                  mb_to_right_edge : mv->as_mv.col;
-  mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
-                  mb_to_top_edge : mv->as_mv.row;
-  mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
- mb_to_bottom_edge : mv->as_mv.row; -} - -static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) { - clamp_mv(mv, - xd->mb_to_left_edge - LEFT_TOP_MARGIN, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); -} - -static unsigned int check_mv_bounds(int_mv *mv, - int mb_to_left_edge, - int mb_to_right_edge, - int mb_to_top_edge, - int mb_to_bottom_edge) { - return (mv->as_mv.col < mb_to_left_edge) || - (mv->as_mv.col > mb_to_right_edge) || - (mv->as_mv.row < mb_to_top_edge) || - (mv->as_mv.row > mb_to_bottom_edge); -} - -vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc, - vp9_prob p[VP9_MVREFS - 1], - const int context); - -extern const unsigned char vp9_mbsplit_offset[4][16]; - -static int left_block_mv(const MODE_INFO *cur_mb, int b) { - if (!(b & 3)) { - /* On L edge, get from MB to left of us */ - --cur_mb; - - if (cur_mb->mbmi.mode != SPLITMV) - return cur_mb->mbmi.mv[0].as_int; - b += 4; - } - - return (cur_mb->bmi + b - 1)->as_mv.first.as_int; -} - -static int left_block_second_mv(const MODE_INFO *cur_mb, int b) { - if (!(b & 3)) { - /* On L edge, get from MB to left of us */ - --cur_mb; - - if (cur_mb->mbmi.mode != SPLITMV) - return cur_mb->mbmi.second_ref_frame > 0 ? - cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int; - b += 4; - } - - return cur_mb->mbmi.second_ref_frame > 0 ? - (cur_mb->bmi + b - 1)->as_mv.second.as_int : - (cur_mb->bmi + b - 1)->as_mv.first.as_int; -} - -static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { - if (!(b >> 2)) { - /* On top edge, get from MB above us */ - cur_mb -= mi_stride; - - if (cur_mb->mbmi.mode != SPLITMV) - return cur_mb->mbmi.mv[0].as_int; - b += 16; - } - - return (cur_mb->bmi + b - 4)->as_mv.first.as_int; -} - -static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { - if (!(b >> 2)) { - /* On top edge, get from MB above us */ - cur_mb -= mi_stride; - - if (cur_mb->mbmi.mode != SPLITMV) - return cur_mb->mbmi.second_ref_frame > 0 ? - cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int; - b += 16; - } - - return cur_mb->mbmi.second_ref_frame > 0 ? 
- (cur_mb->bmi + b - 4)->as_mv.second.as_int : - (cur_mb->bmi + b - 4)->as_mv.first.as_int; -} - -static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { - if (!(b & 3)) { - /* On L edge, get from MB to left of us */ - --cur_mb; - - if (cur_mb->mbmi.mode < I8X8_PRED) { - return pred_mode_conv(cur_mb->mbmi.mode); - } else if (cur_mb->mbmi.mode == I8X8_PRED) { - return pred_mode_conv((cur_mb->bmi + 3 + b)->as_mode.first); - } else if (cur_mb->mbmi.mode == B_PRED) { - return ((cur_mb->bmi + 3 + b)->as_mode.first); - } else { - return B_DC_PRED; - } - } - return (cur_mb->bmi + b - 1)->as_mode.first; -} - -static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, - int b, int mi_stride) { - if (!(b >> 2)) { - /* On top edge, get from MB above us */ - cur_mb -= mi_stride; - - if (cur_mb->mbmi.mode < I8X8_PRED) { - return pred_mode_conv(cur_mb->mbmi.mode); - } else if (cur_mb->mbmi.mode == I8X8_PRED) { - return pred_mode_conv((cur_mb->bmi + 12 + b)->as_mode.first); - } else if (cur_mb->mbmi.mode == B_PRED) { - return ((cur_mb->bmi + 12 + b)->as_mode.first); - } else { - return B_DC_PRED; - } - } - - return (cur_mb->bmi + b - 4)->as_mode.first; -} - -#endif diff --git a/vp9/common/generic/systemdependent.c b/vp9/common/generic/systemdependent.c deleted file mode 100644 index 6d1a271f5..000000000 --- a/vp9/common/generic/systemdependent.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include "vp9_rtcd.h" -#include "vp9/common/subpixel.h" -#include "vp9/common/loopfilter.h" -#include "vp9/common/onyxc_int.h" - -extern void vp9_arch_x86_common_init(VP9_COMMON *ctx); -extern void vp9_arch_arm_common_init(VP9_COMMON *ctx); - -void vp9_machine_specific_config(VP9_COMMON *ctx) { -#if CONFIG_RUNTIME_CPU_DETECT - VP9_COMMON_RTCD *rtcd = &ctx->rtcd; - -#if CONFIG_POSTPROC || (CONFIG_VP9_ENCODER && CONFIG_INTERNAL_STATS) - rtcd->postproc.down = vp9_mbpost_proc_down_c; - rtcd->postproc.across = vp9_mbpost_proc_across_ip_c; - rtcd->postproc.downacross = vp9_post_proc_down_and_across_c; - rtcd->postproc.addnoise = vp9_plane_add_noise_c; - rtcd->postproc.blend_mb_inner = vp9_blend_mb_inner_c; - rtcd->postproc.blend_mb_outer = vp9_blend_mb_outer_c; - rtcd->postproc.blend_b = vp9_blend_b_c; -#endif - -#endif - -#if ARCH_X86 || ARCH_X86_64 - vp9_arch_x86_common_init(ctx); -#endif - -#if ARCH_ARM - vp9_arch_arm_common_init(ctx); -#endif - - vp9_rtcd(); -} diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c new file mode 100644 index 000000000..29d1c3057 --- /dev/null +++ b/vp9/common/generic/vp9_systemdependent.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9_rtcd.h"
+#include "vp9/common/vp9_subpixel.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+extern void vp9_arch_x86_common_init(VP9_COMMON *ctx);
+extern void vp9_arch_arm_common_init(VP9_COMMON *ctx);
+
+void vp9_machine_specific_config(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+
+#if CONFIG_POSTPROC || (CONFIG_VP9_ENCODER && CONFIG_INTERNAL_STATS)
+  rtcd->postproc.down = vp9_mbpost_proc_down_c;
+  rtcd->postproc.across = vp9_mbpost_proc_across_ip_c;
+  rtcd->postproc.downacross = vp9_post_proc_down_and_across_c;
+  rtcd->postproc.addnoise = vp9_plane_add_noise_c;
+  rtcd->postproc.blend_mb_inner = vp9_blend_mb_inner_c;
+  rtcd->postproc.blend_mb_outer = vp9_blend_mb_outer_c;
+  rtcd->postproc.blend_b = vp9_blend_b_c;
+#endif
+
+#endif
+
+#if ARCH_X86 || ARCH_X86_64
+  vp9_arch_x86_common_init(ctx);
+#endif
+
+#if ARCH_ARM
+  vp9_arch_arm_common_init(ctx);
+#endif
+
+  vp9_rtcd();
+}
diff --git a/vp9/common/header.h b/vp9/common/header.h
deleted file mode 100644
index a88b6e3e3..000000000
--- a/vp9/common/header.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_HEADER_H
-#define __INC_HEADER_H
-
-/* 24 bits total */
-typedef struct {
-  unsigned int type: 1;
-  unsigned int version: 3;
-  unsigned int show_frame: 1;
-
-  /* Allow 2^20 bytes = 8 megabits for first partition */
-
-  unsigned int first_partition_length_in_bytes: 19;
-
-#ifdef PACKET_TESTING
-  unsigned int frame_number;
-  unsigned int update_gold: 1;
-  unsigned int uses_gold: 1;
-  unsigned int update_last: 1;
-  unsigned int uses_last: 1;
-#endif
-
-} VP9_HEADER;
-
-#ifdef PACKET_TESTING
-#define VP9_HEADER_SIZE 8
-#else
-#define VP9_HEADER_SIZE 3
-#endif
-
-
-#endif
diff --git a/vp9/common/idctllm.c b/vp9/common/idctllm.c
deleted file mode 100644
index 55b7a8571..000000000
--- a/vp9/common/idctllm.c
+++ /dev/null
@@ -1,1784 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
- * Notes:
- *
- * This implementation makes use of 16-bit fixed-point versions of two
- * multiply constants:
- *   1. sqrt(2) * cos(pi / 8)
- *   2. sqrt(2) * sin(pi / 8)
- * Because the first constant is bigger than 1, to maintain the same 16-bit
- * fixed-point precision as the second one, we use a trick of
- *   x * a = x + x * (a - 1)
- * so
- *   x * sqrt(2) * cos(pi / 8) = x + x * (sqrt(2) * cos(pi / 8) - 1).
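- *
- * In Q16 fixed point this yields the two constants used below:
- *   sqrt(2) * cos(pi / 8) - 1 = 0.30656... -> round(0.30656 * 65536) = 20091
- *   sqrt(2) * sin(pi / 8)     = 0.54120... -> round(0.54120 * 65536) = 35468
- * i.e. cospi8sqrt2minus1 and sinpi8sqrt2, with each product shifted right
- * by 16 after the multiply.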
- **************************************************************************/ -#include -#include -#include "vpx_ports/config.h" -#include "vp9/common/systemdependent.h" - -#include "vp9/common/blockd.h" - -static const int cospi8sqrt2minus1 = 20091; -static const int sinpi8sqrt2 = 35468; -static const int rounding = 0; - -// TODO: these transforms can be further converted into integer forms -// for complexity optimization -static const float idct_4[16] = { - 0.500000000000000, 0.653281482438188, 0.500000000000000, 0.270598050073099, - 0.500000000000000, 0.270598050073099, -0.500000000000000, -0.653281482438188, - 0.500000000000000, -0.270598050073099, -0.500000000000000, 0.653281482438188, - 0.500000000000000, -0.653281482438188, 0.500000000000000, -0.270598050073099 -}; - -static const float iadst_4[16] = { - 0.228013428883779, 0.577350269189626, 0.656538502008139, 0.428525073124360, - 0.428525073124360, 0.577350269189626, -0.228013428883779, -0.656538502008139, - 0.577350269189626, 0, -0.577350269189626, 0.577350269189626, - 0.656538502008139, -0.577350269189626, 0.428525073124359, -0.228013428883779 -}; - -static const float idct_8[64] = { - 0.353553390593274, 0.490392640201615, 0.461939766255643, 0.415734806151273, - 0.353553390593274, 0.277785116509801, 0.191341716182545, 0.097545161008064, - 0.353553390593274, 0.415734806151273, 0.191341716182545, -0.097545161008064, - -0.353553390593274, -0.490392640201615, -0.461939766255643, -0.277785116509801, - 0.353553390593274, 0.277785116509801, -0.191341716182545, -0.490392640201615, - -0.353553390593274, 0.097545161008064, 0.461939766255643, 0.415734806151273, - 0.353553390593274, 0.097545161008064, -0.461939766255643, -0.277785116509801, - 0.353553390593274, 0.415734806151273, -0.191341716182545, -0.490392640201615, - 0.353553390593274, -0.097545161008064, -0.461939766255643, 0.277785116509801, - 0.353553390593274, -0.415734806151273, -0.191341716182545, 0.490392640201615, - 0.353553390593274, -0.277785116509801, -0.191341716182545, 0.490392640201615, - -0.353553390593274, -0.097545161008064, 0.461939766255643, -0.415734806151273, - 0.353553390593274, -0.415734806151273, 0.191341716182545, 0.097545161008064, - -0.353553390593274, 0.490392640201615, -0.461939766255643, 0.277785116509801, - 0.353553390593274, -0.490392640201615, 0.461939766255643, -0.415734806151273, - 0.353553390593274, -0.277785116509801, 0.191341716182545, -0.097545161008064 -}; - -static const float iadst_8[64] = { - 0.089131608307533, 0.255357107325376, 0.387095214016349, 0.466553967085785, - 0.483002021635509, 0.434217976756762, 0.326790388032145, 0.175227946595735, - 0.175227946595735, 0.434217976756762, 0.466553967085785, 0.255357107325376, - -0.089131608307533, -0.387095214016348, -0.483002021635509, -0.326790388032145, - 0.255357107325376, 0.483002021635509, 0.175227946595735, -0.326790388032145, - -0.466553967085785, -0.089131608307533, 0.387095214016349, 0.434217976756762, - 0.326790388032145, 0.387095214016349, -0.255357107325376, -0.434217976756762, - 0.175227946595735, 0.466553967085786, -0.089131608307534, -0.483002021635509, - 0.387095214016349, 0.175227946595735, -0.483002021635509, 0.089131608307533, - 0.434217976756762, -0.326790388032145, -0.255357107325377, 0.466553967085785, - 0.434217976756762, -0.089131608307533, -0.326790388032145, 0.483002021635509, - -0.255357107325376, -0.175227946595735, 0.466553967085785, -0.387095214016348, - 0.466553967085785, -0.326790388032145, 0.089131608307533, 0.175227946595735, - -0.387095214016348, 0.483002021635509, 
-0.434217976756762, 0.255357107325376, - 0.483002021635509, -0.466553967085785, 0.434217976756762, -0.387095214016348, - 0.326790388032145, -0.255357107325375, 0.175227946595736, -0.089131608307532 -}; - -static const int16_t idct_i4[16] = { - 8192, 10703, 8192, 4433, - 8192, 4433, -8192, -10703, - 8192, -4433, -8192, 10703, - 8192, -10703, 8192, -4433 -}; - -static const int16_t iadst_i4[16] = { - 3736, 9459, 10757, 7021, - 7021, 9459, -3736, -10757, - 9459, 0, -9459, 9459, - 10757, -9459, 7021, -3736 -}; - -static const int16_t idct_i8[64] = { - 5793, 8035, 7568, 6811, - 5793, 4551, 3135, 1598, - 5793, 6811, 3135, -1598, - -5793, -8035, -7568, -4551, - 5793, 4551, -3135, -8035, - -5793, 1598, 7568, 6811, - 5793, 1598, -7568, -4551, - 5793, 6811, -3135, -8035, - 5793, -1598, -7568, 4551, - 5793, -6811, -3135, 8035, - 5793, -4551, -3135, 8035, - -5793, -1598, 7568, -6811, - 5793, -6811, 3135, 1598, - -5793, 8035, -7568, 4551, - 5793, -8035, 7568, -6811, - 5793, -4551, 3135, -1598 -}; - -static const int16_t iadst_i8[64] = { - 1460, 4184, 6342, 7644, - 7914, 7114, 5354, 2871, - 2871, 7114, 7644, 4184, - -1460, -6342, -7914, -5354, - 4184, 7914, 2871, -5354, - -7644, -1460, 6342, 7114, - 5354, 6342, -4184, -7114, - 2871, 7644, -1460, -7914, - 6342, 2871, -7914, 1460, - 7114, -5354, -4184, 7644, - 7114, -1460, -5354, 7914, - -4184, -2871, 7644, -6342, - 7644, -5354, 1460, 2871, - -6342, 7914, -7114, 4184, - 7914, -7644, 7114, -6342, - 5354, -4184, 2871, -1460 -}; - -static float idct_16[256] = { - 0.250000, 0.351851, 0.346760, 0.338330, 0.326641, 0.311806, 0.293969, 0.273300, - 0.250000, 0.224292, 0.196424, 0.166664, 0.135299, 0.102631, 0.068975, 0.034654, - 0.250000, 0.338330, 0.293969, 0.224292, 0.135299, 0.034654, -0.068975, -0.166664, - -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631, - 0.250000, 0.311806, 0.196424, 0.034654, -0.135299, -0.273300, -0.346760, -0.338330, - -0.250000, -0.102631, 0.068975, 0.224292, 0.326641, 0.351851, 0.293969, 0.166664, - 0.250000, 0.273300, 0.068975, -0.166664, -0.326641, -0.338330, -0.196424, 0.034654, - 0.250000, 0.351851, 0.293969, 0.102631, -0.135299, -0.311806, -0.346760, -0.224292, - 0.250000, 0.224292, -0.068975, -0.311806, -0.326641, -0.102631, 0.196424, 0.351851, - 0.250000, -0.034654, -0.293969, -0.338330, -0.135299, 0.166664, 0.346760, 0.273300, - 0.250000, 0.166664, -0.196424, -0.351851, -0.135299, 0.224292, 0.346760, 0.102631, - -0.250000, -0.338330, -0.068975, 0.273300, 0.326641, 0.034654, -0.293969, -0.311806, - 0.250000, 0.102631, -0.293969, -0.273300, 0.135299, 0.351851, 0.068975, -0.311806, - -0.250000, 0.166664, 0.346760, 0.034654, -0.326641, -0.224292, 0.196424, 0.338330, - 0.250000, 0.034654, -0.346760, -0.102631, 0.326641, 0.166664, -0.293969, -0.224292, - 0.250000, 0.273300, -0.196424, -0.311806, 0.135299, 0.338330, -0.068975, -0.351851, - 0.250000, -0.034654, -0.346760, 0.102631, 0.326641, -0.166664, -0.293969, 0.224292, - 0.250000, -0.273300, -0.196424, 0.311806, 0.135299, -0.338330, -0.068975, 0.351851, - 0.250000, -0.102631, -0.293969, 0.273300, 0.135299, -0.351851, 0.068975, 0.311806, - -0.250000, -0.166664, 0.346760, -0.034654, -0.326641, 0.224292, 0.196424, -0.338330, - 0.250000, -0.166664, -0.196424, 0.351851, -0.135299, -0.224292, 0.346760, -0.102631, - -0.250000, 0.338330, -0.068975, -0.273300, 0.326641, -0.034654, -0.293969, 0.311806, - 0.250000, -0.224292, -0.068975, 0.311806, -0.326641, 0.102631, 0.196424, -0.351851, - 0.250000, 0.034654, -0.293969, 0.338330, -0.135299, -0.166664, 
0.346760, -0.273300, - 0.250000, -0.273300, 0.068975, 0.166664, -0.326641, 0.338330, -0.196424, -0.034654, - 0.250000, -0.351851, 0.293969, -0.102631, -0.135299, 0.311806, -0.346760, 0.224292, - 0.250000, -0.311806, 0.196424, -0.034654, -0.135299, 0.273300, -0.346760, 0.338330, - -0.250000, 0.102631, 0.068975, -0.224292, 0.326641, -0.351851, 0.293969, -0.166664, - 0.250000, -0.338330, 0.293969, -0.224292, 0.135299, -0.034654, -0.068975, 0.166664, - -0.250000, 0.311806, -0.346760, 0.351851, -0.326641, 0.273300, -0.196424, 0.102631, - 0.250000, -0.351851, 0.346760, -0.338330, 0.326641, -0.311806, 0.293969, -0.273300, - 0.250000, -0.224292, 0.196424, -0.166664, 0.135299, -0.102631, 0.068975, -0.034654 -}; - -static float iadst_16[256] = { - 0.033094, 0.098087, 0.159534, 0.215215, 0.263118, 0.301511, 0.329007, 0.344612, - 0.347761, 0.338341, 0.316693, 0.283599, 0.240255, 0.188227, 0.129396, 0.065889, - 0.065889, 0.188227, 0.283599, 0.338341, 0.344612, 0.301511, 0.215215, 0.098087, - -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396, - 0.098087, 0.263118, 0.344612, 0.316693, 0.188227, 0.000000, -0.188227, -0.316693, - -0.344612, -0.263118, -0.098087, 0.098087, 0.263118, 0.344612, 0.316693, 0.188227, - 0.129396, 0.316693, 0.329007, 0.159534, -0.098087, -0.301511, -0.338341, -0.188227, - 0.065889, 0.283599, 0.344612, 0.215215, -0.033094, -0.263118, -0.347761, -0.240255, - 0.159534, 0.344612, 0.240255, -0.065889, -0.316693, -0.301511, -0.033094, 0.263118, - 0.338341, 0.129396, -0.188227, -0.347761, -0.215215, 0.098087, 0.329007, 0.283599, - 0.188227, 0.344612, 0.098087, -0.263118, -0.316693, -0.000000, 0.316693, 0.263118, - -0.098087, -0.344612, -0.188227, 0.188227, 0.344612, 0.098087, -0.263118, -0.316693, - 0.215215, 0.316693, -0.065889, -0.347761, -0.098087, 0.301511, 0.240255, -0.188227, - -0.329007, 0.033094, 0.344612, 0.129396, -0.283599, -0.263118, 0.159534, 0.338341, - 0.240255, 0.263118, -0.215215, -0.283599, 0.188227, 0.301511, -0.159534, -0.316693, - 0.129396, 0.329007, -0.098087, -0.338341, 0.065889, 0.344612, -0.033094, -0.347761, - 0.263118, 0.188227, -0.316693, -0.098087, 0.344612, 0.000000, -0.344612, 0.098087, - 0.316693, -0.188227, -0.263118, 0.263118, 0.188227, -0.316693, -0.098087, 0.344612, - 0.283599, 0.098087, -0.347761, 0.129396, 0.263118, -0.301511, -0.065889, 0.344612, - -0.159534, -0.240255, 0.316693, 0.033094, -0.338341, 0.188227, 0.215215, -0.329007, - 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, - -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, - 0.316693, -0.098087, -0.188227, 0.344612, -0.263118, -0.000000, 0.263118, -0.344612, - 0.188227, 0.098087, -0.316693, 0.316693, -0.098087, -0.188227, 0.344612, -0.263118, - 0.329007, -0.188227, -0.033094, 0.240255, -0.344612, 0.301511, -0.129396, -0.098087, - 0.283599, -0.347761, 0.263118, -0.065889, -0.159534, 0.316693, -0.338341, 0.215215, - 0.338341, -0.263118, 0.129396, 0.033094, -0.188227, 0.301511, -0.347761, 0.316693, - -0.215215, 0.065889, 0.098087, -0.240255, 0.329007, -0.344612, 0.283599, -0.159534, - 0.344612, -0.316693, 0.263118, -0.188227, 0.098087, 0.000000, -0.098087, 0.188227, - -0.263118, 0.316693, -0.344612, 0.344612, -0.316693, 0.263118, -0.188227, 0.098087, - 0.347761, -0.344612, 0.338341, -0.329007, 0.316693, -0.301511, 0.283599, -0.263118, - 0.240255, -0.215215, 0.188227, -0.159534, 0.129396, -0.098087, 0.065889, -0.033094 -}; - -static const int16_t idct_i16[256] = { - 4096, 5765, 5681, 5543, 
5352, 5109, 4816, 4478, - 4096, 3675, 3218, 2731, 2217, 1682, 1130, 568, - 4096, 5543, 4816, 3675, 2217, 568, -1130, -2731, - -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682, - 4096, 5109, 3218, 568, -2217, -4478, -5681, -5543, - -4096, -1682, 1130, 3675, 5352, 5765, 4816, 2731, - 4096, 4478, 1130, -2731, -5352, -5543, -3218, 568, - 4096, 5765, 4816, 1682, -2217, -5109, -5681, -3675, - 4096, 3675, -1130, -5109, -5352, -1682, 3218, 5765, - 4096, -568, -4816, -5543, -2217, 2731, 5681, 4478, - 4096, 2731, -3218, -5765, -2217, 3675, 5681, 1682, - -4096, -5543, -1130, 4478, 5352, 568, -4816, -5109, - 4096, 1682, -4816, -4478, 2217, 5765, 1130, -5109, - -4096, 2731, 5681, 568, -5352, -3675, 3218, 5543, - 4096, 568, -5681, -1682, 5352, 2731, -4816, -3675, - 4096, 4478, -3218, -5109, 2217, 5543, -1130, -5765, - 4096, -568, -5681, 1682, 5352, -2731, -4816, 3675, - 4096, -4478, -3218, 5109, 2217, -5543, -1130, 5765, - 4096, -1682, -4816, 4478, 2217, -5765, 1130, 5109, - -4096, -2731, 5681, -568, -5352, 3675, 3218, -5543, - 4096, -2731, -3218, 5765, -2217, -3675, 5681, -1682, - -4096, 5543, -1130, -4478, 5352, -568, -4816, 5109, - 4096, -3675, -1130, 5109, -5352, 1682, 3218, -5765, - 4096, 568, -4816, 5543, -2217, -2731, 5681, -4478, - 4096, -4478, 1130, 2731, -5352, 5543, -3218, -568, - 4096, -5765, 4816, -1682, -2217, 5109, -5681, 3675, - 4096, -5109, 3218, -568, -2217, 4478, -5681, 5543, - -4096, 1682, 1130, -3675, 5352, -5765, 4816, -2731, - 4096, -5543, 4816, -3675, 2217, -568, -1130, 2731, - -4096, 5109, -5681, 5765, -5352, 4478, -3218, 1682, - 4096, -5765, 5681, -5543, 5352, -5109, 4816, -4478, - 4096, -3675, 3218, -2731, 2217, -1682, 1130, -568 -}; - -static const int16_t iadst_i16[256] = { - 542, 1607, 2614, 3526, 4311, 4940, 5390, 5646, - 5698, 5543, 5189, 4646, 3936, 3084, 2120, 1080, - 1080, 3084, 4646, 5543, 5646, 4940, 3526, 1607, - -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120, - 1607, 4311, 5646, 5189, 3084, 0, -3084, -5189, - -5646, -4311, -1607, 1607, 4311, 5646, 5189, 3084, - 2120, 5189, 5390, 2614, -1607, -4940, -5543, -3084, - 1080, 4646, 5646, 3526, -542, -4311, -5698, -3936, - 2614, 5646, 3936, -1080, -5189, -4940, -542, 4311, - 5543, 2120, -3084, -5698, -3526, 1607, 5390, 4646, - 3084, 5646, 1607, -4311, -5189, 0, 5189, 4311, - -1607, -5646, -3084, 3084, 5646, 1607, -4311, -5189, - 3526, 5189, -1080, -5698, -1607, 4940, 3936, -3084, - -5390, 542, 5646, 2120, -4646, -4311, 2614, 5543, - 3936, 4311, -3526, -4646, 3084, 4940, -2614, -5189, - 2120, 5390, -1607, -5543, 1080, 5646, -542, -5698, - 4311, 3084, -5189, -1607, 5646, 0, -5646, 1607, - 5189, -3084, -4311, 4311, 3084, -5189, -1607, 5646, - 4646, 1607, -5698, 2120, 4311, -4940, -1080, 5646, - -2614, -3936, 5189, 542, -5543, 3084, 3526, -5390, - 4940, 0, -4940, 4940, 0, -4940, 4940, 0, - -4940, 4940, 0, -4940, 4940, 0, -4940, 4940, - 5189, -1607, -3084, 5646, -4311, 0, 4311, -5646, - 3084, 1607, -5189, 5189, -1607, -3084, 5646, -4311, - 5390, -3084, -542, 3936, -5646, 4940, -2120, -1607, - 4646, -5698, 4311, -1080, -2614, 5189, -5543, 3526, - 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189, - -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614, - 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084, - -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607, - 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311, - 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542 -}; - -/* For test */ -#define TEST_INT 1 -#if TEST_INT -#define vp9_ihtllm_int_c vp9_ihtllm_c -#else -#define vp9_ihtllm_float_c vp9_ihtllm_c -#endif - -void 
vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch, - TX_TYPE tx_type, int tx_dim) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - int i, j, k; - float bufa[256], bufb[256]; // buffers are for floating-point test purpose - // the implementation could be simplified in - // conjunction with integer transform - const int16_t *ip = input; - int16_t *op = output; - int shortpitch = pitch >> 1; - - float *pfa = &bufa[0]; - float *pfb = &bufb[0]; - - // pointers to vertical and horizontal transforms - const float *ptv, *pth; - - assert(tx_type != DCT_DCT); - // load and convert residual array into floating-point - for(j = 0; j < tx_dim; j++) { - for(i = 0; i < tx_dim; i++) { - pfa[i] = (float)ip[i]; - } - pfa += tx_dim; - ip += tx_dim; - } - - // vertical transformation - pfa = &bufa[0]; - pfb = &bufb[0]; - - switch(tx_type) { - case ADST_ADST : - case ADST_DCT : - ptv = (tx_dim == 4) ? &iadst_4[0] : - ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]); - break; - - default : - ptv = (tx_dim == 4) ? &idct_4[0] : - ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]); - break; - } - - for(j = 0; j < tx_dim; j++) { - for(i = 0; i < tx_dim; i++) { - pfb[i] = 0 ; - for(k = 0; k < tx_dim; k++) { - pfb[i] += ptv[k] * pfa[(k * tx_dim)]; - } - pfa += 1; - } - - pfb += tx_dim; - ptv += tx_dim; - pfa = &bufa[0]; - } - - // horizontal transformation - pfa = &bufa[0]; - pfb = &bufb[0]; - - switch(tx_type) { - case ADST_ADST : - case DCT_ADST : - pth = (tx_dim == 4) ? &iadst_4[0] : - ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]); - break; - - default : - pth = (tx_dim == 4) ? &idct_4[0] : - ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]); - break; - } - - for(j = 0; j < tx_dim; j++) { - for(i = 0; i < tx_dim; i++) { - pfa[i] = 0; - for(k = 0; k < tx_dim; k++) { - pfa[i] += pfb[k] * pth[k]; - } - pth += tx_dim; - } - - pfa += tx_dim; - pfb += tx_dim; - - switch(tx_type) { - case ADST_ADST : - case DCT_ADST : - pth = (tx_dim == 4) ? &iadst_4[0] : - ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]); - break; - - default : - pth = (tx_dim == 4) ? &idct_4[0] : - ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]); - break; - } - } - - // convert to short integer format and load BLOCKD buffer - op = output; - pfa = &bufa[0]; - - for(j = 0; j < tx_dim; j++) { - for(i = 0; i < tx_dim; i++) { - op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) : - -(int16_t)( - pfa[i] / 8 + 0.49); - } - - op += shortpitch; - pfa += tx_dim; - } - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -/* Converted the transforms to integer form. */ -#define VERTICAL_SHIFT 14 // 16 -#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1) -#define HORIZONTAL_SHIFT 17 // 15 -#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1) -void vp9_ihtllm_int_c(const int16_t *input, int16_t *output, int pitch, - TX_TYPE tx_type, int tx_dim) { - int i, j, k; - int16_t imbuf[256]; - - const int16_t *ip = input; - int16_t *op = output; - int16_t *im = &imbuf[0]; - - /* pointers to vertical and horizontal transforms. */ - const int16_t *ptv = NULL, *pth = NULL; - int shortpitch = pitch >> 1; - - switch (tx_type) { - case ADST_ADST : - ptv = pth = (tx_dim == 4) ? &iadst_i4[0] - : ((tx_dim == 8) ? &iadst_i8[0] - : &iadst_i16[0]); - break; - case ADST_DCT : - ptv = (tx_dim == 4) ? &iadst_i4[0] - : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]); - pth = (tx_dim == 4) ? &idct_i4[0] - : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]); - break; - case DCT_ADST : - ptv = (tx_dim == 4) ? &idct_i4[0] - : ((tx_dim == 8) ? 
&idct_i8[0] : &idct_i16[0]); - pth = (tx_dim == 4) ? &iadst_i4[0] - : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]); - break; - case DCT_DCT : - ptv = pth = (tx_dim == 4) ? &idct_i4[0] - : ((tx_dim == 8) ? &idct_i8[0] - : &idct_i16[0]); - break; - default: - assert(0); - break; - } - - /* vertical transformation */ - for (j = 0; j < tx_dim; j++) { - for (i = 0; i < tx_dim; i++) { - int temp = 0; - - for (k = 0; k < tx_dim; k++) { - temp += ptv[k] * ip[(k * tx_dim)]; - } - - im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT); - ip++; - } - im += tx_dim; // 16 - ptv += tx_dim; - ip = input; - } - - /* horizontal transformation */ - im = &imbuf[0]; - - for (j = 0; j < tx_dim; j++) { - const int16_t *pthc = pth; - - for (i = 0; i < tx_dim; i++) { - int temp = 0; - - for (k = 0; k < tx_dim; k++) { - temp += im[k] * pthc[k]; - } - - op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT); - pthc += tx_dim; - } - - im += tx_dim; // 16 - op += shortpitch; - } -} - -void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) { - int i; - int a1, b1, c1, d1; - - short *ip = input; - short *op = output; - int temp1, temp2; - int shortpitch = pitch >> 1; - - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[8]; - b1 = ip[0] - ip[8]; - - temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16; - temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16); - c1 = temp1 - temp2; - - temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16); - temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16; - d1 = temp1 + temp2; - - op[shortpitch * 0] = a1 + d1; - op[shortpitch * 3] = a1 - d1; - - op[shortpitch * 1] = b1 + c1; - op[shortpitch * 2] = b1 - c1; - - ip++; - op++; - } - - ip = output; - op = output; - - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[2]; - b1 = ip[0] - ip[2]; - - temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16; - temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16); - c1 = temp1 - temp2; - - temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16); - temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16; - d1 = temp1 + temp2; - - op[0] = (a1 + d1 + 16) >> 5; - op[3] = (a1 - d1 + 16) >> 5; - - op[1] = (b1 + c1 + 16) >> 5; - op[2] = (b1 - c1 + 16) >> 5; - - ip += shortpitch; - op += shortpitch; - } -} - -void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) { - int i; - int a1; - short *op = output; - int shortpitch = pitch >> 1; - a1 = ((input[0] + 16) >> 5); - for (i = 0; i < 4; i++) { - op[0] = a1; - op[1] = a1; - op[2] = a1; - op[3] = a1; - op += shortpitch; - } -} - -void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, - unsigned char *dst_ptr, int pitch, int stride) { - int a1 = ((input_dc + 16) >> 5); - int r, c; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int a = a1 + pred_ptr[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dst_ptr[c] = (unsigned char) a; - } - - dst_ptr += stride; - pred_ptr += pitch; - } -} - -void vp9_short_inv_walsh4x4_c(short *input, short *output) { - int i; - int a1, b1, c1, d1; - short *ip = input; - short *op = output; - - for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[3])); - b1 = ((ip[1] + ip[2])); - c1 = ((ip[1] - ip[2])); - d1 = ((ip[0] - ip[3])); - - op[0] = (a1 + b1 + 1) >> 1; - op[1] = (c1 + d1) >> 1; - op[2] = (a1 - b1) >> 1; - op[3] = (d1 - c1) >> 1; - - ip += 4; - op += 4; - } - - ip = output; - op = output; - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[12]; - b1 = ip[4] + ip[8]; - c1 = ip[4] - ip[8]; - d1 = ip[0] - ip[12]; - op[0] = (a1 + b1 + 1) >> 1; - op[4] 
= (c1 + d1) >> 1; - op[8] = (a1 - b1) >> 1; - op[12] = (d1 - c1) >> 1; - ip++; - op++; - } -} - -void vp9_short_inv_walsh4x4_1_c(short *in, short *out) { - int i; - short tmp[4]; - short *ip = in; - short *op = tmp; - - op[0] = (ip[0] + 1) >> 1; - op[1] = op[2] = op[3] = (ip[0] >> 1); - - ip = tmp; - op = out; - for (i = 0; i < 4; i++) { - op[0] = (ip[0] + 1) >> 1; - op[4] = op[8] = op[12] = (ip[0] >> 1); - ip++; - op++; - } -} - -#if CONFIG_LOSSLESS -void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) { - int i; - int a1, b1, c1, d1; - short *ip = input; - short *op = output; - - for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR; - b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR; - c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR; - d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR; - - op[0] = (a1 + b1 + 1) >> 1; - op[1] = (c1 + d1) >> 1; - op[2] = (a1 - b1) >> 1; - op[3] = (d1 - c1) >> 1; - - ip += 4; - op += 4; - } - - ip = output; - op = output; - for (i = 0; i < 4; i++) { - a1 = ip[0] + ip[12]; - b1 = ip[4] + ip[8]; - c1 = ip[4] - ip[8]; - d1 = ip[0] - ip[12]; - - - op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - - ip++; - op++; - } -} - -void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) { - int i; - short tmp[4]; - short *ip = in; - short *op = tmp; - - op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1; - op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1); - - ip = tmp; - op = out; - for (i = 0; i < 4; i++) { - op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; - op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR; - ip++; - op++; - } -} - -void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) { - int i; - int a1, b1, c1, d1; - short *ip = input; - short *op = output; - int shortpitch = pitch >> 1; - - for (i = 0; i < 4; i++) { - a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR; - b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR; - c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR; - d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR; - - op[0] = (a1 + b1 + 1) >> 1; - op[1] = (c1 + d1) >> 1; - op[2] = (a1 - b1) >> 1; - op[3] = (d1 - c1) >> 1; - - ip += 4; - op += shortpitch; - } - - ip = output; - op = output; - for (i = 0; i < 4; i++) { - a1 = ip[shortpitch * 0] + ip[shortpitch * 3]; - b1 = ip[shortpitch * 1] + ip[shortpitch * 2]; - c1 = ip[shortpitch * 1] - ip[shortpitch * 2]; - d1 = ip[shortpitch * 0] - ip[shortpitch * 3]; - - - op[shortpitch * 0] = (a1 + b1 + 1) >> 1; - op[shortpitch * 1] = (c1 + d1) >> 1; - op[shortpitch * 2] = (a1 - b1) >> 1; - op[shortpitch * 3] = (d1 - c1) >> 1; - - ip++; - op++; - } -} - -void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) { - int i; - short tmp[4]; - short *ip = in; - short *op = tmp; - int shortpitch = pitch >> 1; - - op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; - op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1); - - - ip = tmp; - op = out; - for (i = 0; i < 4; i++) { - op[shortpitch * 0] = (ip[0] + 1) >> 1; - op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1; - ip++; - op++; - } -} - -void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr, - unsigned char *dst_ptr, - int pitch, int stride) { - int r, c; - short tmp[16]; - vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1); - - for (r = 0; r < 4; r++) { - for (c = 
0; c < 4; c++) { - int a = tmp[r * 4 + c] + pred_ptr[c]; - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dst_ptr[c] = (unsigned char) a; - } - - dst_ptr += stride; - pred_ptr += pitch; - } -} -#endif - -void vp9_dc_only_idct_add_8x8_c(short input_dc, - unsigned char *pred_ptr, - unsigned char *dst_ptr, - int pitch, int stride) { - int a1 = ((input_dc + 16) >> 5); - int r, c, b; - unsigned char *orig_pred = pred_ptr; - unsigned char *orig_dst = dst_ptr; - for (b = 0; b < 4; b++) { - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int a = a1 + pred_ptr[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dst_ptr[c] = (unsigned char) a; - } - - dst_ptr += stride; - pred_ptr += pitch; - } - dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride; - pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch; - } -} - -#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */ -#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */ -#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */ -#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */ -#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */ -#define W7 565 /* 2048*sqrt(2)*cos(7*pi/16) */ - -/* row (horizontal) IDCT - * - * 7 pi 1 dst[k] = sum c[l] * src[l] * cos( -- * - * ( k + - ) * l ) l=0 8 2 - * - * where: c[0] = 128 c[1..7] = 128*sqrt(2) */ - -static void idctrow(int *blk) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - /* shortcut */ - if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) | - (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) { - blk[0] = blk[1] = blk[2] = blk[3] = blk[4] - = blk[5] = blk[6] = blk[7] = blk[0] << 3; - return; - } - - x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */ - /* first stage */ - x8 = W7 * (x4 + x5); - x4 = x8 + (W1 - W7) * x4; - x5 = x8 - (W1 + W7) * x5; - x8 = W3 * (x6 + x7); - x6 = x8 - (W3 - W5) * x6; - x7 = x8 - (W3 + W5) * x7; - - /* second stage */ - x8 = x0 + x1; - x0 -= x1; - x1 = W6 * (x3 + x2); - x2 = x1 - (W2 + W6) * x2; - x3 = x1 + (W2 - W6) * x3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x8 + x3; - x8 -= x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - blk[0] = (x7 + x1) >> 8; - blk[1] = (x3 + x2) >> 8; - blk[2] = (x0 + x4) >> 8; - blk[3] = (x8 + x6) >> 8; - blk[4] = (x8 - x6) >> 8; - blk[5] = (x0 - x4) >> 8; - blk[6] = (x3 - x2) >> 8; - blk[7] = (x7 - x1) >> 8; -} - -/* column (vertical) IDCT - * - * 7 pi 1 dst[8*k] = sum c[l] * src[8*l] * - * cos( -- * ( k + - ) * l ) l=0 8 2 - * - * where: c[0] = 1/1024 c[1..7] = (1/1024)*sqrt(2) */ -static void idctcol(int *blk) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - - /* shortcut */ - if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) | - (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | - (x7 = blk[8 * 3]))) { - blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] - = blk[8 * 4] = blk[8 * 5] = blk[8 * 6] - = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6); - return; - } - - x0 = (blk[8 * 0] << 8) + 16384; - - /* first stage */ - x8 = W7 * (x4 + x5) + 4; - x4 = (x8 + (W1 - W7) * x4) >> 3; - x5 = (x8 - (W1 + W7) * x5) >> 3; - x8 = W3 * (x6 + x7) + 4; - x6 = (x8 - (W3 - W5) * x6) >> 3; - x7 = (x8 - (W3 + W5) * x7) >> 3; - - /* second stage */ - x8 = x0 + x1; - x0 -= x1; - x1 = W6 * (x3 + x2) + 4; - x2 = (x1 - (W2 + W6) * x2) >> 3; - x3 = (x1 + (W2 - W6) * x3) >> 3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x8 + x3; - x8 
-= x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - blk[8 * 0] = (x7 + x1) >> 14; - blk[8 * 1] = (x3 + x2) >> 14; - blk[8 * 2] = (x0 + x4) >> 14; - blk[8 * 3] = (x8 + x6) >> 14; - blk[8 * 4] = (x8 - x6) >> 14; - blk[8 * 5] = (x0 - x4) >> 14; - blk[8 * 6] = (x3 - x2) >> 14; - blk[8 * 7] = (x7 - x1) >> 14; -} - -#define TX_DIM 8 -void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) { - int X[TX_DIM * TX_DIM]; - int i, j; - int shortpitch = pitch >> 1; - - for (i = 0; i < TX_DIM; i++) { - for (j = 0; j < TX_DIM; j++) { - X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 - + (coefs[i * TX_DIM + j] < 0)) >> 2; - } - } - for (i = 0; i < 8; i++) - idctrow(X + 8 * i); - - for (i = 0; i < 8; i++) - idctcol(X + i); - - for (i = 0; i < TX_DIM; i++) { - for (j = 0; j < TX_DIM; j++) { - block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1; - } - } -} - -/* Row IDCT when only first 4 coefficients are non-zero. */ -static void idctrow10(int *blk) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - - /* shortcut */ - if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) | - (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) { - blk[0] = blk[1] = blk[2] = blk[3] = blk[4] - = blk[5] = blk[6] = blk[7] = blk[0] << 3; - return; - } - - x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */ - /* first stage */ - x5 = W7 * x4; - x4 = W1 * x4; - x6 = W3 * x7; - x7 = -W5 * x7; - - /* second stage */ - x2 = W6 * x3; - x3 = W2 * x3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x0 + x3; - x8 = x0 - x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - blk[0] = (x7 + x1) >> 8; - blk[1] = (x3 + x2) >> 8; - blk[2] = (x0 + x4) >> 8; - blk[3] = (x8 + x6) >> 8; - blk[4] = (x8 - x6) >> 8; - blk[5] = (x0 - x4) >> 8; - blk[6] = (x3 - x2) >> 8; - blk[7] = (x7 - x1) >> 8; -} - -/* Column (vertical) IDCT when only first 4 coefficients are non-zero. 
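 When at most the first four coefficients of each row and column are non-zero, the inputs x1, x2, x5 and x6 are known to be zero, so each first-stage butterfly collapses to a single multiply: x8 = W7 * (x4 + x5) with x5 == 0 reduces to x5 = (W7 * x4 + 4) >> 3 together with x4 = (W1 * x4 + 4) >> 3, and the W3 and W5 terms simplify the same way.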
*/ -static void idctcol10(int *blk) { - int x0, x1, x2, x3, x4, x5, x6, x7, x8; - - /* shortcut */ - if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) | - (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | - (x7 = blk[8 * 3]))) { - blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] - = blk[8 * 4] = blk[8 * 5] = blk[8 * 6] - = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6); - return; - } - - x0 = (blk[8 * 0] << 8) + 16384; - - /* first stage */ - x5 = (W7 * x4 + 4) >> 3; - x4 = (W1 * x4 + 4) >> 3; - x6 = (W3 * x7 + 4) >> 3; - x7 = (-W5 * x7 + 4) >> 3; - - /* second stage */ - x2 = (W6 * x3 + 4) >> 3; - x3 = (W2 * x3 + 4) >> 3; - x1 = x4 + x6; - x4 -= x6; - x6 = x5 + x7; - x5 -= x7; - - /* third stage */ - x7 = x0 + x3; - x8 = x0 - x3; - x3 = x0 + x2; - x0 -= x2; - x2 = (181 * (x4 + x5) + 128) >> 8; - x4 = (181 * (x4 - x5) + 128) >> 8; - - /* fourth stage */ - blk[8 * 0] = (x7 + x1) >> 14; - blk[8 * 1] = (x3 + x2) >> 14; - blk[8 * 2] = (x0 + x4) >> 14; - blk[8 * 3] = (x8 + x6) >> 14; - blk[8 * 4] = (x8 - x6) >> 14; - blk[8 * 5] = (x0 - x4) >> 14; - blk[8 * 6] = (x3 - x2) >> 14; - blk[8 * 7] = (x7 - x1) >> 14; -} - -void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) { - int X[TX_DIM * TX_DIM]; - int i, j; - int shortpitch = pitch >> 1; - - for (i = 0; i < TX_DIM; i++) { - for (j = 0; j < TX_DIM; j++) { - X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 - + (coefs[i * TX_DIM + j] < 0)) >> 2; - } - } - - /* Do first 4 row idct only since non-zero dct coefficients are all in - * upper-left 4x4 area. */ - for (i = 0; i < 4; i++) - idctrow10(X + 8 * i); - - for (i = 0; i < 8; i++) - idctcol10(X + i); - - for (i = 0; i < TX_DIM; i++) { - for (j = 0; j < TX_DIM; j++) { - block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1; - } - } -} - -void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) { - int i; - short *ip = input; // 0,1, 4, 8 - short *op = output; - for (i = 0; i < 16; i++) { - op[i] = 0; - } - - op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1; - op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1; - op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1; - op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1; -} - - -#if 0 -// Keep a really bad float version as reference for now. 
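/* Editor's sketch, not part of the original change: the float reference
 * below evaluates the full 2-D cosine basis directly, costing O(N^4)
 * multiplies per 16x16 block. The 2-D IDCT is separable, so the same
 * result can be computed with 1-D passes over rows and then columns in
 * O(N^3). The helper name and its scale parameter are hypothetical; PI
 * and the sqrt(2) weighting follow the reference below. Use scale = 1.0
 * for the row pass and scale = 1.0 / 32 for the column pass.
 */
static void reference_idct16_1d(const double *in, double *out, double scale) {
  int k, n;
  for (k = 0; k < 16; k++) {
    double s = 0;
    for (n = 0; n < 16; n++) {
      double x = in[n] * cos(PI * n * (k + 0.5) / 16.0);
      if (n != 0)
        x *= sqrt(2.0);  /* c[0] = 1, c[1..15] = sqrt(2), as in the 2-D form */
      s += x;
    }
    out[k] = s * scale;  /* the 1/32 normalization is applied once, via scale */
  }
}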
-void vp9_short_idct16x16_c(short *input, short *output, int pitch) { - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double x; - const int short_pitch = pitch >> 1; - int i, j, k, l; - for (l = 0; l < 16; ++l) { - for (k = 0; k < 16; ++k) { - double s = 0; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) { - x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32; - if (i != 0) - x *= sqrt(2.0); - if (j != 0) - x *= sqrt(2.0); - s += x; - } - } - output[k*short_pitch+l] = (short)round(s); - } - } - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} -#endif - -#define TEST_INT_16x16_IDCT 1 -#if !TEST_INT_16x16_IDCT -static const double C1 = 0.995184726672197; -static const double C2 = 0.98078528040323; -static const double C3 = 0.956940335732209; -static const double C4 = 0.923879532511287; -static const double C5 = 0.881921264348355; -static const double C6 = 0.831469612302545; -static const double C7 = 0.773010453362737; -static const double C8 = 0.707106781186548; -static const double C9 = 0.634393284163646; -static const double C10 = 0.555570233019602; -static const double C11 = 0.471396736825998; -static const double C12 = 0.38268343236509; -static const double C13 = 0.290284677254462; -static const double C14 = 0.195090322016128; -static const double C15 = 0.098017140329561; - - -static void butterfly_16x16_idct_1d(double input[16], double output[16]) { - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double step[16]; - double intermediate[16]; - double temp1, temp2; - - - // step 1 and 2 - step[ 0] = input[0] + input[8]; - step[ 1] = input[0] - input[8]; - - temp1 = input[4]*C12; - temp2 = input[12]*C4; - - temp1 -= temp2; - temp1 *= C8; - - step[ 2] = 2*(temp1); - - temp1 = input[4]*C4; - temp2 = input[12]*C12; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - step[ 3] = 2*(temp1); - - temp1 = input[2]*C8; - temp1 = 2*(temp1); - temp2 = input[6] + input[10]; - - step[ 4] = temp1 + temp2; - step[ 5] = temp1 - temp2; - - temp1 = input[14]*C8; - temp1 = 2*(temp1); - temp2 = input[6] - input[10]; - - step[ 6] = temp2 - temp1; - step[ 7] = temp2 + temp1; - - // for odd input - temp1 = input[3]*C12; - temp2 = input[13]*C4; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - intermediate[ 8] = 2*(temp1); - - temp1 = input[3]*C4; - temp2 = input[13]*C12; - temp2 -= temp1; - temp2 = (temp2); - temp2 *= C8; - intermediate[ 9] = 2*(temp2); - - intermediate[10] = 2*(input[9]*C8); - intermediate[11] = input[15] - input[1]; - intermediate[12] = input[15] + input[1]; - intermediate[13] = 2*((input[7]*C8)); - - temp1 = input[11]*C12; - temp2 = input[5]*C4; - temp2 -= temp1; - temp2 = (temp2); - temp2 *= C8; - intermediate[14] = 2*(temp2); - - temp1 = input[11]*C4; - temp2 = input[5]*C12; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - intermediate[15] = 2*(temp1); - - step[ 8] = intermediate[ 8] + intermediate[14]; - step[ 9] = intermediate[ 9] + intermediate[15]; - step[10] = intermediate[10] + intermediate[11]; - step[11] = intermediate[10] - intermediate[11]; - step[12] = intermediate[12] + intermediate[13]; - step[13] = intermediate[12] - intermediate[13]; - step[14] = intermediate[ 8] - intermediate[14]; - step[15] = intermediate[ 9] - intermediate[15]; - - // step 3 - output[0] = step[ 0] + step[ 3]; - output[1] = step[ 1] + step[ 2]; - output[2] = step[ 1] - step[ 2]; - output[3] = step[ 0] - step[ 3]; - - temp1 = step[ 4]*C14; - temp2 = step[ 7]*C2; - temp1 -= temp2; - output[4] = (temp1); - - temp1 = 
step[ 4]*C2; - temp2 = step[ 7]*C14; - temp1 += temp2; - output[7] = (temp1); - - temp1 = step[ 5]*C10; - temp2 = step[ 6]*C6; - temp1 -= temp2; - output[5] = (temp1); - - temp1 = step[ 5]*C6; - temp2 = step[ 6]*C10; - temp1 += temp2; - output[6] = (temp1); - - output[8] = step[ 8] + step[11]; - output[9] = step[ 9] + step[10]; - output[10] = step[ 9] - step[10]; - output[11] = step[ 8] - step[11]; - output[12] = step[12] + step[15]; - output[13] = step[13] + step[14]; - output[14] = step[13] - step[14]; - output[15] = step[12] - step[15]; - - // output 4 - step[ 0] = output[0] + output[7]; - step[ 1] = output[1] + output[6]; - step[ 2] = output[2] + output[5]; - step[ 3] = output[3] + output[4]; - step[ 4] = output[3] - output[4]; - step[ 5] = output[2] - output[5]; - step[ 6] = output[1] - output[6]; - step[ 7] = output[0] - output[7]; - - temp1 = output[8]*C7; - temp2 = output[15]*C9; - temp1 -= temp2; - step[ 8] = (temp1); - - temp1 = output[9]*C11; - temp2 = output[14]*C5; - temp1 += temp2; - step[ 9] = (temp1); - - temp1 = output[10]*C3; - temp2 = output[13]*C13; - temp1 -= temp2; - step[10] = (temp1); - - temp1 = output[11]*C15; - temp2 = output[12]*C1; - temp1 += temp2; - step[11] = (temp1); - - temp1 = output[11]*C1; - temp2 = output[12]*C15; - temp2 -= temp1; - step[12] = (temp2); - - temp1 = output[10]*C13; - temp2 = output[13]*C3; - temp1 += temp2; - step[13] = (temp1); - - temp1 = output[9]*C5; - temp2 = output[14]*C11; - temp2 -= temp1; - step[14] = (temp2); - - temp1 = output[8]*C9; - temp2 = output[15]*C7; - temp1 += temp2; - step[15] = (temp1); - - // step 5 - output[0] = (step[0] + step[15]); - output[1] = (step[1] + step[14]); - output[2] = (step[2] + step[13]); - output[3] = (step[3] + step[12]); - output[4] = (step[4] + step[11]); - output[5] = (step[5] + step[10]); - output[6] = (step[6] + step[ 9]); - output[7] = (step[7] + step[ 8]); - - output[15] = (step[0] - step[15]); - output[14] = (step[1] - step[14]); - output[13] = (step[2] - step[13]); - output[12] = (step[3] - step[12]); - output[11] = (step[4] - step[11]); - output[10] = (step[5] - step[10]); - output[9] = (step[6] - step[ 9]); - output[8] = (step[7] - step[ 8]); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -// Remove once an int version of iDCT is written -#if 0 -void reference_16x16_idct_1d(double input[16], double output[16]) { - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - const double kPi = 3.141592653589793238462643383279502884; - const double kSqrt2 = 1.414213562373095048801688724209698; - for (int k = 0; k < 16; k++) { - output[k] = 0.0; - for (int n = 0; n < 16; n++) { - output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0); - if (n == 0) - output[k] = output[k]/kSqrt2; - } - } - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} -#endif - -void vp9_short_idct16x16_c(short *input, short *output, int pitch) { - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double out[16*16], out2[16*16]; - const int short_pitch = pitch >> 1; - int i, j; - // First transform rows - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = input[j + i*short_pitch]; - butterfly_16x16_idct_1d(temp_in, temp_out); - for (j = 0; j < 16; ++j) - out[j + i*16] = temp_out[j]; - } - // Then transform columns - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - butterfly_16x16_idct_1d(temp_in, temp_out); - for (j = 0; j < 16; 
++j) - out2[j*16 + i] = temp_out[j]; - } - for (i = 0; i < 16*16; ++i) - output[i] = round(out2[i]/128); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -#else -static const int16_t C1 = 16305; -static const int16_t C2 = 16069; -static const int16_t C3 = 15679; -static const int16_t C4 = 15137; -static const int16_t C5 = 14449; -static const int16_t C6 = 13623; -static const int16_t C7 = 12665; -static const int16_t C8 = 11585; -static const int16_t C9 = 10394; -static const int16_t C10 = 9102; -static const int16_t C11 = 7723; -static const int16_t C12 = 6270; -static const int16_t C13 = 4756; -static const int16_t C14 = 3196; -static const int16_t C15 = 1606; - -#define INITIAL_SHIFT 2 -#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1)) -#define RIGHT_SHIFT 14 -#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1)) - -static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16], - int last_shift_bits) { - int16_t step[16]; - int intermediate[16]; - int temp1, temp2; - - int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT; - int step1_rounding = 1 << (step1_shift - 1); - int last_rounding = 0; - - if (last_shift_bits > 0) - last_rounding = 1 << (last_shift_bits - 1); - - // step 1 and 2 - step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - temp1 = input[4] * C12; - temp2 = input[12] * C4; - temp1 = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift; - - temp1 = input[4] * C4; - temp2 = input[12] * C12; - temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift; - - temp1 = input[2] * C8; - temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp2 = input[6] + input[10]; - step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - temp1 = input[14] * C8; - temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp2 = input[6] - input[10]; - step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - // for odd input - temp1 = input[3] * C12; - temp2 = input[13] * C4; - temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = input[3] * C4; - temp2 = input[13] * C12; - temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp2 *= C8; - intermediate[ 9] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - intermediate[11] = input[15] - input[1]; - intermediate[12] = input[15] + input[1]; - intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = input[11] * C12; - temp2 = input[5] * C4; - temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp2 *= C8; - intermediate[14] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = input[11] * C4; - temp2 = input[5] * C12; - temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - intermediate[15] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING) - >> 
INITIAL_SHIFT; - step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING) - >> INITIAL_SHIFT; - - // step 3 - output[0] = step[ 0] + step[ 3]; - output[1] = step[ 1] + step[ 2]; - output[2] = step[ 1] - step[ 2]; - output[3] = step[ 0] - step[ 3]; - - temp1 = step[ 4] * C14; - temp2 = step[ 7] * C2; - output[4] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 4] * C2; - temp2 = step[ 7] * C14; - output[7] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 5] * C10; - temp2 = step[ 6] * C6; - output[5] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 5] * C6; - temp2 = step[ 6] * C10; - output[6] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - output[8] = step[ 8] + step[11]; - output[9] = step[ 9] + step[10]; - output[10] = step[ 9] - step[10]; - output[11] = step[ 8] - step[11]; - output[12] = step[12] + step[15]; - output[13] = step[13] + step[14]; - output[14] = step[13] - step[14]; - output[15] = step[12] - step[15]; - - // output 4 - step[ 0] = output[0] + output[7]; - step[ 1] = output[1] + output[6]; - step[ 2] = output[2] + output[5]; - step[ 3] = output[3] + output[4]; - step[ 4] = output[3] - output[4]; - step[ 5] = output[2] - output[5]; - step[ 6] = output[1] - output[6]; - step[ 7] = output[0] - output[7]; - - temp1 = output[8] * C7; - temp2 = output[15] * C9; - step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[9] * C11; - temp2 = output[14] * C5; - step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[10] * C3; - temp2 = output[13] * C13; - step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[11] * C15; - temp2 = output[12] * C1; - step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[11] * C1; - temp2 = output[12] * C15; - step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[10] * C13; - temp2 = output[13] * C3; - step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[9] * C5; - temp2 = output[14] * C11; - step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[8] * C9; - temp2 = output[15] * C7; - step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - // step 5 - output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits; - output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits; - output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits; - output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits; - output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits; - output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits; - output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits; - output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits; - - output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits; - output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits; - output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits; - output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits; - output[11] = (step[4] - step[11] + last_rounding) >> 
last_shift_bits; - output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits; - output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits; - output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits; -} - -void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { - int16_t out[16 * 16]; - int16_t *outptr = &out[0]; - const int short_pitch = pitch >> 1; - int i, j; - int16_t temp_in[16], temp_out[16]; - - // First transform rows - for (i = 0; i < 16; ++i) { - butterfly_16x16_idct_1d(input, outptr, 0); - input += short_pitch; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j * 16 + i]; - butterfly_16x16_idct_1d(temp_in, temp_out, 3); - for (j = 0; j < 16; ++j) - output[j * 16 + i] = temp_out[j]; - } -} - -/* The following function is called when we know the maximum number of non-zero - * dct coefficients is less or equal 10. - */ -static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16], - int last_shift_bits) { - int16_t step[16] = {0}; - int intermediate[16] = {0}; - int temp1, temp2; - int last_rounding = 0; - - if (last_shift_bits > 0) - last_rounding = 1 << (last_shift_bits - 1); - - // step 1 and 2 - step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - // for odd input - temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - temp1 *= C8; - intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT; - - // step 3 - output[0] = step[ 0]; - output[1] = step[ 1]; - output[2] = step[ 1]; - output[3] = step[ 0]; - - temp1 = step[ 4] * C14; - output[4] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 4] * C2; - output[7] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 5] * C10; - output[5] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = step[ 5] * C6; - output[6] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - output[8] = step[ 8] + step[11]; - output[9] = step[ 9] + step[10]; - output[10] = step[ 9] - step[10]; - output[11] = step[ 8] - step[11]; - output[12] = step[12] + step[15]; - output[13] = step[13] + step[14]; - output[14] = step[13] - step[14]; - output[15] = step[12] - step[15]; - - // output 4 - step[ 0] = output[0] + output[7]; - step[ 1] = output[1] + output[6]; - step[ 2] = output[2] + output[5]; - step[ 3] = output[3] + output[4]; - step[ 4] = output[3] - output[4]; - step[ 5] = output[2] - output[5]; - step[ 6] = output[1] - output[6]; - step[ 7] = output[0] - output[7]; - - temp1 = output[8] * C7; - temp2 = output[15] * C9; - step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - 
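/* The rotations above and below pair terms with the Q14 cosine table
 * C1..C15, where Ck is cos(k * pi / 32) scaled by 2^14 (e.g. C8 = 11585
 * is sqrt(1/2) in Q14); adding RIGHT_ROUNDING before each >> RIGHT_SHIFT
 * makes the products round to nearest rather than truncate.
 */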
temp1 = output[9] * C11; - temp2 = output[14] * C5; - step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[10] * C3; - temp2 = output[13] * C13; - step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[11] * C15; - temp2 = output[12] * C1; - step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[11] * C1; - temp2 = output[12] * C15; - step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[10] * C13; - temp2 = output[13] * C3; - step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[9] * C5; - temp2 = output[14] * C11; - step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - temp1 = output[8] * C9; - temp2 = output[15] * C7; - step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; - - // step 5 - output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits; - output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits; - output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits; - output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits; - output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits; - output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits; - output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits; - output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits; - - output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits; - output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits; - output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits; - output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits; - output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits; - output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits; - output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits; - output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits; -} - -void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) { - int16_t out[16 * 16]; - int16_t *outptr = &out[0]; - const int short_pitch = pitch >> 1; - int i, j; - int16_t temp_in[16], temp_out[16]; - - /* First transform rows. Since all non-zero dct coefficients are in - * upper-left 4x4 area, we only need to calculate first 4 rows here. - */ - vpx_memset(out, 0, sizeof(out)); - for (i = 0; i < 4; ++i) { - butterfly_16x16_idct10_1d(input, outptr, 0); - input += short_pitch; - outptr += 16; - } - - // Then transform columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - butterfly_16x16_idct10_1d(temp_in, temp_out, 3); - for (j = 0; j < 16; ++j) - output[j*16 + i] = temp_out[j]; - } -} -#undef INITIAL_SHIFT -#undef INITIAL_ROUNDING -#undef RIGHT_SHIFT -#undef RIGHT_ROUNDING -#endif diff --git a/vp9/common/implicit_segmentation.c b/vp9/common/implicit_segmentation.c deleted file mode 100644 index 9194e2a30..000000000 --- a/vp9/common/implicit_segmentation.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
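 Below, implicit segmentation performs a first pass of connected-component labelling over the macroblock grid, keyed on mode, motion vector, reference frame or skip flag; linked equivalence lists are merged whenever two labels meet, after which surviving regions are relabelled and per-region bounding boxes and centroid sums are gathered.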
- */ - -#include "vp9/common/onyxc_int.h" - -#define MAX_REGIONS 24000 -#ifndef NULL -#define NULL 0 -#endif - -#define min_mbs_in_region 3 - -// this linked list structure holds equivalences for connected -// component labeling -struct list_el { - int label; - int seg_value; - int count; - struct list_el *next; -}; -typedef struct list_el item; - -// connected colorsegments -typedef struct { - int min_x; - int min_y; - int max_x; - int max_y; - long long sum_x; - long long sum_y; - int pixels; - int seg_value; - int label; -} segment_info; - - -typedef enum { - SEGMENT_MODE, - SEGMENT_MV, - SEGMENT_REFFRAME, - SEGMENT_SKIPPED -} SEGMENT_TYPE; - - -// this merges the two equivalence lists and -// then makes sure that every label points to the same -// equivalence list -void merge(item *labels, int u, int v) { - item *a = labels[u].next; - item *b = labels[v].next; - item c; - item *it = &c; - int count; - - // check if they are already merged - if (u == v || a == b) - return; - - count = a->count + b->count; - - // merge 2 sorted linked lists. - while (a != NULL && b != NULL) { - if (a->label < b->label) { - it->next = a; - a = a->next; - } else { - it->next = b; - b = b->next; - } - - it = it->next; - } - - if (a == NULL) - it->next = b; - else - it->next = a; - - it = c.next; - - // make sure every equivalence in the linked list points to this new ll - while (it != NULL) { - labels[it->label].next = c.next; - it = it->next; - } - c.next->count = count; - -} - -void segment_via_mode_info(VP9_COMMON *oci, int how) { - MODE_INFO *mi = oci->mi; - int i, j; - int mb_index = 0; - - int label = 1; - int pitch = oci->mb_cols; - - // holds linked list equivalences - // the max should probably be allocated at a higher level in oci - item equivalences[MAX_REGIONS]; - int eq_ptr = 0; - item labels[MAX_REGIONS]; - segment_info segments[MAX_REGIONS]; - int label_count = 1; - int labeling[400 * 300]; - int *lp = labeling; - - label_count = 1; - memset(labels, 0, sizeof(labels)); - memset(segments, 0, sizeof(segments)); - - /* Go through each macroblock first pass labelling */ - for (i = 0; i < oci->mb_rows; i++, lp += pitch) { - for (j = 0; j < oci->mb_cols; j++) { - // int above seg_value, left seg_value, this seg_value... - int a = -1, l = -1, n = -1; - - // above label, left label - int al = -1, ll = -1; - if (i) { - al = lp[j - pitch]; - a = labels[al].next->seg_value; - } - if (j) { - ll = lp[j - 1]; - l = labels[ll].next->seg_value; - } - - // what setting are we going to do the implicit segmentation on - switch (how) { - case SEGMENT_MODE: - n = mi[mb_index].mbmi.mode; - break; - case SEGMENT_MV: - n = mi[mb_index].mbmi.mv[0].as_int; - if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME) - n = -9999999; - break; - case SEGMENT_REFFRAME: - n = mi[mb_index].mbmi.ref_frame; - break; - case SEGMENT_SKIPPED: - n = mi[mb_index].mbmi.mb_skip_coeff; - break; - } - - // above and left both have the same seg_value - if (n == a && n == l) { - // pick the lowest label - lp[j] = (al < ll ? 
al : ll); - labels[lp[j]].next->count++; - - // merge the above and left equivalencies - merge(labels, al, ll); - } - // this matches above seg_value - else if (n == a) { - // give it the same label as above - lp[j] = al; - labels[al].next->count++; - } - // this matches left seg_value - else if (n == l) { - // give it the same label as above - lp[j] = ll; - labels[ll].next->count++; - } else { - // new label doesn't match either - item *e = &labels[label]; - item *nl = &equivalences[eq_ptr++]; - lp[j] = label; - nl->label = label; - nl->next = 0; - nl->seg_value = n; - nl->count = 1; - e->next = nl; - label++; - } - mb_index++; - } - mb_index++; - } - lp = labeling; - - // give new labels to regions - for (i = 1; i < label; i++) - if (labels[i].next->count > min_mbs_in_region && labels[labels[i].next->label].label == 0) { - segment_info *cs = &segments[label_count]; - cs->label = label_count; - labels[labels[i].next->label].label = label_count++; - labels[labels[i].next->label].seg_value = labels[i].next->seg_value; - cs->seg_value = labels[labels[i].next->label].seg_value; - cs->min_x = oci->mb_cols; - cs->min_y = oci->mb_rows; - cs->max_x = 0; - cs->max_y = 0; - cs->sum_x = 0; - cs->sum_y = 0; - cs->pixels = 0; - - } - lp = labeling; - - // this is just to gather stats... - for (i = 0; i < oci->mb_rows; i++, lp += pitch) { - for (j = 0; j < oci->mb_cols; j++) { - segment_info *cs; - int oldlab = labels[lp[j]].next->label; - int lab = labels[oldlab].label; - lp[j] = lab; - - cs = &segments[lab]; - - cs->min_x = (j < cs->min_x ? j : cs->min_x); - cs->max_x = (j > cs->max_x ? j : cs->max_x); - cs->min_y = (i < cs->min_y ? i : cs->min_y); - cs->max_y = (i > cs->max_y ? i : cs->max_y); - cs->sum_x += j; - cs->sum_y += i; - cs->pixels++; - - lp[j] = lab; - mb_index++; - } - mb_index++; - } - - { - lp = labeling; - printf("labelling \n"); - mb_index = 0; - for (i = 0; i < oci->mb_rows; i++, lp += pitch) { - for (j = 0; j < oci->mb_cols; j++) { - printf("%4d", lp[j]); - } - printf(" "); - for (j = 0; j < oci->mb_cols; j++, mb_index++) { - // printf("%3d",mi[mb_index].mbmi.mode ); - printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row, - mi[mb_index].mbmi.mv[0].as_mv.col); - } - printf("\n"); - ++mb_index; - } - printf("\n"); - } -} - diff --git a/vp9/common/invtrans.c b/vp9/common/invtrans.c deleted file mode 100644 index ac5553e6e..000000000 --- a/vp9/common/invtrans.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
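 The wrappers below also undo the second-order transform: block 24 carries the 2nd-order DC coefficients, which are inverse Walsh-Hadamard transformed (or inverse 2x2 Haar in the 8x8 case) and scattered back into dqcoeff[0] of the luma blocks by the recon_dcblock helpers before the per-block inverse transforms run.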
- */ - -#include "invtrans.h" -#include "./vp9_rtcd.h" - -static void recon_dcblock(MACROBLOCKD *xd) { - BLOCKD *b = &xd->block[24]; - int i; - - for (i = 0; i < 16; i++) { - xd->block[i].dqcoeff[0] = b->diff[i]; - } -} - -static void recon_dcblock_8x8(MACROBLOCKD *xd) { - BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10 - - xd->block[0].dqcoeff[0] = b->diff[0]; - xd->block[4].dqcoeff[0] = b->diff[1]; - xd->block[8].dqcoeff[0] = b->diff[4]; - xd->block[12].dqcoeff[0] = b->diff[8]; -} - -void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch) { - BLOCKD *b = &xd->block[block]; - if (b->eob <= 1) - xd->inv_xform4x4_1_x8(b->dqcoeff, b->diff, pitch); - else - xd->inv_xform4x4_x8(b->dqcoeff, b->diff, pitch); -} - -void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) { - int i; - BLOCKD *blockd = xd->block; - - if (xd->mode_info_context->mbmi.mode != SPLITMV) { - /* do 2nd order transform on the dc block */ - vp9_short_inv_walsh4x4(blockd[24].dqcoeff, blockd[24].diff); - recon_dcblock(xd); - } - - for (i = 0; i < 16; i++) { - vp9_inverse_transform_b_4x4(xd, i, 32); - } -} - -void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd) { - int i; - for (i = 16; i < 24; i++) { - vp9_inverse_transform_b_4x4(xd, i, 16); - } -} - -void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd) { - vp9_inverse_transform_mby_4x4(xd); - vp9_inverse_transform_mbuv_4x4(xd); -} - -void vp9_inverse_transform_b_8x8(short *input_dqcoeff, short *output_coeff, - int pitch) { - vp9_short_idct8x8(input_dqcoeff, output_coeff, pitch); -} - -void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { - int i; - BLOCKD *blockd = xd->block; - - if (xd->mode_info_context->mbmi.mode != SPLITMV) { - // do 2nd order transform on the dc block - vp9_short_ihaar2x2(blockd[24].dqcoeff, blockd[24].diff, 8); - recon_dcblock_8x8(xd); // need to change for 8x8 - } - - for (i = 0; i < 9; i += 8) { - vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0], - &blockd[i].diff[0], 32); - } - for (i = 2; i < 11; i += 8) { - vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0], - &blockd[i].diff[0], 32); - } -} - -void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd) { - int i; - BLOCKD *blockd = xd->block; - - for (i = 16; i < 24; i += 4) { - vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0], - &blockd[i].diff[0], 16); - } -} - -void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd) { - vp9_inverse_transform_mby_8x8(xd); - vp9_inverse_transform_mbuv_8x8(xd); -} - -void vp9_inverse_transform_b_16x16(short *input_dqcoeff, - short *output_coeff, int pitch) { - vp9_short_idct16x16(input_dqcoeff, output_coeff, pitch); -} - -void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) { - vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0], - &xd->block[0].diff[0], 32); -} - -void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) { - vp9_inverse_transform_mby_16x16(xd); - vp9_inverse_transform_mbuv_8x8(xd); -} diff --git a/vp9/common/invtrans.h b/vp9/common/invtrans.h deleted file mode 100644 index 58dc4d704..000000000 --- a/vp9/common/invtrans.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef __INC_INVTRANS_H -#define __INC_INVTRANS_H - -#include "vpx_ports/config.h" -#include "blockd.h" - -extern void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch); - -extern void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd); - -extern void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd); - -extern void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd); - -extern void vp9_inverse_transform_b_8x8(short *input_dqcoeff, - short *output_coeff, int pitch); - -extern void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd); - -extern void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd); - -extern void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd); - -extern void vp9_inverse_transform_b_16x16(short *input_dqcoeff, - short *output_coeff, int pitch); - -extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd); - -extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd); - -#endif // __INC_INVTRANS_H diff --git a/vp9/common/loopfilter.c b/vp9/common/loopfilter.c deleted file mode 100644 index 0d295c66a..000000000 --- a/vp9/common/loopfilter.c +++ /dev/null @@ -1,524 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "loopfilter.h" -#include "onyxc_int.h" -#include "vpx_mem/vpx_mem.h" - -#include "vp9/common/seg_common.h" - -static void lf_init_lut(loop_filter_info_n *lfi) { - int filt_lvl; - - for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) { - if (filt_lvl >= 40) { - lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2; - lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3; - } else if (filt_lvl >= 20) { - lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; - lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2; - } else if (filt_lvl >= 15) { - lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1; - lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1; - } else { - lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0; - lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0; - } - } - - lfi->mode_lf_lut[DC_PRED] = 1; - lfi->mode_lf_lut[D45_PRED] = 1; - lfi->mode_lf_lut[D135_PRED] = 1; - lfi->mode_lf_lut[D117_PRED] = 1; - lfi->mode_lf_lut[D153_PRED] = 1; - lfi->mode_lf_lut[D27_PRED] = 1; - lfi->mode_lf_lut[D63_PRED] = 1; - lfi->mode_lf_lut[V_PRED] = 1; - lfi->mode_lf_lut[H_PRED] = 1; - lfi->mode_lf_lut[TM_PRED] = 1; - lfi->mode_lf_lut[B_PRED] = 0; - lfi->mode_lf_lut[I8X8_PRED] = 0; - lfi->mode_lf_lut[ZEROMV] = 1; - lfi->mode_lf_lut[NEARESTMV] = 2; - lfi->mode_lf_lut[NEARMV] = 2; - lfi->mode_lf_lut[NEWMV] = 2; - lfi->mode_lf_lut[SPLITMV] = 3; -} - -void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi, - int sharpness_lvl) { - int i; - - /* For each possible value for the loop filter fill out limits */ - for (i = 0; i <= MAX_LOOP_FILTER; i++) { - int filt_lvl = i; - int block_inside_limit = 0; - - /* Set loop filter paramaeters that control sharpness. 
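 block_inside_limit starts from the filter level, is halved once when sharpness_lvl is non-zero and halved again when it exceeds 4, then capped at 9 - sharpness_lvl and floored at 1; the block and macroblock edge limits become 2 * filt_lvl + block_inside_limit and 2 * (filt_lvl + 2) + block_inside_limit, each replicated across SIMD_WIDTH bytes.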
*/ - block_inside_limit = filt_lvl >> (sharpness_lvl > 0); - block_inside_limit = block_inside_limit >> (sharpness_lvl > 4); - - if (sharpness_lvl > 0) { - if (block_inside_limit > (9 - sharpness_lvl)) - block_inside_limit = (9 - sharpness_lvl); - } - - if (block_inside_limit < 1) - block_inside_limit = 1; - - vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); - vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), - SIMD_WIDTH); - vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), - SIMD_WIDTH); - } -} - -void vp9_loop_filter_init(VP9_COMMON *cm) { - loop_filter_info_n *lfi = &cm->lf_info; - int i; - - /* init limits for given sharpness*/ - vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level); - cm->last_sharpness_level = cm->sharpness_level; - - /* init LUT for lvl and hev thr picking */ - lf_init_lut(lfi); - - /* init hev threshold const vectors */ - for (i = 0; i < 4; i++) { - vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); - } -} - -void vp9_loop_filter_frame_init(VP9_COMMON *cm, - MACROBLOCKD *xd, - int default_filt_lvl) { - int seg, /* segment number */ - ref, /* index in ref_lf_deltas */ - mode; /* index in mode_lf_deltas */ - - loop_filter_info_n *lfi = &cm->lf_info; - - /* update limits if sharpness has changed */ - if (cm->last_sharpness_level != cm->sharpness_level) { - vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level); - cm->last_sharpness_level = cm->sharpness_level; - } - - for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) { - int lvl_seg = default_filt_lvl; - int lvl_ref, lvl_mode; - - - // Set the baseline filter values for each segment - if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) { - /* Abs value */ - if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { - lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); - } else { /* Delta Value */ - lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); - lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0; - } - } - - if (!xd->mode_ref_lf_delta_enabled) { - /* we could get rid of this if we assume that deltas are set to - * zero when not in use; encoder always uses deltas - */ - vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4); - continue; - } - - lvl_ref = lvl_seg; - - /* INTRA_FRAME */ - ref = INTRA_FRAME; - - /* Apply delta for reference frame */ - lvl_ref += xd->ref_lf_deltas[ref]; - - /* Apply delta for Intra modes */ - mode = 0; /* B_PRED */ - /* Only the split mode BPRED has a further special case */ - lvl_mode = lvl_ref + xd->mode_lf_deltas[mode]; - lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ - - lfi->lvl[seg][ref][mode] = lvl_mode; - - mode = 1; /* all the rest of Intra modes */ - lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */ - lfi->lvl[seg][ref][mode] = lvl_mode; - - /* LAST, GOLDEN, ALT */ - for (ref = 1; ref < MAX_REF_FRAMES; ref++) { - int lvl_ref = lvl_seg; - - /* Apply delta for reference frame */ - lvl_ref += xd->ref_lf_deltas[ref]; - - /* Apply delta for Inter modes */ - for (mode = 1; mode < 4; mode++) { - lvl_mode = lvl_ref + xd->mode_lf_deltas[mode]; - lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 
63 : lvl_mode) : 0; /* clamp */ - - lfi->lvl[seg][ref][mode] = lvl_mode; - } - } - } -} - -void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) { - YV12_BUFFER_CONFIG *post = cm->frame_to_show; - loop_filter_info_n *lfi_n = &cm->lf_info; - struct loop_filter_info lfi; - - FRAME_TYPE frame_type = cm->frame_type; - - int mb_row; - int mb_col; - - int filter_level; - - unsigned char *y_ptr, *u_ptr, *v_ptr; - - /* Point at base of Mb MODE_INFO list */ - const MODE_INFO *mode_info_context = cm->mi; - - /* Initialize the loop filter for this frame. */ - vp9_loop_filter_frame_init(cm, xd, cm->filter_level); - - /* Set up the buffer pointers */ - y_ptr = post->y_buffer; - u_ptr = post->u_buffer; - v_ptr = post->v_buffer; - - /* vp9_filter each macro block */ - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - int skip_lf = (mode_info_context->mbmi.mode != B_PRED && - mode_info_context->mbmi.mode != I8X8_PRED && - mode_info_context->mbmi.mode != SPLITMV && - mode_info_context->mbmi.mb_skip_coeff); - - const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; - const int seg = mode_info_context->mbmi.segment_id; - const int ref_frame = mode_info_context->mbmi.ref_frame; - int tx_type = mode_info_context->mbmi.txfm_size; - filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; - - if (filter_level) { - if (cm->filter_type == NORMAL_LOOPFILTER) { - const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; - lfi.mblim = lfi_n->mblim[filter_level]; - lfi.blim = lfi_n->blim[filter_level]; - lfi.lim = lfi_n->lim[filter_level]; - lfi.hev_thr = lfi_n->hev_thr[hev_index]; - - if (mb_col > 0 -#if CONFIG_SUPERBLOCKS - && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb && - mode_info_context[0].mbmi.mb_skip_coeff && - mode_info_context[-1].mbmi.mb_skip_coeff) -#endif - ) - vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, - post->uv_stride, &lfi); - - if (!skip_lf && tx_type != TX_16X16) { - if (tx_type == TX_8X8) - vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride, - post->uv_stride, &lfi); - else - vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride, - post->uv_stride, &lfi); - - } - - /* don't apply across umv border */ - if (mb_row > 0 -#if CONFIG_SUPERBLOCKS - && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb && - mode_info_context[0].mbmi.mb_skip_coeff && - mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff) -#endif - ) - vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, - post->uv_stride, &lfi); - - if (!skip_lf && tx_type != TX_16X16) { - if (tx_type == TX_8X8) - vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride, - post->uv_stride, &lfi); - else - vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride, - post->uv_stride, &lfi); - } - } else { - // FIXME: Not 8x8 aware - if (mb_col > 0 -#if CONFIG_SUPERBLOCKS - && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb && - mode_info_context[0].mbmi.mb_skip_coeff && - mode_info_context[-1].mbmi.mb_skip_coeff) -#endif - ) - vp9_loop_filter_simple_mbv(y_ptr, post->y_stride, - lfi_n->mblim[filter_level]); - - if (!skip_lf) - vp9_loop_filter_simple_bv(y_ptr, post->y_stride, - lfi_n->blim[filter_level]); - - /* don't apply across umv border */ - if (mb_row > 0 -#if CONFIG_SUPERBLOCKS - && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb && - mode_info_context[0].mbmi.mb_skip_coeff && - mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff) -#endif - ) - vp9_loop_filter_simple_mbh(y_ptr, post->y_stride, 
- lfi_n->mblim[filter_level]); - - if (!skip_lf) - vp9_loop_filter_simple_bh(y_ptr, post->y_stride, - lfi_n->blim[filter_level]); - } - } - - y_ptr += 16; - u_ptr += 8; - v_ptr += 8; - - mode_info_context++; /* step to next MB */ - } - - y_ptr += post->y_stride * 16 - post->y_width; - u_ptr += post->uv_stride * 8 - post->uv_width; - v_ptr += post->uv_stride * 8 - post->uv_width; - - mode_info_context++; /* Skip border mb */ - } -} - -void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd, - int default_filt_lvl) { - YV12_BUFFER_CONFIG *post = cm->frame_to_show; - - unsigned char *y_ptr; - int mb_row; - int mb_col; - - loop_filter_info_n *lfi_n = &cm->lf_info; - struct loop_filter_info lfi; - - int filter_level; - FRAME_TYPE frame_type = cm->frame_type; - - /* Point at base of Mb MODE_INFO list */ - const MODE_INFO *mode_info_context = cm->mi; - -#if 0 - if (default_filt_lvl == 0) /* no filter applied */ - return; -#endif - - /* Initialize the loop filter for this frame. */ - vp9_loop_filter_frame_init(cm, xd, default_filt_lvl); - - /* Set up the buffer pointers */ - y_ptr = post->y_buffer; - - /* vp9_filter each macro block */ - for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { - for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { - int skip_lf = (mode_info_context->mbmi.mode != B_PRED && - mode_info_context->mbmi.mode != I8X8_PRED && - mode_info_context->mbmi.mode != SPLITMV && - mode_info_context->mbmi.mb_skip_coeff); - - const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; - const int seg = mode_info_context->mbmi.segment_id; - const int ref_frame = mode_info_context->mbmi.ref_frame; - int tx_type = mode_info_context->mbmi.txfm_size; - filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; - - if (filter_level) { - if (cm->filter_type == NORMAL_LOOPFILTER) { - const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; - lfi.mblim = lfi_n->mblim[filter_level]; - lfi.blim = lfi_n->blim[filter_level]; - lfi.lim = lfi_n->lim[filter_level]; - lfi.hev_thr = lfi_n->hev_thr[hev_index]; - - if (mb_col > 0) - vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi); - - if (!skip_lf && tx_type != TX_16X16) { - if (tx_type == TX_8X8) - vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi); - else - vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi); - } - - /* don't apply across umv border */ - if (mb_row > 0) - vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi); - - if (!skip_lf && tx_type != TX_16X16) { - if (tx_type == TX_8X8) - vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi); - else - vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi); - } - } else { - // FIXME: Not 8x8 aware - if (mb_col > 0) - vp9_loop_filter_simple_mbv(y_ptr, post->y_stride, - lfi_n->mblim[filter_level]); - - if (!skip_lf) - vp9_loop_filter_simple_bv(y_ptr, post->y_stride, - lfi_n->blim[filter_level]); - - /* don't apply across umv border */ - if (mb_row > 0) - vp9_loop_filter_simple_mbh(y_ptr, post->y_stride, - lfi_n->mblim[filter_level]); - - if (!skip_lf) - vp9_loop_filter_simple_bh(y_ptr, post->y_stride, - lfi_n->blim[filter_level]); - } - } - - y_ptr += 16; - mode_info_context++; /* step to next MB */ - } - - y_ptr += post->y_stride * 16 - post->y_width; - mode_info_context++; /* Skip border mb */ - } -} - -void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd, - int default_filt_lvl) { - YV12_BUFFER_CONFIG *post = cm->frame_to_show; - - unsigned char *y_ptr; - int mb_row; - int mb_col; - int mb_cols = post->y_width >> 4; - - int 
linestocopy, i; - - loop_filter_info_n *lfi_n = &cm->lf_info; - struct loop_filter_info lfi; - - int filter_level; - int alt_flt_enabled = xd->segmentation_enabled; - FRAME_TYPE frame_type = cm->frame_type; - - const MODE_INFO *mode_info_context; - - int lvl_seg[MAX_MB_SEGMENTS]; - - mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); - - /* 3 is a magic number. 4 is probably magic too */ - linestocopy = (post->y_height >> (4 + 3)); - - if (linestocopy < 1) - linestocopy = 1; - - linestocopy <<= 4; - - /* Note the baseline filter values for each segment */ - /* See vp9_loop_filter_frame_init. Rather than call that for each change - * to default_filt_lvl, copy the relevant calculation here. - */ - if (alt_flt_enabled) { - for (i = 0; i < MAX_MB_SEGMENTS; i++) { - /* Abs value */ - if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { - lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF); - } - /* Delta Value */ - else { - lvl_seg[i] = default_filt_lvl + - vp9_get_segdata(xd, i, SEG_LVL_ALT_LF); - lvl_seg[i] = (lvl_seg[i] > 0) ? - ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0; - } - } - } - - /* Set up the buffer pointers */ - y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride; - - /* vp9_filter each macro block */ - for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) { - for (mb_col = 0; mb_col < mb_cols; mb_col++) { - int skip_lf = (mode_info_context->mbmi.mode != B_PRED && - mode_info_context->mbmi.mode != I8X8_PRED && - mode_info_context->mbmi.mode != SPLITMV && - mode_info_context->mbmi.mb_skip_coeff); - - if (alt_flt_enabled) - filter_level = lvl_seg[mode_info_context->mbmi.segment_id]; - else - filter_level = default_filt_lvl; - - if (filter_level) { - if (cm->filter_type == NORMAL_LOOPFILTER) { - const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; - lfi.mblim = lfi_n->mblim[filter_level]; - lfi.blim = lfi_n->blim[filter_level]; - lfi.lim = lfi_n->lim[filter_level]; - lfi.hev_thr = lfi_n->hev_thr[hev_index]; - - if (mb_col > 0) - vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi); - - if (!skip_lf) - vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi); - - vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi); - - if (!skip_lf) - vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi); - } else { - if (mb_col > 0) - vp9_loop_filter_simple_mbv (y_ptr, post->y_stride, - lfi_n->mblim[filter_level]); - - if (!skip_lf) - vp9_loop_filter_simple_bv(y_ptr, post->y_stride, - lfi_n->blim[filter_level]); - - vp9_loop_filter_simple_mbh(y_ptr, post->y_stride, - lfi_n->mblim[filter_level]); - - if (!skip_lf) - vp9_loop_filter_simple_bh(y_ptr, post->y_stride, - lfi_n->blim[filter_level]); - } - } - - y_ptr += 16; - mode_info_context += 1; /* step to next MB */ - } - - y_ptr += post->y_stride * 16 - post->y_width; - mode_info_context += 1; /* Skip border mb */ - } -} diff --git a/vp9/common/loopfilter.h b/vp9/common/loopfilter.h deleted file mode 100644 index 0b7de682c..000000000 --- a/vp9/common/loopfilter.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */
-
-#ifndef loopfilter_h
-#define loopfilter_h
-
-#include "vpx_ports/mem.h"
-#include "vpx_config.h"
-#include "blockd.h"
-
-#define MAX_LOOP_FILTER 63
-
-typedef enum {
-  NORMAL_LOOPFILTER = 0,
-  SIMPLE_LOOPFILTER = 1
-} LOOPFILTERTYPE;
-
-#if ARCH_ARM
-#define SIMD_WIDTH 1
-#else
-#define SIMD_WIDTH 16
-#endif
-
-/* Need to align this structure so when it is declared and
- * passed it can be loaded into vector registers.
- */
-typedef struct {
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
-  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
-                  hev_thr[4][SIMD_WIDTH]);
-  unsigned char lvl[4][4][4];
-  unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
-  unsigned char mode_lf_lut[MB_MODE_COUNT];
-} loop_filter_info_n;
-
-struct loop_filter_info {
-  const unsigned char *mblim;
-  const unsigned char *blim;
-  const unsigned char *lim;
-  const unsigned char *hev_thr;
-};
-
-#define prototype_loopfilter(sym) \
-  void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
-           const unsigned char *limit, const unsigned char *thresh, int count)
-
-#define prototype_loopfilter_block(sym) \
-  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
-           int ystride, int uv_stride, struct loop_filter_info *lfi)
-
-#define prototype_simple_loopfilter(sym) \
-  void sym(unsigned char *y, int ystride, const unsigned char *blimit)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/loopfilter_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/loopfilter_arm.h"
-#endif
-
-typedef void loop_filter_uvfunction(unsigned char *u,   /* source pointer */
-                                    int p,              /* pitch */
-                                    const unsigned char *blimit,
-                                    const unsigned char *limit,
-                                    const unsigned char *thresh,
-                                    unsigned char *v);
-
-/* assorted loopfilter functions which get used elsewhere */
-struct VP9Common;
-struct macroblockd;
-
-void vp9_loop_filter_init(struct VP9Common *cm);
-
-void vp9_loop_filter_frame_init(struct VP9Common *cm,
-                                struct macroblockd *mbd,
-                                int default_filt_lvl);
-
-void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
-
-void vp9_loop_filter_partial_frame(struct VP9Common *cm,
-                                   struct macroblockd *mbd,
-                                   int default_filt_lvl);
-
-void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
-                                 struct macroblockd *mbd,
-                                 int default_filt_lvl);
-
-void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
-                                      int sharpness_lvl);
-
-#endif // loopfilter_h
diff --git a/vp9/common/loopfilter_filters.c b/vp9/common/loopfilter_filters.c
deleted file mode 100644
index c719b44c3..000000000
--- a/vp9/common/loopfilter_filters.c
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <stdlib.h>
-#include "vpx_config.h"
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef unsigned char uc;
-
-static __inline signed char signed_char_clamp(int t) {
-  t = (t < -128 ? -128 : t);
-  t = (t > 127 ?
127 : t); - return (signed char) t; -} - - -/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline signed char filter_mask(uc limit, uc blimit, - uc p3, uc p2, uc p1, uc p0, - uc q0, uc q1, uc q2, uc q3) { - signed char mask = 0; - mask |= (abs(p3 - p2) > limit) * -1; - mask |= (abs(p2 - p1) > limit) * -1; - mask |= (abs(p1 - p0) > limit) * -1; - mask |= (abs(q1 - q0) > limit) * -1; - mask |= (abs(q2 - q1) > limit) * -1; - mask |= (abs(q3 - q2) > limit) * -1; - mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = ~mask; - return mask; -} - -/* is there high variance internal edge ( 11111111 yes, 00000000 no) */ -static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) { - signed char hev = 0; - hev |= (abs(p1 - p0) > thresh) * -1; - hev |= (abs(q1 - q0) > thresh) * -1; - return hev; -} - -static __inline void filter(signed char mask, uc hev, uc *op1, - uc *op0, uc *oq0, uc *oq1) - -{ - signed char ps0, qs0; - signed char ps1, qs1; - signed char filter, Filter1, Filter2; - signed char u; - - ps1 = (signed char) * op1 ^ 0x80; - ps0 = (signed char) * op0 ^ 0x80; - qs0 = (signed char) * oq0 ^ 0x80; - qs1 = (signed char) * oq1 ^ 0x80; - - /* add outer taps if we have high edge variance */ - filter = signed_char_clamp(ps1 - qs1); - filter &= hev; - - /* inner taps */ - filter = signed_char_clamp(filter + 3 * (qs0 - ps0)); - filter &= mask; - - /* save bottom 3 bits so that we round one side +4 and the other +3 - * if it equals 4 we'll set to adjust by -1 to account for the fact - * we'd round 3 the other way - */ - Filter1 = signed_char_clamp(filter + 4); - Filter2 = signed_char_clamp(filter + 3); - Filter1 >>= 3; - Filter2 >>= 3; - u = signed_char_clamp(qs0 - Filter1); - *oq0 = u ^ 0x80; - u = signed_char_clamp(ps0 + Filter2); - *op0 = u ^ 0x80; - filter = Filter1; - - /* outer tap adjustments */ - filter += 1; - filter >>= 1; - filter &= ~hev; - - u = signed_char_clamp(qs1 - filter); - *oq1 = u ^ 0x80; - u = signed_char_clamp(ps1 + filter); - *op1 = u ^ 0x80; - -} - -void vp9_loop_filter_horizontal_edge_c -( - unsigned char *s, - int p, /* pitch */ - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count -) { - int hev = 0; /* high edge variance */ - signed char mask = 0; - int i = 0; - - /* loop filter designed to work using chars so that we can make maximum use - * of 8 bit simd instructions. - */ - do { - mask = filter_mask(limit[0], blimit[0], - s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], - s[0 * p], s[1 * p], s[2 * p], s[3 * p]); - - hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]); - - filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); - - ++s; - } while (++i < count * 8); -} - -void vp9_loop_filter_vertical_edge_c(unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count) { - int hev = 0; /* high edge variance */ - signed char mask = 0; - int i = 0; - - /* loop filter designed to work using chars so that we can make maximum use - * of 8 bit simd instructions. 
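 *
 * (Editorial aside, not part of the original patch: filter_mask() and
 * hevmask() return 0x00 or 0xff for a whole pixel, so filter() can
 * compute the update unconditionally and then discard it with
 * "filter &= mask;" wherever no filtering is wanted, the branch-free
 * select that maps one lane at a time onto 8-bit SIMD registers.)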
- */ - do { - mask = filter_mask(limit[0], blimit[0], - s[-4], s[-3], s[-2], s[-1], - s[0], s[1], s[2], s[3]); - - hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); - - filter(mask, hev, s - 2, s - 1, s, s + 1); - - s += p; - } while (++i < count * 8); -} -static __inline signed char flatmask(uc thresh, - uc p4, uc p3, uc p2, uc p1, uc p0, - uc q0, uc q1, uc q2, uc q3, uc q4) { - signed char flat = 0; - flat |= (abs(p1 - p0) > 1) * -1; - flat |= (abs(q1 - q0) > 1) * -1; - flat |= (abs(p0 - p2) > 1) * -1; - flat |= (abs(q0 - q2) > 1) * -1; - flat |= (abs(p3 - p0) > 1) * -1; - flat |= (abs(q3 - q0) > 1) * -1; - flat |= (abs(p4 - p0) > 1) * -1; - flat |= (abs(q4 - q0) > 1) * -1; - flat = ~flat; - return flat; -} - -static __inline void mbfilter(signed char mask, uc hev, uc flat, - uc *op4, uc *op3, uc *op2, uc *op1, uc *op0, - uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) { - /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ - if (flat && mask) { - unsigned char p0, q0; - unsigned char p1, q1; - unsigned char p2, q2; - unsigned char p3, q3; - unsigned char p4, q4; - - p4 = *op4; - p3 = *op3; - p2 = *op2; - p1 = *op1; - p0 = *op0; - q0 = *oq0; - q1 = *oq1; - q2 = *oq2; - q3 = *oq3; - q4 = *oq4; - - *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3; - *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3; - *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3; - *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3; - *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3; - *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3; - } else { - signed char ps0, qs0; - signed char ps1, qs1; - signed char filter, Filter1, Filter2; - signed char u; - - ps1 = (signed char) * op1 ^ 0x80; - ps0 = (signed char) * op0 ^ 0x80; - qs0 = (signed char) * oq0 ^ 0x80; - qs1 = (signed char) * oq1 ^ 0x80; - - /* add outer taps if we have high edge variance */ - filter = signed_char_clamp(ps1 - qs1); - filter &= hev; - - /* inner taps */ - filter = signed_char_clamp(filter + 3 * (qs0 - ps0)); - filter &= mask; - - Filter1 = signed_char_clamp(filter + 4); - Filter2 = signed_char_clamp(filter + 3); - Filter1 >>= 3; - Filter2 >>= 3; - - u = signed_char_clamp(qs0 - Filter1); - *oq0 = u ^ 0x80; - u = signed_char_clamp(ps0 + Filter2); - *op0 = u ^ 0x80; - filter = Filter1; - - /* outer tap adjustments */ - filter += 1; - filter >>= 1; - filter &= ~hev; - - u = signed_char_clamp(qs1 - filter); - *oq1 = u ^ 0x80; - u = signed_char_clamp(ps1 + filter); - *op1 = u ^ 0x80; - } -} -void vp9_mbloop_filter_horizontal_edge_c -( - unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count -) { - signed char hev = 0; /* high edge variance */ - signed char mask = 0; - signed char flat = 0; - int i = 0; - - /* loop filter designed to work using chars so that we can make maximum use - * of 8 bit simd instructions. 
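 *
 * (Editorial aside, not part of the original patch: when flatmask() is
 * set, mbfilter() applies the [1, 1, 1, 2, 1, 1, 1] kernel named in its
 * own comment; the new *op0, for instance, is
 *   (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3,
 * a weighted average with weights summing to 8, centred on p0.)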
- */ - do { - - mask = filter_mask(limit[0], blimit[0], - s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], - s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]); - - hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]); - - flat = flatmask(thresh[0], - s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], - s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]); - mbfilter(mask, hev, flat, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, - s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p); - - ++s; - } while (++i < count * 8); - -} -void vp9_mbloop_filter_vertical_edge_c -( - unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count -) { - signed char hev = 0; /* high edge variance */ - signed char mask = 0; - signed char flat = 0; - int i = 0; - - do { - - mask = filter_mask(limit[0], blimit[0], - s[-4], s[-3], s[-2], s[-1], - s[0], s[1], s[2], s[3]); - - hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); - flat = flatmask(thresh[0], - s[-5], s[-4], s[-3], s[-2], s[-1], - s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]); - mbfilter(mask, hev, flat, - s - 5, s - 4, s - 3, s - 2, s - 1, - s, s + 1, s + 2, s + 3, s + 4); - s += p; - } while (++i < count * 8); - -} - -/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ -static __inline signed char simple_filter_mask(uc blimit, - uc p1, uc p0, - uc q0, uc q1) { - /* Why does this cause problems for win32? - * error C2143: syntax error : missing ';' before 'type' - * (void) limit; - */ - signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; - return mask; -} - -static __inline void simple_filter(signed char mask, - uc *op1, uc *op0, - uc *oq0, uc *oq1) { - signed char filter, Filter1, Filter2; - signed char p1 = (signed char) * op1 ^ 0x80; - signed char p0 = (signed char) * op0 ^ 0x80; - signed char q0 = (signed char) * oq0 ^ 0x80; - signed char q1 = (signed char) * oq1 ^ 0x80; - signed char u; - - filter = signed_char_clamp(p1 - q1); - filter = signed_char_clamp(filter + 3 * (q0 - p0)); - filter &= mask; - - /* save bottom 3 bits so that we round one side +4 and the other +3 */ - Filter1 = signed_char_clamp(filter + 4); - Filter1 >>= 3; - u = signed_char_clamp(q0 - Filter1); - *oq0 = u ^ 0x80; - - Filter2 = signed_char_clamp(filter + 3); - Filter2 >>= 3; - u = signed_char_clamp(p0 + Filter2); - *op0 = u ^ 0x80; -} - -void vp9_loop_filter_simple_horizontal_edge_c -( - unsigned char *s, - int p, - const unsigned char *blimit -) { - signed char mask = 0; - int i = 0; - - do { - mask = simple_filter_mask(blimit[0], - s[-2 * p], s[-1 * p], - s[0 * p], s[1 * p]); - simple_filter(mask, - s - 2 * p, s - 1 * p, - s, s + 1 * p); - ++s; - } while (++i < 16); -} - -void vp9_loop_filter_simple_vertical_edge_c -( - unsigned char *s, - int p, - const unsigned char *blimit -) { - signed char mask = 0; - int i = 0; - - do { - mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); - simple_filter(mask, s - 2, s - 1, s, s + 1); - s += p; - } while (++i < 16); - -} - -/* Vertical MB Filtering */ -void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride, - lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, - lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, - lfi->mblim, lfi->lim, 
lfi->hev_thr, 1); -} - -/* Vertical B Filtering */ -void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - -/* Horizontal MB filtering */ -void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, - lfi->mblim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, - lfi->mblim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, - lfi->mblim, lfi->lim, lfi->hev_thr, 1); -} - -/* Horizontal B Filtering */ -void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - -void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_mbloop_filter_horizontal_edge_c( - y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); -} - -void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, - y_stride, blimit); -} - -void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_mbloop_filter_vertical_edge_c( - y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); -} - -void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); -} diff --git a/vp9/common/maskingmv.c b/vp9/common/maskingmv.c deleted file mode 100644 index 05cfb4a46..000000000 --- a/vp9/common/maskingmv.c +++ /dev/null @@ -1,806 +0,0 @@ -/* - ============================================================================ - Name : maskingmv.c - Author : jimbankoski - Version : - 
Copyright : Your copyright notice - Description : Hello World in C, Ansi-style - ============================================================================ - */ - -#include -#include -#include -extern unsigned int vp9_sad16x16_sse3( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - int max_err); - -extern void vp9_sad16x16x3_sse3( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - int *results); - -extern int vp8_growmaskmb_sse3( - unsigned char *om, - unsigned char *nm); - -extern void vp8_makemask_sse3( - unsigned char *y, - unsigned char *u, - unsigned char *v, - unsigned char *ym, - int yp, - int uvp, - int ys, - int us, - int vs, - int yt, - int ut, - int vt); - -unsigned int vp9_sad16x16_unmasked_wmt( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - unsigned char *mask); - -unsigned int vp9_sad16x16_masked_wmt( - unsigned char *src_ptr, - int src_stride, - unsigned char *ref_ptr, - int ref_stride, - unsigned char *mask); - -unsigned int vp8_masked_predictor_wmt( - unsigned char *masked, - unsigned char *unmasked, - int src_stride, - unsigned char *dst_ptr, - int dst_stride, - unsigned char *mask); -unsigned int vp8_masked_predictor_uv_wmt( - unsigned char *masked, - unsigned char *unmasked, - int src_stride, - unsigned char *dst_ptr, - int dst_stride, - unsigned char *mask); -unsigned int vp8_uv_from_y_mask( - unsigned char *ymask, - unsigned char *uvmask); -int yp = 16; -unsigned char sxy[] = { - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 
90, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90 -}; - -unsigned char sts[] = { - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, -}; -unsigned char str[] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 -}; - -unsigned char y[] = { - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, - 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, - 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, - 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40 -}; -int uvp = 8; -unsigned char u[] = { - 90, 80, 70, 70, 90, 90, 90, 17, - 90, 80, 70, 70, 90, 90, 90, 17, - 84, 70, 70, 90, 90, 90, 17, 17, - 84, 70, 70, 90, 90, 90, 17, 17, - 80, 70, 70, 90, 90, 90, 17, 17, - 90, 80, 70, 70, 90, 90, 90, 17, - 90, 80, 70, 70, 90, 90, 90, 17, - 90, 80, 70, 70, 90, 90, 90, 17 -}; - -unsigned char v[] = { - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80, - 80, 80, 80, 80, 80, 80, 80, 80 -}; - -unsigned char ym[256]; -unsigned char uvm[64]; -typedef 
struct { - unsigned char y; - unsigned char yt; - unsigned char u; - unsigned char ut; - unsigned char v; - unsigned char vt; - unsigned char use; -} COLOR_SEG_ELEMENT; - -/* -COLOR_SEG_ELEMENT segmentation[]= -{ - { 60,4,80,17,80,10, 1}, - { 40,4,15,10,80,10, 1}, -}; -*/ - -COLOR_SEG_ELEMENT segmentation[] = { - { 79, 44, 92, 44, 237, 60, 1}, -}; - -unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v, - COLOR_SEG_ELEMENT sgm[], - int c) { - COLOR_SEG_ELEMENT *s = sgm; - unsigned char m = 0; - int i; - for (i = 0; i < c; i++, s++) - m |= (abs(y - s->y) < s->yt && - abs(u - s->u) < s->ut && - abs(v - s->v) < s->vt ? 255 : 0); - - return m; -} -int neighbors[256][8]; -int makeneighbors(void) { - int i, j; - for (i = 0; i < 256; i++) { - int r = (i >> 4), c = (i & 15); - int ni = 0; - for (j = 0; j < 8; j++) - neighbors[i][j] = i; - for (j = 0; j < 256; j++) { - int nr = (j >> 4), nc = (j & 15); - if (abs(nr - r) < 2 && abs(nc - c) < 2) - neighbors[i][ni++] = j; - } - } - return 0; -} -void grow_ymask(unsigned char *ym) { - unsigned char nym[256]; - int i, j; - - for (i = 0; i < 256; i++) { - nym[i] = ym[i]; - for (j = 0; j < 8; j++) { - nym[i] |= ym[neighbors[i][j]]; - } - } - for (i = 0; i < 256; i++) - ym[i] = nym[i]; -} -void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v, - unsigned char *ym, unsigned char *uvm, - int yp, int uvp, - COLOR_SEG_ELEMENT sgm[], - int count) { - int r, c; - unsigned char *oym = ym; - - memset(ym, 20, 256); - for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32) - for (c = 0; c < 8; c++) { - int y1 = y[c << 1]; - int u1 = u[c]; - int v1 = v[c]; - int m = pixel_mask(y1, u1, v1, sgm, count); - uvm[c] = m; - ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count); - ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count); - ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count); - ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count); - } - grow_ymask(oym); -} - -int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp, - unsigned char *ym) { - int i, j; - unsigned sad = 0; - for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16) - for (j = 0; j < 16; j++) - if (ym[j]) - sad += abs(src[j] - dst[j]); - - return sad; -} - -int compare_masks(unsigned char *sym, unsigned char *ym) { - int i, j; - unsigned sad = 0; - for (i = 0; i < 16; i++, sym += 16, ym += 16) - for (j = 0; j < 16; j++) - sad += (sym[j] != ym[j] ? 
1 : 0); - - return sad; -} -int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp, - unsigned char *ym) { - int i, j; - unsigned sad = 0; - for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16) - for (j = 0; j < 16; j++) - if (!ym[j]) - sad += abs(src[j] - dst[j]); - - return sad; -} -int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, - int yp, int uvp, - unsigned char *dy, unsigned char *du, unsigned char *dv, - int dyp, int duvp, - COLOR_SEG_ELEMENT sgm[], - int count, - int *mi, - int *mj, - int *ui, - int *uj, - int *wm) { - int i, j; - - unsigned char ym[256]; - unsigned char uvm[64]; - unsigned char dym[256]; - unsigned char duvm[64]; - unsigned int e = 0; - int beste = 256; - int bmi = -32, bmj = -32; - int bui = -32, buj = -32; - int beste1 = 256; - int bmi1 = -32, bmj1 = -32; - int bui1 = -32, buj1 = -32; - int obeste; - - // first try finding best mask and then unmasked - beste = 0xffffffff; - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); - - e = unmasked_sad(y, yp, dyz + j, dyp, dym); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - // bui=0;buj=0; - // best mv masked destination - make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, - dym, duvm, dyp, duvp, sgm, count); - - obeste = beste; - beste = 0xffffffff; - - // find best masked - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = masked_sad(y, yp, dyz + j, dyp, dym); - - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - beste1 = beste + obeste; - bmi1 = bmi; - bmj1 = bmj; - bui1 = bui; - buj1 = buj; - - beste = 0xffffffff; - // source mask - make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count); - - // find best mask - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); - - e = compare_masks(ym, dym); - - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - - - // best mv masked destination - make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, - dym, duvm, dyp, duvp, sgm, count); - - obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym); - - beste = 0xffffffff; - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = unmasked_sad(y, yp, dyz + j, dyp, dym); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - beste += obeste; - - - if (beste < beste1) { - *mi = bmi; - *mj = bmj; - *ui = bui; - *uj = buj; - *wm = 1; - } else { - *mi = bmi1; - *mj = bmj1; - *ui = bui1; - *uj = buj1; - *wm = 0; - - } - return 0; -} - -int predict(unsigned char *src, int p, unsigned char *dst, int dp, - unsigned char *ym, unsigned char *prd) { - int i, j; - for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16) - for (j = 0; j < 16; j++) - prd[j] = (ym[j] ? 
src[j] : dst[j]); - return 0; -} - -int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, - int yp, int uvp, - unsigned char *dy, unsigned char *du, unsigned char *dv, - int dyp, int duvp, - COLOR_SEG_ELEMENT sgm[], - int count, - int *mi, - int *mj, - int *ui, - int *uj, - int *wm) { - int i, j; - - unsigned char ym[256]; - unsigned char ym2[256]; - unsigned char uvm[64]; - unsigned char dym2[256]; - unsigned char dym[256]; - unsigned char duvm[64]; - unsigned int e = 0; - int beste = 256; - int bmi = -32, bmj = -32; - int bui = -32, buj = -32; - int beste1 = 256; - int bmi1 = -32, bmj1 = -32; - int bui1 = -32, buj1 = -32; - int obeste; - - // first try finding best mask and then unmasked - beste = 0xffffffff; - -#if 0 - for (i = 0; i < 16; i++) { - unsigned char *dy = i * yp + y; - for (j = 0; j < 16; j++) - printf("%2x", dy[j]); - printf("\n"); - } - printf("\n"); - - for (i = -32; i < 48; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 48; j++) - printf("%2x", dyz[j]); - printf("\n"); - } -#endif - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - // bui=0;buj=0; - // best mv masked destination - - vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, - dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - obeste = beste; - beste = 0xffffffff; - - // find best masked - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2); - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - beste1 = beste + obeste; - bmi1 = bmi; - bmj1 = bmj; - bui1 = bui; - buj1 = buj; - - // source mask - vp8_makemask_sse3(y, u, v, - ym, yp, uvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(ym, ym2); - - // find best mask - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - unsigned char *duz = i / 2 * duvp + du; - unsigned char *dvz = i / 2 * duvp + dv; - for (j = -32; j < 32; j++) { - // 0,0 masked destination - vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - e = compare_masks(ym2, dym2); - - if (e < beste) { - bmi = i; - bmj = j; - beste = e; - } - } - } - - vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, - dym, dyp, duvp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - - obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2); - - beste = 0xffffffff; - - // find best unmasked mv - for (i = -32; i < 32; i++) { - unsigned char *dyz = i * dyp + dy; - for (j = -32; j < 32; j++) { - e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); - - if (e < beste) { - bui = i; - buj = j; - beste = e; - } - } - } - beste += obeste; - - if (beste < 
beste1) { - *mi = bmi; - *mj = bmj; - *ui = bui; - *uj = buj; - *wm = 1; - } else { - *mi = bmi1; - *mj = bmj1; - *ui = bui1; - *uj = buj1; - *wm = 0; - beste = beste1; - - } - return beste; -} - -int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm, - int ymp, int uvmp, - unsigned char *yp, unsigned char *up, unsigned char *vp, - int ypp, int uvpp, - COLOR_SEG_ELEMENT sgm[], - int count, - int mi, - int mj, - int ui, - int uj, - int wm) { - int i, j; - unsigned char dym[256]; - unsigned char dym2[256]; - unsigned char duvm[64]; - unsigned char *yu = ym, *uu = um, *vu = vm; - - unsigned char *dym3 = dym2; - - ym += mi * ymp + mj; - um += mi / 2 * uvmp + mj / 2; - vm += mi / 2 * uvmp + mj / 2; - - yu += ui * ymp + uj; - uu += ui / 2 * uvmp + uj / 2; - vu += ui / 2 * uvmp + uj / 2; - - // best mv masked destination - if (wm) - vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - else - vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp, - sgm[0].y, sgm[0].u, sgm[0].v, - sgm[0].yt, sgm[0].ut, sgm[0].vt); - - vp8_growmaskmb_sse3(dym, dym2); - vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3); - vp8_uv_from_y_mask(dym3, duvm); - vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm); - vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm); - - return 0; -} - -unsigned char f0p[1280 * 720 * 3 / 2]; -unsigned char f1p[1280 * 720 * 3 / 2]; -unsigned char prd[1280 * 720 * 3 / 2]; -unsigned char msk[1280 * 720 * 3 / 2]; - - -int mainz(int argc, char *argv[]) { - - FILE *f = fopen(argv[1], "rb"); - FILE *g = fopen(argv[2], "wb"); - int w = atoi(argv[3]), h = atoi(argv[4]); - int y_stride = w, uv_stride = w / 2; - int r, c; - unsigned char *f0 = f0p, *f1 = f1p, *t; - unsigned char ym[256], uvm[64]; - unsigned char ym2[256], uvm2[64]; - unsigned char ym3[256], uvm3[64]; - int a, b; - - COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best; -#if 0 - makeneighbors(); - COLOR_SEG_ELEMENT segmentation[] = { - { 60, 4, 80, 17, 80, 10, 1}, - { 40, 4, 15, 10, 80, 10, 1}, - }; - make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1); - - vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8, - (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v, - segmentation[0].yt, segmentation[0].ut, segmentation[0].vt); - - vp8_growmaskmb_sse3(ym, ym3); - - a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3); - b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3); - - vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3); - - vp8_uv_from_y_mask(ym3, uvm3); - - return 4; -#endif - makeneighbors(); - - - memset(prd, 128, w * h * 3 / 2); - - fread(f0, w * h * 3 / 2, 1, f); - - while (!feof(f)) { - unsigned char *ys = f1, *yd = f0, *yp = prd; - unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h; - unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4; - fread(f1, w * h * 3 / 2, 1, f); - - ys += 32 * y_stride; - yd += 32 * y_stride; - yp += 32 * y_stride; - us += 16 * uv_stride; - ud += 16 * uv_stride; - up += 16 * uv_stride; - vs += 16 * uv_stride; - vd += 16 * uv_stride; - vp += 16 * uv_stride; - for (r = 32; r < h - 32; r += 16, - ys += 16 * w, yd += 16 * w, yp += 16 * w, - us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride, - vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) { - for (c = 32; c < w - 32; c += 16) { - int mi, mj, ui, uj, wm; - int bmi, bmj, bui, buj, bwm; - unsigned char ym[256]; - - if (vp9_sad16x16_sse3(ys + c, y_stride, 
yd + c, y_stride, 0xffff) == 0)
-          bmi = bmj = bui = buj = bwm = 0;
-        else {
-          COLOR_SEG_ELEMENT cs[5];
-          int j;
-          unsigned int beste = 0xfffffff;
-          unsigned int bestj = 0;
-
-          // try color from last mb segmentation
-          cs[0] = last;
-
-          // try color segs from 4 pixels in mb recon as segmentation
-          cs[1].y = yd[c + y_stride + 1];
-          cs[1].u = ud[c / 2 + uv_stride];
-          cs[1].v = vd[c / 2 + uv_stride];
-          cs[1].yt = cs[1].ut = cs[1].vt = 20;
-          cs[2].y = yd[c + w + 14];
-          cs[2].u = ud[c / 2 + uv_stride + 7];
-          cs[2].v = vd[c / 2 + uv_stride + 7];
-          cs[2].yt = cs[2].ut = cs[2].vt = 20;
-          cs[3].y = yd[c + w * 14 + 1];
-          cs[3].u = ud[c / 2 + uv_stride * 7];
-          cs[3].v = vd[c / 2 + uv_stride * 7];
-          cs[3].yt = cs[3].ut = cs[3].vt = 20;
-          cs[4].y = yd[c + w * 14 + 14];
-          cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
-          cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
-          cs[4].yt = cs[4].ut = cs[4].vt = 20;
-
-          for (j = 0; j < 5; j++) {
-            int e;
-
-            e = fast_masked_motion_search(
-                  ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
-                  yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
-                  &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
-
-            if (e < beste) {
-              bmi = mi;
-              bmj = mj;
-              bui = ui;
-              buj = uj, bwm = wm;
-              bestj = j;
-              beste = e;
-            }
-          }
-          best = cs[bestj];
-          // best = segmentation[0];
-          last = best;
-        }
-        predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
-                    yp + c, up + c / 2, vp + c / 2, w, uv_stride,
-                    &best, 1, bmi, bmj, bui, buj, bwm);
-
-      }
-    }
-    fwrite(prd, w * h * 3 / 2, 1, g);
-    t = f0;
-    f0 = f1;
-    f1 = t;
-
-  }
-  fclose(f);
-  fclose(g);
-  return 0;
-}
diff --git a/vp9/common/mbpitch.c b/vp9/common/mbpitch.c
deleted file mode 100644
index 5ef4dd618..000000000
--- a/vp9/common/mbpitch.c
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */ - - -#include "blockd.h" - -typedef enum { - PRED = 0, - DEST = 1 -} BLOCKSET; - -static void setup_block -( - BLOCKD *b, - int mv_stride, - unsigned char **base, - unsigned char **base2, - int Stride, - int offset, - BLOCKSET bs -) { - - if (bs == DEST) { - b->dst_stride = Stride; - b->dst = offset; - b->base_dst = base; - } else { - b->pre_stride = Stride; - b->pre = offset; - b->base_pre = base; - b->base_second_pre = base2; - } - -} - - -static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) { - int block; - - unsigned char **y, **u, **v; - unsigned char **y2, **u2, **v2; - BLOCKD *blockd = xd->block; - int stride; - - if (bs == DEST) { - y = &xd->dst.y_buffer; - u = &xd->dst.u_buffer; - v = &xd->dst.v_buffer; - } else { - y = &xd->pre.y_buffer; - u = &xd->pre.u_buffer; - v = &xd->pre.v_buffer; - - y2 = &xd->second_pre.y_buffer; - u2 = &xd->second_pre.u_buffer; - v2 = &xd->second_pre.v_buffer; - } - - stride = xd->dst.y_stride; - for (block = 0; block < 16; block++) { /* y blocks */ - setup_block(&blockd[block], stride, y, y2, stride, - (block >> 2) * 4 * stride + (block & 3) * 4, bs); - } - - stride = xd->dst.uv_stride; - for (block = 16; block < 20; block++) { /* U and V blocks */ - setup_block(&blockd[block], stride, u, u2, stride, - ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs); - - setup_block(&blockd[block + 4], stride, v, v2, stride, - ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs); - } -} - -void vp9_setup_block_dptrs(MACROBLOCKD *xd) { - int r, c; - BLOCKD *blockd = xd->block; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4]; - blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4; - } - } - - for (r = 0; r < 2; r++) { - for (c = 0; c < 2; c++) { - blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4]; - blockd[16 + r * 2 + c].predictor = - xd->predictor + 256 + r * 4 * 8 + c * 4; - - } - } - - for (r = 0; r < 2; r++) { - for (c = 0; c < 2; c++) { - blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4]; - blockd[20 + r * 2 + c].predictor = - xd->predictor + 320 + r * 4 * 8 + c * 4; - - } - } - - blockd[24].diff = &xd->diff[384]; - - for (r = 0; r < 25; r++) { - blockd[r].qcoeff = xd->qcoeff + r * 16; - blockd[r].dqcoeff = xd->dqcoeff + r * 16; - } -} - -void vp9_build_block_doffsets(MACROBLOCKD *xd) { - - /* handle the destination pitch features */ - setup_macroblock(xd, DEST); - setup_macroblock(xd, PRED); -} diff --git a/vp9/common/modecont.c b/vp9/common/modecont.c deleted file mode 100644 index 1a71eceb5..000000000 --- a/vp9/common/modecont.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "entropy.h" - -const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4] = { - {223, 1, 1, 237}, // 0,0 best: Only candidate - {87, 166, 26, 219}, // 0,0 best: non zero candidates - {89, 67, 18, 125}, // 0,0 best: non zero candidates, split - {16, 141, 69, 226}, // strong nz candidate(s), no split - {35, 122, 14, 227}, // weak nz candidate(s), no split - {14, 122, 22, 164}, // strong nz candidate(s), split - {16, 70, 9, 183}, // weak nz candidate(s), split -}; -const int vp9_default_mode_contexts_a[INTER_MODE_CONTEXTS][4] = { - {204, 1, 1, 213}, // 0,0 best: Only candidate - {106, 139, 22, 203}, // 0,0 best: non zero candidates - {75, 52, 15, 118}, // 0,0 best: non zero candidates, split - {12, 148, 61, 211}, // strong nz candidate(s), no split - {18, 98, 17, 199}, // weak nz candidate(s), no split - {11, 91, 25, 148}, // strong nz candidate(s), split - {10, 53, 9, 145}, // weak nz candidate(s), split -}; diff --git a/vp9/common/modecont.h b/vp9/common/modecont.h deleted file mode 100644 index 1fa4558e1..000000000 --- a/vp9/common/modecont.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_MODECONT_H -#define __INC_MODECONT_H - -extern const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4]; -extern const int vp9_default_mode_contexts_a[INTER_MODE_CONTEXTS][4]; -#endif diff --git a/vp9/common/modecontext.c b/vp9/common/modecontext.c deleted file mode 100644 index 522498609..000000000 --- a/vp9/common/modecontext.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "entropymode.h" - -const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES] = { - { - /*Above Mode : 0*/ - { 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, /* left_mode 0 */ - { 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, /* left_mode 1 */ - { 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, /* left_mode 2 */ - { 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, /* left_mode 3 */ - { 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, /* left_mode 4 */ - { 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, /* left_mode 5 */ - { 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, /* left_mode 6 */ - { 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, /* left_mode 7 */ - { 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, /* left_mode 8 */ - { 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, /* left_mode 9 */ - }, - { - /*Above Mode : 1*/ - { 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, /* left_mode 0 */ - { 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, /* left_mode 1 */ - { 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, /* left_mode 2 */ - { 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, /* left_mode 3 */ - { 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, /* left_mode 4 */ - { 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, /* left_mode 5 */ - { 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, /* left_mode 6 */ - { 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, /* left_mode 7 */ - { 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, /* left_mode 8 */ - { 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, /* left_mode 9 */ - }, - { - /*Above Mode : 2*/ - { 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, /* left_mode 0 */ - { 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, /* left_mode 1 */ - { 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, /* left_mode 2 */ - { 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, /* left_mode 3 */ - { 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, /* left_mode 4 */ - { 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, /* left_mode 5 */ - { 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, /* left_mode 6 */ - { 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, /* left_mode 7 */ - { 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, /* left_mode 8 */ - { 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, /* left_mode 9 */ - }, - { - /*Above Mode : 3*/ - { 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, /* left_mode 0 */ - { 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, /* left_mode 1 */ - { 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, /* left_mode 2 */ - { 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, /* left_mode 3 */ - { 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, /* left_mode 4 */ - { 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, /* left_mode 5 */ - { 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, /* left_mode 6 */ - { 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, /* left_mode 7 */ - { 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, /* left_mode 8 */ - { 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, /* left_mode 9 */ - }, - { - /*Above Mode : 4*/ - { 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, /* left_mode 0 */ - { 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, /* left_mode 1 */ - { 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, /* left_mode 2 */ - { 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, /* left_mode 3 */ - { 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, /* left_mode 4 */ - { 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, /* left_mode 5 */ - { 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, /* left_mode 6 */ - { 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, /* left_mode 7 */ - { 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, /* left_mode 8 */ - { 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, /* left_mode 9 */ - }, - { - /*Above Mode : 5*/ - { 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, /* left_mode 0 */ - { 59, 
57, 25, 18, 3, 17, 21, 6, 14, 6, }, /* left_mode 1 */ - { 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, /* left_mode 2 */ - { 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, /* left_mode 3 */ - { 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, /* left_mode 4 */ - { 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, /* left_mode 5 */ - { 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, /* left_mode 6 */ - { 11, 4, 8, 1, 1, 4, 4, 5, 4, 1, }, /* left_mode 7 */ - { 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, /* left_mode 8 */ - { 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, /* left_mode 9 */ - }, - { - /*Above Mode : 6*/ - { 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, /* left_mode 0 */ - { 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, /* left_mode 1 */ - { 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, /* left_mode 2 */ - { 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, /* left_mode 3 */ - { 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, /* left_mode 4 */ - { 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, /* left_mode 5 */ - { 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, /* left_mode 6 */ - { 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, /* left_mode 7 */ - { 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, /* left_mode 8 */ - { 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, /* left_mode 9 */ - }, - { - /*Above Mode : 7*/ - { 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, /* left_mode 0 */ - { 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, /* left_mode 1 */ - { 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, /* left_mode 2 */ - { 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, /* left_mode 3 */ - { 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, /* left_mode 4 */ - { 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, /* left_mode 5 */ - { 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, /* left_mode 6 */ - { 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, /* left_mode 7 */ - { 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, /* left_mode 8 */ - { 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, /* left_mode 9 */ - }, - { - /*Above Mode : 8*/ - { 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, /* left_mode 0 */ - { 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, /* left_mode 1 */ - { 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, /* left_mode 2 */ - { 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, /* left_mode 3 */ - { 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, /* left_mode 4 */ - { 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, /* left_mode 5 */ - { 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, /* left_mode 6 */ - { 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, /* left_mode 7 */ - { 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, /* left_mode 8 */ - { 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, /* left_mode 9 */ - }, - { - /*Above Mode : 9*/ - { 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, /* left_mode 0 */ - { 113, 112, 21, 22, 10, 2, 8, 4, 6, 42, }, /* left_mode 1 */ - { 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, /* left_mode 2 */ - { 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, /* left_mode 3 */ - { 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, /* left_mode 4 */ - { 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, /* left_mode 5 */ - { 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, /* left_mode 6 */ - { 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, /* left_mode 7 */ - { 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, /* left_mode 8 */ - { 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, /* left_mode 9 */ - }, -}; diff --git a/vp9/common/mv.h b/vp9/common/mv.h deleted file mode 100644 index bbe6d2c8b..000000000 --- a/vp9/common/mv.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_MV_H
-#define __INC_MV_H
-#include "vpx/vpx_integer.h"
-
-typedef struct {
-  short row;
-  short col;
-} MV;
-
-typedef union int_mv {
-  uint32_t as_int;
-  MV as_mv;
-} int_mv; /* facilitates faster equality tests and copies */
-
-#endif
diff --git a/vp9/common/mvref_common.c b/vp9/common/mvref_common.c
deleted file mode 100644
index ebb8fa4bd..000000000
--- a/vp9/common/mvref_common.c
+++ /dev/null
@@ -1,398 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "mvref_common.h"
-
-#define MVREF_NEIGHBOURS 8
-static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-  {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
-  {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
-};
-static int mb_ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 3, 3, 2, 1, 1, 1, 1, 1 };
-#if CONFIG_SUPERBLOCKS
-static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
-  {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
-  {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}
-};
-static int sb_ref_distance_weight[MVREF_NEIGHBOURS] =
-  { 3, 3, 2, 2, 2, 1, 1, 1 };
-#endif
-// clamp_mv
-#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
-static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
-
-  if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
-    mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
-  else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
-    mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;
-
-  if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))
-    mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;
-  else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)
-    mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
-}
-
-
-// Gets a best matching candidate reference motion vector
-// from the given mode info structure (if available)
-static int get_candidate_mvref(
-  const MODE_INFO *candidate_mi,
-  MV_REFERENCE_FRAME ref_frame,
-  MV_REFERENCE_FRAME *c_ref_frame,
-  int_mv *c_mv,
-  MV_REFERENCE_FRAME *c2_ref_frame,
-  int_mv *c2_mv
-) {
-
-  int ret_val = FALSE;
-  c2_mv->as_int = 0;
-  *c2_ref_frame = INTRA_FRAME;
-
-  // Target ref frame matches candidate first ref frame
-  if (ref_frame == candidate_mi->mbmi.ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-    *c_ref_frame = ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non zero vector we can use.
-    if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[1].as_int != 0) &&
-        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    }
-
-  // Target ref frame matches candidate second ref frame
-  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-    *c_ref_frame = ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non zero vector we can use.
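    // (Editorial aside, not part of the original patch: in each branch the
    // second candidate is returned only when it is non-zero and differs
    // from the first vector chosen, so a single neighbouring macroblock
    // never contributes the same vector twice.)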
-    if ((candidate_mi->mbmi.ref_frame > INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[0].as_int != 0) &&
-        (candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.ref_frame;
-    }
-
-    // No ref frame matches so use first ref mv as first choice
-  } else if (candidate_mi->mbmi.ref_frame > INTRA_FRAME) {
-    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-    *c_ref_frame = candidate_mi->mbmi.ref_frame;
-    ret_val = TRUE;
-
-    // Is there a second non zero vector we can use.
-    if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
-        (candidate_mi->mbmi.mv[1].as_int != 0) &&
-        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
-      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    }
-
-    // If only the second ref mv is valid (should not trigger in the current
-    // code base given the possible compound prediction options).
-  } else if (candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) {
-    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-    *c_ref_frame = candidate_mi->mbmi.second_ref_frame;
-    ret_val = TRUE;
-  }
-
-  return ret_val;
-}
-
-// Performs mv adjustment based on reference frame and clamps the MV
-// if it goes off the edge of the buffer.
-static void scale_mv(
-  MACROBLOCKD *xd,
-  MV_REFERENCE_FRAME this_ref_frame,
-  MV_REFERENCE_FRAME candidate_ref_frame,
-  int_mv *candidate_mv,
-  int *ref_sign_bias
-) {
-
-  if (candidate_ref_frame != this_ref_frame) {
-
-    //int frame_distances[MAX_REF_FRAMES];
-    //int last_distance = 1;
-    //int gf_distance = xd->frames_since_golden;
-    //int arf_distance = xd->frames_till_alt_ref_frame;
-
-    // Sign inversion where appropriate.
-    if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
-      candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
-      candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
-    }
-
-    // Scale based on frame distance if the reference frames are not the same.
-    /*frame_distances[INTRA_FRAME] = 1;   // should never be used
-    frame_distances[LAST_FRAME] = 1;
-    frame_distances[GOLDEN_FRAME] =
-      (xd->frames_since_golden) ? xd->frames_since_golden : 1;
-    frame_distances[ALTREF_FRAME] =
-      (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
-
-    if (frame_distances[this_ref_frame] &&
-        frame_distances[candidate_ref_frame]) {
-      candidate_mv->as_mv.row =
-        (short)(((int)(candidate_mv->as_mv.row) *
-                 frame_distances[this_ref_frame]) /
-                frame_distances[candidate_ref_frame]);
-
-      candidate_mv->as_mv.col =
-        (short)(((int)(candidate_mv->as_mv.col) *
-                 frame_distances[this_ref_frame]) /
-                frame_distances[candidate_ref_frame]);
-    }
-    */
-  }
-
-  // Clamp the MV so it does not point out of the frame buffer
-  clamp_mv(xd, candidate_mv);
-}
-
-// Adds a new candidate reference vector to the list if indeed it is new.
-// If it is not new then the score of the existing candidate that it matches
-// is increased and the list is resorted.
-static void addmv_and_shuffle(
-  int_mv *mv_list,
-  int *mv_scores,
-  int *index,
-  int_mv candidate_mv,
-  int weight
-) {
-
-  int i = *index;
-  int duplicate_found = FALSE;
-
-  // Check for duplicates. If there is one, increment its score.
-  // A duplicate is defined as an exact (as_int) match with a vector
-  // already in the list.
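-  // For illustration: with the list holding {A:8, B:5}, vector B arriving
-  // again with weight 4 is caught by the scan below, its score becomes 9,
-  // and the reshuffle bubbles it above A, giving {B:9, A:8}. A genuinely
-  // new vector C with weight 2 is appended and stays last: {B:9, A:8, C:2}.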
-  while (i > 0) {
-    i--;
-
-    if (candidate_mv.as_int == mv_list[i].as_int) {
-      duplicate_found = TRUE;
-      mv_scores[i] += weight;
-      break;
-    }
-  }
-
-  // If no duplicate was found add the new vector and give it a weight
-  if (!duplicate_found) {
-    mv_list[*index].as_int = candidate_mv.as_int;
-    mv_scores[*index] = weight;
-    i = *index;
-    (*index)++;
-  }
-
-  // Reshuffle the list so that the highest scoring mvs are at the top.
-  while (i > 0) {
-    if (mv_scores[i] > mv_scores[i-1]) {
-      int tmp_score = mv_scores[i-1];
-      int_mv tmp_mv = mv_list[i-1];
-
-      mv_scores[i-1] = mv_scores[i];
-      mv_list[i-1] = mv_list[i];
-      mv_scores[i] = tmp_score;
-      mv_list[i] = tmp_mv;
-      i--;
-    } else
-      break;
-  }
-}
-
-// This function searches the neighbourhood of a given MB/SB and populates a
-// list of candidate reference vectors.
-//
-void vp9_find_mv_refs(
-  MACROBLOCKD *xd,
-  MODE_INFO *here,
-  MODE_INFO *lf_here,
-  MV_REFERENCE_FRAME ref_frame,
-  int_mv *mv_ref_list,
-  int *ref_sign_bias
-) {
-
-  int i;
-  MODE_INFO *candidate_mi;
-  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
-  int_mv candidate_mvs[MAX_MV_REFS];
-  int_mv c_refmv;
-  MV_REFERENCE_FRAME c_ref_frame;
-  int_mv c2_refmv;
-  MV_REFERENCE_FRAME c2_ref_frame;
-  int candidate_scores[MAX_MV_REFS];
-  int index = 0;
-  int split_count = 0;
-  int ref_weight = 0;
-  int valid_mv_ref;
-  int (*mv_ref_search)[2];
-  int *ref_distance_weight;
-
-  // Blank the reference vector lists and other local structures.
-  vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
-  vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
-  vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
-
-#if CONFIG_SUPERBLOCKS
-  if (mbmi->encoded_as_sb) {
-    mv_ref_search = sb_mv_ref_search;
-    ref_distance_weight = sb_ref_distance_weight;
-  } else {
-    mv_ref_search = mb_mv_ref_search;
-    ref_distance_weight = mb_ref_distance_weight;
-  }
-#else
-  mv_ref_search = mb_mv_ref_search;
-  ref_distance_weight = mb_ref_distance_weight;
-#endif
-  // Populate a list with candidate reference vectors from the
-  // spatial neighbours.
-  for (i = 0; i < 2; ++i) {
-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
-      candidate_mi = here + mv_ref_search[i][0] +
-                     (mv_ref_search[i][1] * xd->mode_info_stride);
-
-      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                         &c_ref_frame, &c_refmv,
-                                         &c2_ref_frame, &c2_refmv);
-
-      // If there is a valid MV candidate then add it to the list
-      if (valid_mv_ref) {
-        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-        ref_weight = ref_distance_weight[i] +
-                     ((c_ref_frame == ref_frame) << 4);
-        split_count += (candidate_mi->mbmi.mode == SPLITMV);
-
-        addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_weight);
-
-        // If there is a second valid mv then add it as well.
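-        // (ref_weight mixes spatial distance with a same-frame bonus: the
-        // two nearest neighbours carry distance weight 3, and a candidate
-        // on the same reference frame as ref_frame gets +16, so e.g. the
-        // left neighbour on a matching reference scores 3 + 16 = 19.)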
-        if (c2_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-          ref_weight = ref_distance_weight[i] +
-                       ((c2_ref_frame == ref_frame) << 4);
-
-          addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c2_refmv, ref_weight);
-        }
-      }
-    }
-  }
-
-  // Look at the corresponding vector in the last frame
-  candidate_mi = lf_here;
-  valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                     &c_ref_frame, &c_refmv,
-                                     &c2_ref_frame, &c2_refmv);
-
-  // If there is a valid MV candidate then add it to the list
-  if (valid_mv_ref) {
-    scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-    ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
-    addmv_and_shuffle(candidate_mvs, candidate_scores,
-                      &index, c_refmv, ref_weight);
-
-    // If there is a second valid mv then add it as well.
-    if (c2_ref_frame != INTRA_FRAME) {
-      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-      ref_weight = 2 + ((c2_ref_frame == ref_frame) << 4);
-
-      addmv_and_shuffle(candidate_mvs, candidate_scores,
-                        &index, c2_refmv, ref_weight);
-    }
-  }
-
-  // Populate a list with candidate reference vectors from the
-  // spatial neighbours.
-  for (i = 2; (i < MVREF_NEIGHBOURS) && (index < (MAX_MV_REFS - 2)); ++i) {
-    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
-        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
-
-      candidate_mi = here + mv_ref_search[i][0] +
-                     (mv_ref_search[i][1] * xd->mode_info_stride);
-
-      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
-                                         &c_ref_frame, &c_refmv,
-                                         &c2_ref_frame, &c2_refmv);
-
-      // If there is a valid MV candidate then add it to the list
-      if (valid_mv_ref) {
-        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-        ref_weight = ref_distance_weight[i] +
-                     ((c_ref_frame == ref_frame) << 4);
-
-        addmv_and_shuffle(candidate_mvs, candidate_scores,
-                          &index, c_refmv, ref_weight);
-
-        // If there is a second valid mv then add it as well.
-        if (c2_ref_frame != INTRA_FRAME) {
-          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-          ref_weight = ref_distance_weight[i] +
-                       ((c2_ref_frame == ref_frame) << 4);
-
-          addmv_and_shuffle(candidate_mvs, candidate_scores,
-                            &index, c2_refmv, ref_weight);
-        }
-      }
-    }
-  }
-
-  // Make sure we are able to add 0,0
-  if (index > (MAX_MV_REFS - 1)) {
-    index = (MAX_MV_REFS - 1);
-  }
-
-  // Define inter mode coding context.
-  // 0,0 was best
-  if (candidate_mvs[0].as_int == 0) {
-    // 0,0 is only candidate
-    if (index <= 1) {
-      mbmi->mb_mode_context[ref_frame] = 0;
-      // non zero candidates available
-    } else if (split_count == 0) {
-      mbmi->mb_mode_context[ref_frame] = 1;
-    } else {
-      mbmi->mb_mode_context[ref_frame] = 2;
-    }
-    // Non zero best, No Split MV cases
-  } else if (split_count == 0) {
-    if (candidate_scores[0] >= 32) {
-      mbmi->mb_mode_context[ref_frame] = 3;
-    } else {
-      mbmi->mb_mode_context[ref_frame] = 4;
-    }
-    // Non zero best, some split mv
-  } else {
-    if (candidate_scores[0] >= 32) {
-      mbmi->mb_mode_context[ref_frame] = 5;
-    } else {
-      mbmi->mb_mode_context[ref_frame] = 6;
-    }
-  }
-
-  // 0,0 is always a valid reference.
-  for (i = 0; i < index; ++i) {
-    if (candidate_mvs[i].as_int == 0)
-      break;
-  }
-  if (i == index) {
-    c_refmv.as_int = 0;
-    addmv_and_shuffle(candidate_mvs, candidate_scores,
-                      &index, c_refmv, candidate_scores[3] + 1);
-  }
-
-  // Copy over the candidate list.
-  vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
-}
diff --git a/vp9/common/mvref_common.h b/vp9/common/mvref_common.h
deleted file mode 100644
index 06050406b..000000000
--- a/vp9/common/mvref_common.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "onyxc_int.h"
-#include "blockd.h"
-
-
-#ifndef __INC_MVREF_COMMON_H
-#define __INC_MVREF_COMMON_H
-
-void vp9_find_mv_refs(
-  MACROBLOCKD *xd,
-  MODE_INFO *here,
-  MODE_INFO *lf_here,
-  MV_REFERENCE_FRAME ref_frame,
-  int_mv *mv_ref_list,
-  int *ref_sign_bias
-);
-
-#endif
-
diff --git a/vp9/common/onyx.h b/vp9/common/onyx.h
deleted file mode 100644
index 0f84e8cec..000000000
--- a/vp9/common/onyx.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_ONYX_H
-#define __INC_ONYX_H
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-#include "vpx/internal/vpx_codec_internal.h"
-#include "vpx/vp8cx.h"
-#include "vpx_scale/yv12config.h"
-#include "type_aliases.h"
-#include "ppflags.h"
-  typedef int *VP9_PTR;
-
-  /* Create/destroy static data structures. */
-
-  typedef enum {
-    NORMAL    = 0,
-    FOURFIVE  = 1,
-    THREEFIVE = 2,
-    ONETWO    = 3
-
-  } VPX_SCALING;
-
-  typedef enum {
-    VP9_LAST_FLAG = 1,
-    VP9_GOLD_FLAG = 2,
-    VP9_ALT_FLAG = 4
-  } VP9_REFFRAME;
-
-
-  typedef enum {
-    USAGE_STREAM_FROM_SERVER = 0x0,
-    USAGE_LOCAL_FILE_PLAYBACK = 0x1,
-    USAGE_CONSTRAINED_QUALITY = 0x2
-  } END_USAGE;
-
-
-  typedef enum {
-    MODE_GOODQUALITY = 0x1,
-    MODE_BESTQUALITY = 0x2,
-    MODE_FIRSTPASS = 0x3,
-    MODE_SECONDPASS = 0x4,
-    MODE_SECONDPASS_BEST = 0x5,
-  } MODE;
-
-  typedef enum {
-    FRAMEFLAGS_KEY = 1,
-    FRAMEFLAGS_GOLDEN = 2,
-    FRAMEFLAGS_ALTREF = 4,
-  } FRAMETYPE_FLAGS;
-
-
-#include <assert.h>
-  static __inline void Scale2Ratio(int mode, int *hr, int *hs) {
-    switch (mode) {
-      case NORMAL:
-        *hr = 1;
-        *hs = 1;
-        break;
-      case FOURFIVE:
-        *hr = 4;
-        *hs = 5;
-        break;
-      case THREEFIVE:
-        *hr = 3;
-        *hs = 5;
-        break;
-      case ONETWO:
-        *hr = 1;
-        *hs = 2;
-        break;
-      default:
-        *hr = 1;
-        *hs = 1;
-        assert(0);
-        break;
-    }
-  }
-
-  typedef struct {
-    int Version;            // 4 versions of bitstream defined: 0 best quality/slowest decode, 3 lowest quality/fastest decode
-    int Width;              // width of data passed to the compressor
-    int Height;             // height of data passed to the compressor
-    double frame_rate;      // set to passed in framerate
-    int target_bandwidth;   // bandwidth to be used in kilobits per second
-
-    int noise_sensitivity;  // parameter used for applying pre processing blur: recommendation 0
-    int Sharpness;          // parameter used for sharpening output: recommendation 0:
-    int cpu_used;
-    unsigned int rc_max_intra_bitrate_pct;
-
-    // mode ->
-    // (0)=Realtime/Live Encoding. This mode is optimized for realtime
-    //     encoding (for example, capturing a television signal or feed from
-    //     a live camera). ( speed setting controls how fast )
-    // (1)=Good Quality Fast Encoding. The encoder balances quality with the
-    //     amount of time it takes to encode the output.
-    //     ( speed setting controls how fast )
-    // (2)=One Pass - Best Quality. The encoder places priority on the
-    //     quality of the output over encoding speed. The output is compressed
-    //     at the highest possible quality. This option takes the longest
-    //     amount of time to encode. ( speed setting ignored )
-    // (3)=Two Pass - First Pass. The encoder generates a file of statistics
-    //     for use in the second encoding pass.
-    //     ( speed setting controls how fast )
-    // (4)=Two Pass - Second Pass. The encoder uses the statistics that were
-    //     generated in the first encoding pass to create the compressed
-    //     output. ( speed setting controls how fast )
-    // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that
-    //     were generated in the first encoding pass to create the compressed
-    //     output using the highest possible quality, and taking a longer
-    //     amount of time to encode. ( speed setting ignored )
-    int Mode;
-
-    // Key Framing Operations
-    int auto_key;  // automatically detect cut scenes and set the keyframes
-    int key_freq;  // maximum distance to key frame.
-
-    int allow_lag;      // allow lagged compression (if 0 lag_in_frames is ignored)
-    int lag_in_frames;  // how many frames lag before we start encoding
-
-    // ----------------------------------------------------------------
-    // DATARATE CONTROL OPTIONS
-
-    int end_usage;  // vbr or cbr
-
-    // buffer targeting aggressiveness
-    int under_shoot_pct;
-    int over_shoot_pct;
-
-    // buffering parameters
-    int starting_buffer_level;  // in seconds
-    int optimal_buffer_level;
-    int maximum_buffer_size;
-
-    // controlling quality
-    int fixed_q;
-    int worst_allowed_q;
-    int best_allowed_q;
-    int cq_level;
-    int lossless;
-
-    // two pass datarate control
-    int two_pass_vbrbias;  // two pass datarate control tweaks
-    int two_pass_vbrmin_section;
-    int two_pass_vbrmax_section;
-    // END DATARATE CONTROL OPTIONS
-    // ----------------------------------------------------------------
-
-
-    // these parameters aren't to be used in the final build; don't use!!!
-    int play_alternate;
-    int alt_freq;
-
-    int encode_breakout;  // early breakout encode threshold : for video conf recommend 800
-
-    int arnr_max_frames;
-    int arnr_strength;
-    int arnr_type;
-
-    struct vpx_fixed_buf two_pass_stats_in;
-    struct vpx_codec_pkt_list *output_pkt_list;
-
-    vp8e_tuning tuning;
-  } VP9_CONFIG;
-
-
-  void vp9_initialize_enc();
-
-  VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf);
-  void vp9_remove_compressor(VP9_PTR *comp);
-
-  void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
-
-// receive a frame's worth of data. The caller can assume that a copy of this
-// frame is made and not just a copy of the pointer.
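-// A minimal usage sketch of this interface (illustrative only; the real
-// callers live in the codec wrapper and handle errors, flags and buffer
-// sizing, and the stop condition below is an assumption to verify there):
-//
-//   VP9_CONFIG oxcf;                    // rate/size fields filled in first
-//   VP9_PTR enc = vp9_create_compressor(&oxcf);
-//   vp9_receive_raw_frame(enc, 0, &raw, ts, ts_end);  // frame is copied
-//   while (!vp9_get_compressed_data(enc, &fflags, &size, dest,
-//                                   &ts, &ts_end, 0)) {
-//     /* consume size bytes of compressed data from dest */
-//   }
-//   vp9_remove_compressor(&enc);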
- int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); - - int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags, - unsigned long *size, unsigned char *dest, - int64_t *time_stamp, int64_t *time_end, - int flush); - - int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); - - int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags); - - int vp9_update_reference(VP9_PTR comp, int ref_frame_flags); - - int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - - int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, - YV12_BUFFER_CONFIG *sd); - - int vp9_update_entropy(VP9_PTR comp, int update); - - int vp9_set_roimap(VP9_PTR comp, unsigned char *map, - unsigned int rows, unsigned int cols, - int delta_q[4], int delta_lf[4], - unsigned int threshold[4]); - - int vp9_set_active_map(VP9_PTR comp, unsigned char *map, - unsigned int rows, unsigned int cols); - - int vp9_set_internal_size(VP9_PTR comp, - VPX_SCALING horiz_mode, VPX_SCALING vert_mode); - - int vp9_get_quantizer(VP9_PTR c); - -#ifdef __cplusplus -} -#endif - -#endif // __INC_ONYX_H diff --git a/vp9/common/onyxc_int.h b/vp9/common/onyxc_int.h deleted file mode 100644 index 27a6ca132..000000000 --- a/vp9/common/onyxc_int.h +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_ONYXC_INT_H -#define __INC_ONYXC_INT_H - -#include "vpx_config.h" -#include "vpx/internal/vpx_codec_internal.h" -#include "vp9_rtcd.h" -#include "loopfilter.h" -#include "entropymv.h" -#include "entropy.h" -#include "entropymode.h" -#if CONFIG_POSTPROC -#include "postproc.h" -#endif - -/*#ifdef PACKET_TESTING*/ -#include "header.h" -/*#endif*/ - -/* Create/destroy static data structures. 
*/ - -void vp9_initialize_common(void); - -#define MINQ 0 - -#define MAXQ 255 -#define QINDEX_BITS 8 - -#define QINDEX_RANGE (MAXQ + 1) - -#define NUM_YV12_BUFFERS 4 - -#define COMP_PRED_CONTEXTS 2 - -typedef struct frame_contexts { - vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1]; - vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */ -#if CONFIG_SUPERBLOCKS - vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1]; -#endif - vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1]; - vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1]; - vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1]; - vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1]; - vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - - nmv_context nmvc; - nmv_context pre_nmvc; - vp9_prob pre_bmode_prob[VP9_NKF_BINTRAMODES - 1]; - vp9_prob pre_ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */ -#if CONFIG_SUPERBLOCKS - vp9_prob pre_sb_ymode_prob[VP9_I32X32_MODES - 1]; -#endif - vp9_prob pre_uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1]; - vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1]; - vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1]; - vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1]; - unsigned int bmode_counts[VP9_NKF_BINTRAMODES]; - unsigned int ymode_counts[VP9_YMODES]; /* interframe intra mode probs */ -#if CONFIG_SUPERBLOCKS - unsigned int sb_ymode_counts[VP9_I32X32_MODES]; -#endif - unsigned int uv_mode_counts[VP9_YMODES][VP9_UV_MODES]; - unsigned int i8x8_mode_counts[VP9_I8X8_MODES]; /* interframe intra probs */ - unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS]; - unsigned int mbsplit_counts[VP9_NUMMBSPLITS]; - - vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - - vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - - vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; - - unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; - unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; - - unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; - unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; - - unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; - unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] - [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; - - nmv_context_counts NMVcount; - vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] - 
[VP9_SWITCHABLE_FILTERS - 1]; -#if CONFIG_COMP_INTERINTRA_PRED - unsigned int interintra_counts[2]; - vp9_prob interintra_prob; - vp9_prob pre_interintra_prob; -#endif - - int mode_context[INTER_MODE_CONTEXTS][4]; - int mode_context_a[INTER_MODE_CONTEXTS][4]; - int vp9_mode_contexts[INTER_MODE_CONTEXTS][4]; - int mv_ref_ct[INTER_MODE_CONTEXTS][4][2]; -} FRAME_CONTEXT; - -typedef enum { - RECON_CLAMP_REQUIRED = 0, - RECON_CLAMP_NOTREQUIRED = 1 -} CLAMP_TYPE; - -typedef enum { - SINGLE_PREDICTION_ONLY = 0, - COMP_PREDICTION_ONLY = 1, - HYBRID_PREDICTION = 2, - NB_PREDICTION_TYPES = 3, -} COMPPREDMODE_TYPE; - -typedef enum { - ONLY_4X4 = 0, - ALLOW_8X8 = 1, - ALLOW_16X16 = 2, - TX_MODE_SELECT = 3, - NB_TXFM_MODES = 4, -} TXFM_MODE; - -typedef struct VP9_COMMON_RTCD { -#if CONFIG_RUNTIME_CPU_DETECT -#if CONFIG_POSTPROC - vp9_postproc_rtcd_vtable_t postproc; -#endif - int flags; -#else - int unused; -#endif -} VP9_COMMON_RTCD; - -typedef struct VP9Common { - struct vpx_internal_error_info error; - - DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]); - - int Width; - int Height; - int horiz_scale; - int vert_scale; - - YUV_TYPE clr_type; - CLAMP_TYPE clamp_type; - - YV12_BUFFER_CONFIG *frame_to_show; - - YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; - int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; - int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx; - - YV12_BUFFER_CONFIG post_proc_buffer; - YV12_BUFFER_CONFIG temp_scale_frame; - - - FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ - FRAME_TYPE frame_type; - - int show_frame; - - int frame_flags; - int MBs; - int mb_rows; - int mb_cols; - int mode_info_stride; - - /* profile settings */ - int experimental; - int mb_no_coeff_skip; - TXFM_MODE txfm_mode; - COMPPREDMODE_TYPE comp_pred_mode; - int no_lpf; - int use_bilinear_mc_filter; - int full_pixel; - - int base_qindex; - int last_kf_gf_q; /* Q used on the last GF or KF */ - - int y1dc_delta_q; - int y2dc_delta_q; - int y2ac_delta_q; - int uvdc_delta_q; - int uvac_delta_q; - - unsigned int frames_since_golden; - unsigned int frames_till_alt_ref_frame; - - /* We allocate a MODE_INFO struct for each macroblock, together with - an extra row on top and column on the left to simplify prediction. */ - - MODE_INFO *mip; /* Base of allocated array */ - MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ - MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ - MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ - - - // Persistent mb segment id map used in prediction. 
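-  // (One byte per macroblock in raster order, mb_rows * mb_cols entries;
-  // when temporal_update is set, segment ids are predicted from this map
-  // via segment_pred_probs below instead of being coded directly.)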
- unsigned char *last_frame_seg_map; - - INTERPOLATIONFILTERTYPE mcomp_filter_type; - LOOPFILTERTYPE filter_type; - - loop_filter_info_n lf_info; - - int filter_level; - int last_sharpness_level; - int sharpness_level; - - int refresh_last_frame; /* Two state 0 = NO, 1 = YES */ - int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */ - int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */ - - int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */ - int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */ - - int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */ - - int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ - - /* Y,U,V,Y2 */ - ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */ - ENTROPY_CONTEXT_PLANES left_context[2]; /* (up to) 4 contexts "" */ - - /* keyframe block modes are predicted by their above, left neighbors */ - - vp9_prob kf_bmode_prob[VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES] - [VP9_KF_BINTRAMODES - 1]; - vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */ -#if CONFIG_SUPERBLOCKS - vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1]; -#endif - int kf_ymode_probs_index; - int kf_ymode_probs_update; - vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1]; - - vp9_prob prob_intra_coded; - vp9_prob prob_last_coded; - vp9_prob prob_gf_coded; -#if CONFIG_SUPERBLOCKS - vp9_prob sb_coded; -#endif - - // Context probabilities when using predictive coding of segment id - vp9_prob segment_pred_probs[PREDICTION_PROBS]; - unsigned char temporal_update; - - // Context probabilities for reference frame prediction - unsigned char ref_scores[MAX_REF_FRAMES]; - vp9_prob ref_pred_probs[PREDICTION_PROBS]; - vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS]; - - vp9_prob prob_comppred[COMP_PRED_CONTEXTS]; - - // FIXME contextualize - vp9_prob prob_tx[TX_SIZE_MAX - 1]; - - vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS]; - - FRAME_CONTEXT lfc_a; /* last alt ref entropy */ - FRAME_CONTEXT lfc; /* last frame entropy */ - FRAME_CONTEXT fc; /* this frame entropy */ - - unsigned int current_video_frame; - int near_boffset[3]; - int version; - -#ifdef PACKET_TESTING - VP9_HEADER oh; -#endif - double bitrate; - double framerate; - -#if CONFIG_RUNTIME_CPU_DETECT - VP9_COMMON_RTCD rtcd; -#endif - -#if CONFIG_POSTPROC - struct postproc_state postproc_state; -#endif - -#if CONFIG_PRED_FILTER - /* Prediction filter variables */ - int pred_filter_mode; // 0=disabled at the frame level (no MB filtered) - // 1=enabled at the frame level (all MB filtered) - // 2=specified per MB (1=filtered, 0=non-filtered) - vp9_prob prob_pred_filter_off; -#endif -#if CONFIG_COMP_INTERINTRA_PRED - int use_interintra; -#endif - -} VP9_COMMON; - -#endif // __INC_ONYX_INT_H diff --git a/vp9/common/onyxd.h b/vp9/common/onyxd.h deleted file mode 100644 index 7b7662b3c..000000000 --- a/vp9/common/onyxd.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_ONYXD_H -#define __INC_ONYXD_H - - -/* Create/destroy static data structures. 
*/
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-#include "type_aliases.h"
-#include "vpx_scale/yv12config.h"
-#include "ppflags.h"
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_codec.h"
-
-  typedef void *VP9D_PTR;
-  typedef struct {
-    int Width;
-    int Height;
-    int Version;
-    int postprocess;
-    int max_threads;
-    int input_partition;
-  } VP9D_CONFIG;
-  typedef enum {
-    VP9_LAST_FLAG = 1,
-    VP9_GOLD_FLAG = 2,
-    VP9_ALT_FLAG = 4
-  } VP9_REFFRAME;
-
-  void vp9_initialize_dec(void);
-
-  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
-                                  const unsigned char **dest,
-                                  int64_t time_stamp);
-
-  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
-                        int64_t *time_stamp, int64_t *time_end_stamp,
-                        vp9_ppflags_t *flags);
-
-  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
-
-  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
-                                        VP9_REFFRAME ref_frame_flag,
-                                        YV12_BUFFER_CONFIG *sd);
-
-  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
-
-  void vp9_remove_decompressor(VP9D_PTR comp);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // __INC_ONYXD_H
diff --git a/vp9/common/postproc.c b/vp9/common/postproc.c
deleted file mode 100644
index 4c5748e7a..000000000
--- a/vp9/common/postproc.c
+++ /dev/null
@@ -1,1031 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "vpx_scale/yv12config.h"
-#include "postproc.h"
-#include "vp9/common/textblit.h"
-#include "vpx_scale/vpxscale.h"
-#include "systemdependent.h"
-
-#include <math.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#define RGB_TO_YUV(t) \
-  ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \
-    (0.098*(float)(t & 0xff)) + 16), \
-  (-(0.148*(float)(t >> 16)) - (0.291*(float)(t >> 8 & 0xff)) + \
-    (0.439*(float)(t & 0xff)) + 128), \
-  ( (0.439*(float)(t >> 16)) - (0.368*(float)(t >> 8 & 0xff)) - \
-    (0.071*(float)(t & 0xff)) + 128)
-
-/* global constants */
-#if CONFIG_POSTPROC_VISUALIZER
-static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
-  { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
-  { RGB_TO_YUV(0x00FF00) },   /* Green */
-  { RGB_TO_YUV(0xADFF2F) },   /* GreenYellow */
-  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
-  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
-  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
-  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
-  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
-  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
-  { RGB_TO_YUV(0x228B22) },   /* ForestGreen */
-  { RGB_TO_YUV(0x006400) },   /* DarkGreen */
-  { RGB_TO_YUV(0x98F5FF) },   /* Cadet Blue */
-  { RGB_TO_YUV(0x6CA6CD) },   /* Sky Blue */
-  { RGB_TO_YUV(0x00008B) },   /* Dark blue */
-  { RGB_TO_YUV(0x551A8B) },   /* Purple */
-  { RGB_TO_YUV(0xFF0000) },   /* Red */
-  { RGB_TO_YUV(0xCC33FF) },   /* Magenta */
-};
-
-static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
-  { RGB_TO_YUV(0x6633ff) },   /* Purple */
-  { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
-  { RGB_TO_YUV(0xff33cc) },   /* Pink */
-  { RGB_TO_YUV(0xff3366) },   /* Coral */
-  { RGB_TO_YUV(0x3366ff) },   /* Blue */
-  { RGB_TO_YUV(0xed00f5) },   /* Dark Blue */
-  { RGB_TO_YUV(0x2e00b8) },   /* Dark Purple */
-  { RGB_TO_YUV(0xff6633) },   /*
Orange */ - { RGB_TO_YUV(0x33ccff) }, /* Light Blue */ - { RGB_TO_YUV(0x8ab800) }, /* Green */ - { RGB_TO_YUV(0xffcc33) }, /* Light Orange */ - { RGB_TO_YUV(0x33ffcc) }, /* Aqua */ - { RGB_TO_YUV(0x66ff33) }, /* Light Green */ - { RGB_TO_YUV(0xccff33) }, /* Yellow */ -}; - -static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = { - { RGB_TO_YUV(0x00ff00) }, /* Blue */ - { RGB_TO_YUV(0x0000ff) }, /* Green */ - { RGB_TO_YUV(0xffff00) }, /* Yellow */ - { RGB_TO_YUV(0xff0000) }, /* Red */ -}; -#endif - -static const short kernel5[] = { - 1, 1, 4, 1, 1 -}; - -const short vp9_rv[] = { - 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, - 0, 3, 9, 0, 0, 0, 8, 3, 14, 4, - 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, - 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, - 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, - 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, - 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, - 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, - 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, - 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, - 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, - 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, - 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, - 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, - 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, - 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, - 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, - 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, - 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, - 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, - 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, - 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, - 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, - 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, - 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, - 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, - 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, - 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, - 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, - 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, - 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, - 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, - 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, - 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, - 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, - 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, - 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, - 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, - 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, - 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, - 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, - 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, -}; - - -/**************************************************************************** - */ -void vp9_post_proc_down_and_across_c(unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit) { - unsigned char *p_src, *p_dst; - int row; - int col; - int i; - int v; - int pitch = src_pixels_per_line; - unsigned char d[8]; - (void)dst_pixels_per_line; - - for (row = 0; row < rows; row++) { - /* post_proc_down for one row */ - p_src = src_ptr; - p_dst = dst_ptr; - - for (col = 0; col < cols; col++) { - - int kernel = 4; - int v = p_src[col]; - - for (i = -2; i <= 2; i++) { - if (abs(v - p_src[col + i * pitch]) > flimit) - goto down_skip_convolve; - - kernel += kernel5[2 + i] * p_src[col + i * pitch]; - } - - v = (kernel >> 3); - down_skip_convolve: - p_dst[col] = v; - } - - /* now post_proc_across */ - p_src = dst_ptr; - p_dst = dst_ptr; - - for (i = 0; i < 8; i++) - d[i] = p_src[i]; - - for (col = 0; col < cols; col++) { - int kernel = 4; - v = p_src[col]; - - d[col & 7] = v; - - for (i = -2; i <= 2; i++) { - if (abs(v - p_src[col + i]) > flimit) - goto across_skip_convolve; - - kernel += kernel5[2 + i] * p_src[col + i]; - } - - d[col & 7] = (kernel >> 3); - across_skip_convolve: - - if (col >= 2) - p_dst[col - 2] = d[(col - 2) & 7]; - } - - /* handle the last two pixels */ - p_dst[col - 2] = d[(col - 
2) & 7]; - p_dst[col - 1] = d[(col - 1) & 7]; - - - /* next row */ - src_ptr += pitch; - dst_ptr += pitch; - } -} - -static int q2mbl(int x) { - if (x < 20) x = 20; - - x = 50 + (x - 50) * 10 / 8; - return x * x / 3; -} - -void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, - int rows, int cols, int flimit) { - int r, c, i; - - unsigned char *s = src; - unsigned char d[16]; - - - for (r = 0; r < rows; r++) { - int sumsq = 0; - int sum = 0; - - for (i = -8; i <= 6; i++) { - sumsq += s[i] * s[i]; - sum += s[i]; - d[i + 8] = 0; - } - - for (c = 0; c < cols + 8; c++) { - int x = s[c + 7] - s[c - 8]; - int y = s[c + 7] + s[c - 8]; - - sum += x; - sumsq += x * y; - - d[c & 15] = s[c]; - - if (sumsq * 15 - sum * sum < flimit) { - d[c & 15] = (8 + sum + s[c]) >> 4; - } - - s[c - 8] = d[(c - 8) & 15]; - } - - s += pitch; - } -} - -void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, - int rows, int cols, int flimit) { - int r, c, i; - const short *rv3 = &vp9_rv[63 & rand()]; - - for (c = 0; c < cols; c++) { - unsigned char *s = &dst[c]; - int sumsq = 0; - int sum = 0; - unsigned char d[16]; - const short *rv2 = rv3 + ((c * 17) & 127); - - for (i = -8; i <= 6; i++) { - sumsq += s[i * pitch] * s[i * pitch]; - sum += s[i * pitch]; - } - - for (r = 0; r < rows + 8; r++) { - sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch]; - sum += s[7 * pitch] - s[-8 * pitch]; - d[r & 15] = s[0]; - - if (sumsq * 15 - sum * sum < flimit) { - d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; - } - - s[-8 * pitch] = d[(r - 8) & 15]; - s += pitch; - } - } -} - -static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, - int q, - int low_var_thresh, - int flag, - vp9_postproc_rtcd_vtable_t *rtcd) { - double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; - int ppl = (int)(level + .5); - (void) low_var_thresh; - (void) flag; - - POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, - source->y_stride, post->y_stride, - source->y_height, source->y_width, ppl); - POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, - post->y_height, post->y_width, q2mbl(q)); - POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, - post->y_height, post->y_width, q2mbl(q)); - - POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, - source->uv_stride, post->uv_stride, - source->uv_height, source->uv_width, ppl); - POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, - source->uv_stride, post->uv_stride, - source->uv_height, source->uv_width, ppl); -} - -void vp9_deblock(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, - int q, - int low_var_thresh, - int flag, - vp9_postproc_rtcd_vtable_t *rtcd) { - double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; - int ppl = (int)(level + .5); - (void) low_var_thresh; - (void) flag; - - POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, - source->y_stride, post->y_stride, - source->y_height, source->y_width, ppl); - POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, - source->uv_stride, post->uv_stride, - source->uv_height, source->uv_width, ppl); - POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, - source->uv_stride, post->uv_stride, - source->uv_height, source->uv_width, ppl); -} - -void vp9_de_noise(YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *post, - int q, - int low_var_thresh, - int flag, - vp9_postproc_rtcd_vtable_t *rtcd) { - double level = 6.0e-05 * q * q * q - .0067 * q * q 
+ .306 * q + .0065; - int ppl = (int)(level + .5); - (void) post; - (void) low_var_thresh; - (void) flag; - - POSTPROC_INVOKE(rtcd, downacross)(src->y_buffer + 2 * src->y_stride + 2, - src->y_buffer + 2 * src->y_stride + 2, - src->y_stride, - src->y_stride, - src->y_height - 4, - src->y_width - 4, - ppl); - POSTPROC_INVOKE(rtcd, downacross)(src->u_buffer + 2 * src->uv_stride + 2, - src->u_buffer + 2 * src->uv_stride + 2, - src->uv_stride, - src->uv_stride, - src->uv_height - 4, - src->uv_width - 4, ppl); - POSTPROC_INVOKE(rtcd, downacross)(src->v_buffer + 2 * src->uv_stride + 2, - src->v_buffer + 2 * src->uv_stride + 2, - src->uv_stride, - src->uv_stride, - src->uv_height - 4, - src->uv_width - 4, ppl); -} - -double vp9_gaussian(double sigma, double mu, double x) { - return 1 / (sigma * sqrt(2.0 * 3.14159265)) * - (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); -} - -static void fillrd(struct postproc_state *state, int q, int a) { - char char_dist[300]; - - double sigma; - int ai = a, qi = q, i; - - vp9_clear_system_state(); - - sigma = ai + .5 + .6 * (63 - qi) / 63.0; - - /* set up a lookup table of 256 entries that matches - * a gaussian distribution with sigma determined by q. - */ - { - double i; - int next, j; - - next = 0; - - for (i = -32; i < 32; i++) { - int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i)); - - if (a) { - for (j = 0; j < a; j++) { - char_dist[next + j] = (char) i; - } - - next = next + j; - } - - } - - for (next = next; next < 256; next++) - char_dist[next] = 0; - } - - for (i = 0; i < 3072; i++) { - state->noise[i] = char_dist[rand() & 0xff]; - } - - for (i = 0; i < 16; i++) { - state->blackclamp[i] = -char_dist[0]; - state->whiteclamp[i] = -char_dist[0]; - state->bothclamp[i] = -2 * char_dist[0]; - } - - state->last_q = q; - state->last_noise = a; -} - -/**************************************************************************** - * - * ROUTINE : plane_add_noise_c - * - * INPUTS : unsigned char *Start starting address of buffer to - * add gaussian noise to - * unsigned int Width width of plane - * unsigned int Height height of plane - * int Pitch distance between subsequent lines of frame - * int q quantizer used to determine amount of noise - * to add - * - * OUTPUTS : None. - * - * RETURNS : void. - * - * FUNCTION : adds gaussian noise to a plane of pixels - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ -void vp9_plane_add_noise_c(unsigned char *Start, char *noise, - char blackclamp[16], - char whiteclamp[16], - char bothclamp[16], - unsigned int Width, unsigned int Height, int Pitch) { - unsigned int i, j; - - for (i = 0; i < Height; i++) { - unsigned char *Pos = Start + i * Pitch; - char *Ref = (char *)(noise + (rand() & 0xff)); - - for (j = 0; j < Width; j++) { - if (Pos[j] < blackclamp[0]) - Pos[j] = blackclamp[0]; - - if (Pos[j] > 255 + whiteclamp[0]) - Pos[j] = 255 + whiteclamp[0]; - - Pos[j] += Ref[j]; - } - } -} - -/* Blend the macro block with a solid colored square. Leave the - * edges unblended to give distinction to macro blocks in areas - * filled with the same color block. 
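- *
- * The blend itself is a Q16 fixed point lerp: each output sample is
- * (pix * alpha + colour * (65536 - alpha)) >> 16, so the 0xc000 alpha
- * used by the visualizer keeps 3/4 of the source pixel and mixes in
- * 1/4 of the overlay colour.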
- */ -void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - y += 2 * stride + 2; - for (i = 0; i < 12; i++) { - for (j = 0; j < 12; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - u += stride + 1; - v += stride + 1; - - for (i = 0; i < 6; i++) { - for (j = 0; j < 6; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -/* Blend only the edge of the macro block. Leave center - * unblended to allow for other visualizations to be layered. - */ -void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - for (i = 0; i < 2; i++) { - for (j = 0; j < 16; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - for (i = 0; i < 12; i++) { - y[0] = (y[0] * alpha + y1_const) >> 16; - y[1] = (y[1] * alpha + y1_const) >> 16; - y[14] = (y[14] * alpha + y1_const) >> 16; - y[15] = (y[15] * alpha + y1_const) >> 16; - y += stride; - } - - for (i = 0; i < 2; i++) { - for (j = 0; j < 16; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (j = 0; j < 8; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - - for (i = 0; i < 6; i++) { - u[0] = (u[0] * alpha + u1_const) >> 16; - v[0] = (v[0] * alpha + v1_const) >> 16; - - u[7] = (u[7] * alpha + u1_const) >> 16; - v[7] = (v[7] * alpha + v1_const) >> 16; - - u += stride; - v += stride; - } - - for (j = 0; j < 8; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } -} - -void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y1, int u1, int v1, int alpha, int stride) { - int i, j; - int y1_const = y1 * ((1 << 16) - alpha); - int u1_const = u1 * ((1 << 16) - alpha); - int v1_const = v1 * ((1 << 16) - alpha); - - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -static void constrain_line(int x0, int *x1, int y0, int *y1, - int width, int height) { - int dx; - int dy; - - if (*x1 > width) { - dx = *x1 - x0; - dy = *y1 - y0; - - *x1 = width; - if (dx) - *y1 = ((width - x0) * dy) / dx + y0; - } - if (*x1 < 0) { - dx = *x1 - x0; - dy = *y1 - y0; - - *x1 = 0; - if (dx) - *y1 = ((0 - x0) * dy) / dx + y0; - } - if (*y1 > height) { - dx = *x1 - x0; - dy = *y1 - y0; - - *y1 = height; - if (dy) - *x1 = ((height - y0) * dx) / dy + x0; - } - if (*y1 < 0) { - dx = *x1 - x0; - dy = *y1 - y0; - - *y1 = 0; - if (dy) - *x1 = ((0 - y0) * dx) / dy + x0; - } -} - - -#if CONFIG_RUNTIME_CPU_DETECT -#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc) -#else -#define RTCD_VTABLE(oci) NULL -#endif - -int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *ppflags) { - int q = oci->filter_level * 10 / 6; - int flags = ppflags->post_proc_flag; - 
int deblock_level = ppflags->deblocking_level; - int noise_level = ppflags->noise_level; - - if (!oci->frame_to_show) - return -1; - - if (q > 63) - q = 63; - - if (!flags) { - *dest = *oci->frame_to_show; - - /* handle problem with extending borders */ - dest->y_width = oci->Width; - dest->y_height = oci->Height; - dest->uv_height = dest->y_height / 2; - return 0; - - } - -#if ARCH_X86||ARCH_X86_64 - vpx_reset_mmx_state(); -#endif - - if (flags & VP9D_DEMACROBLOCK) { - deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer, - q + (deblock_level - 5) * 10, 1, 0, - RTCD_VTABLE(oci)); - } else if (flags & VP9D_DEBLOCK) { - vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, - q, 1, 0, RTCD_VTABLE(oci)); - } else { - vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer); - } - - if (flags & VP9D_ADDNOISE) { - if (oci->postproc_state.last_q != q - || oci->postproc_state.last_noise != noise_level) { - fillrd(&oci->postproc_state, 63 - q, noise_level); - } - - POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)(oci->post_proc_buffer.y_buffer, - oci->postproc_state.noise, - oci->postproc_state.blackclamp, - oci->postproc_state.whiteclamp, - oci->postproc_state.bothclamp, - oci->post_proc_buffer.y_width, - oci->post_proc_buffer.y_height, - oci->post_proc_buffer.y_stride); - } - -#if CONFIG_POSTPROC_VISUALIZER - if (flags & VP9D_DEBUG_TXT_FRAME_INFO) { - char message[512]; - sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d", - (oci->frame_type == KEY_FRAME), - oci->refresh_golden_frame, - oci->base_qindex, - oci->filter_level, - flags, - oci->mb_cols, oci->mb_rows); - vp9_blit_text(message, oci->post_proc_buffer.y_buffer, - oci->post_proc_buffer.y_stride); - } - - if (flags & VP9D_DEBUG_TXT_MBLK_MODES) { - int i, j; - unsigned char *y_ptr; - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = oci->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp9_filter each macro block */ - for (i = 0; i < mb_rows; i++) { - for (j = 0; j < mb_cols; j++) { - char zz[4]; - - sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a'); - - vp9_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - - } - } - - if (flags & VP9D_DEBUG_TXT_DC_DIFF) { - int i, j; - unsigned char *y_ptr; - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int mb_rows = post->y_height >> 4; - int mb_cols = post->y_width >> 4; - int mb_index = 0; - MODE_INFO *mi = oci->mi; - - y_ptr = post->y_buffer + 4 * post->y_stride + 4; - - /* vp9_filter each macro block */ - for (i = 0; i < mb_rows; i++) { - for (j = 0; j < mb_cols; j++) { - char zz[4]; - int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED && - mi[mb_index].mbmi.mode != SPLITMV && - mi[mb_index].mbmi.mb_skip_coeff); - - if (oci->frame_type == KEY_FRAME) - sprintf(zz, "a"); - else - sprintf(zz, "%c", dc_diff + '0'); - - vp9_blit_text(zz, y_ptr, post->y_stride); - mb_index++; - y_ptr += 16; - } - - mb_index++; /* border */ - y_ptr += post->y_stride * 16 - post->y_width; - - } - } - - if (flags & VP9D_DEBUG_TXT_RATE_INFO) { - char message[512]; - snprintf(message, sizeof(message), - "Bitrate: %10.2f frame_rate: %10.2f ", - oci->bitrate, oci->framerate); - vp9_blit_text(message, oci->post_proc_buffer.y_buffer, - oci->post_proc_buffer.y_stride); - } - - /* Draw motion vectors */ - if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) { - YV12_BUFFER_CONFIG 
*post = &oci->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - unsigned char *y_buffer = oci->post_proc_buffer.y_buffer; - int y_stride = oci->post_proc_buffer.y_stride; - MODE_INFO *mi = oci->mi; - int x0, y0; - - for (y0 = 0; y0 < height; y0 += 16) { - for (x0 = 0; x0 < width; x0 += 16) { - int x1, y1; - - if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) { - mi++; - continue; - } - - if (mi->mbmi.mode == SPLITMV) { - switch (mi->mbmi.partitioning) { - case PARTITIONING_16X8 : { /* mv_top_bottom */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 8 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride); - - break; - } - case PARTITIONING_8X16 : { /* mv_left_right */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 8 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride); - - break; - } - case PARTITIONING_8X8 : { /* mv_quarters */ - union b_mode_info *bmi = &mi->bmi[0]; - MV *mv = &bmi->mv.as_mv; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[2]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 4 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride); - - bmi = &mi->bmi[8]; - - x1 = x0 + 4 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride); - - bmi = &mi->bmi[10]; - - x1 = x0 + 12 + (mv->col >> 3); - y1 = y0 + 12 + (mv->row >> 3); - - constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height); - vp9_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride); - break; - } - case PARTITIONING_4X4: - default : { - union b_mode_info *bmi = mi->bmi; - int bx0, by0; - - for (by0 = y0; by0 < (y0 + 16); by0 += 4) { - for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) { - MV *mv = &bmi->mv.as_mv; - - x1 = bx0 + 2 + (mv->col >> 3); - y1 = by0 + 2 + (mv->row >> 3); - - constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height); - vp9_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride); - - bmi++; - } - } - } - } - } else if (mi->mbmi.mode >= NEARESTMV) { - MV *mv = &mi->mbmi.mv.as_mv; - const int lx0 = x0 + 8; - const int ly0 = y0 + 8; - - x1 = lx0 + (mv->col >> 3); - y1 = ly0 + (mv->row >> 3); - - if (x1 != lx0 && y1 != ly0) { - constrain_line(lx0, &x1, ly0 - 1, &y1, width, height); - vp9_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride); - - constrain_line(lx0, &x1, ly0 + 1, &y1, width, height); - vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride); - } else - vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride); - } - - mi++; - } - mi++; - } - } - - /* Color in block modes */ - if ((flags & 
VP9D_DEBUG_CLR_BLK_MODES) - && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) { - int y, x; - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - unsigned char *y_ptr = oci->post_proc_buffer.y_buffer; - unsigned char *u_ptr = oci->post_proc_buffer.u_buffer; - unsigned char *v_ptr = oci->post_proc_buffer.v_buffer; - int y_stride = oci->post_proc_buffer.y_stride; - MODE_INFO *mi = oci->mi; - - for (y = 0; y < height; y += 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (mi->mbmi.mode == B_PRED && - ((ppflags->display_mb_modes_flag & B_PRED) || - ppflags->display_b_modes_flag)) { - int by, bx; - unsigned char *yl, *ul, *vl; - union b_mode_info *bmi = mi->bmi; - - yl = y_ptr + x; - ul = u_ptr + (x >> 1); - vl = v_ptr + (x >> 1); - - for (by = 0; by < 16; by += 4) { - for (bx = 0; bx < 16; bx += 4) { - if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) - || (ppflags->display_mb_modes_flag & B_PRED)) { - Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0]; - U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1]; - V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2]; - - POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)(yl + bx, - ul + (bx >> 1), - vl + (bx >> 1), - Y, U, V, - 0xc000, y_stride); - } - bmi++; - } - - yl += y_stride * 4; - ul += y_stride * 1; - vl += y_stride * 1; - } - } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) { - Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0]; - U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1]; - V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2]; - - POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)(y_ptr + x, - u_ptr + (x >> 1), - v_ptr + (x >> 1), - Y, U, V, - 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } - - /* Color in frame reference blocks */ - if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) && - ppflags->display_ref_frame_flag) { - int y, x; - YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; - int width = post->y_width; - int height = post->y_height; - unsigned char *y_ptr = oci->post_proc_buffer.y_buffer; - unsigned char *u_ptr = oci->post_proc_buffer.u_buffer; - unsigned char *v_ptr = oci->post_proc_buffer.v_buffer; - int y_stride = oci->post_proc_buffer.y_stride; - MODE_INFO *mi = oci->mi; - - for (y = 0; y < height; y += 16) { - for (x = 0; x < width; x += 16) { - int Y = 0, U = 0, V = 0; - - if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) { - Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; - U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; - V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; - - POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)(y_ptr + x, - u_ptr + (x >> 1), - v_ptr + (x >> 1), - Y, U, V, - 0xc000, y_stride); - } - - mi++; - } - y_ptr += y_stride * 16; - u_ptr += y_stride * 4; - v_ptr += y_stride * 4; - - mi++; - } - } -#endif - - *dest = oci->post_proc_buffer; - - /* handle problem with extending borders */ - dest->y_width = oci->Width; - dest->y_height = oci->Height; - dest->uv_height = dest->y_height / 2; - - return 0; -} diff --git a/vp9/common/postproc.h b/vp9/common/postproc.h deleted file mode 100644 index f43a8f08a..000000000 --- a/vp9/common/postproc.h +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef POSTPROC_H -#define POSTPROC_H - -#define prototype_postproc_inplace(sym)\ - void sym(unsigned char *dst, int pitch, int rows, int cols, int flimit) - -#define prototype_postproc(sym)\ - void sym(unsigned char *src, unsigned char *dst, int src_pitch, \ - int dst_pitch, int rows, int cols, int flimit) - -#define prototype_postproc_addnoise(sym) \ - void sym(unsigned char *s, char *noise, char blackclamp[16], \ - char whiteclamp[16], char bothclamp[16], \ - unsigned int w, unsigned int h, int pitch) - -#define prototype_postproc_blend_mb_inner(sym)\ - void sym(unsigned char *y, unsigned char *u, unsigned char *v, \ - int y1, int u1, int v1, int alpha, int stride) - -#define prototype_postproc_blend_mb_outer(sym)\ - void sym(unsigned char *y, unsigned char *u, unsigned char *v, \ - int y1, int u1, int v1, int alpha, int stride) - -#define prototype_postproc_blend_b(sym)\ - void sym(unsigned char *y, unsigned char *u, unsigned char *v, \ - int y1, int u1, int v1, int alpha, int stride) - -#if ARCH_X86 || ARCH_X86_64 -#include "x86/postproc_x86.h" -#endif - -#ifndef vp9_postproc_down -#define vp9_postproc_down vp9_mbpost_proc_down_c -#endif -extern prototype_postproc_inplace(vp9_postproc_down); - -#ifndef vp9_postproc_across -#define vp9_postproc_across vp9_mbpost_proc_across_ip_c -#endif -extern prototype_postproc_inplace(vp9_postproc_across); - -#ifndef vp9_postproc_downacross -#define vp9_postproc_downacross vp9_post_proc_down_and_across_c -#endif -extern prototype_postproc(vp9_postproc_downacross); - -#ifndef vp9_postproc_addnoise -#define vp9_postproc_addnoise vp9_plane_add_noise_c -#endif -extern prototype_postproc_addnoise(vp9_postproc_addnoise); - -#ifndef vp9_postproc_blend_mb_inner -#define vp9_postproc_blend_mb_inner vp9_blend_mb_inner_c -#endif -extern prototype_postproc_blend_mb_inner(vp9_postproc_blend_mb_inner); - -#ifndef vp9_postproc_blend_mb_outer -#define vp9_postproc_blend_mb_outer vp9_blend_mb_outer_c -#endif -extern prototype_postproc_blend_mb_outer(vp9_postproc_blend_mb_outer); - -#ifndef vp9_postproc_blend_b -#define vp9_postproc_blend_b vp9_blend_b_c -#endif -extern prototype_postproc_blend_b(vp9_postproc_blend_b); - -typedef prototype_postproc((*vp9_postproc_fn_t)); -typedef prototype_postproc_inplace((*vp9_postproc_inplace_fn_t)); -typedef prototype_postproc_addnoise((*vp9_postproc_addnoise_fn_t)); -typedef prototype_postproc_blend_mb_inner((*vp9_postproc_blend_mb_inner_fn_t)); -typedef prototype_postproc_blend_mb_outer((*vp9_postproc_blend_mb_outer_fn_t)); -typedef prototype_postproc_blend_b((*vp9_postproc_blend_b_fn_t)); -typedef struct { - vp9_postproc_inplace_fn_t down; - vp9_postproc_inplace_fn_t across; - vp9_postproc_fn_t downacross; - vp9_postproc_addnoise_fn_t addnoise; - vp9_postproc_blend_mb_inner_fn_t blend_mb_inner; - vp9_postproc_blend_mb_outer_fn_t blend_mb_outer; - vp9_postproc_blend_b_fn_t blend_b; -} vp9_postproc_rtcd_vtable_t; - -#if CONFIG_RUNTIME_CPU_DETECT -#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn -#else -#define POSTPROC_INVOKE(ctx,fn) vp9_postproc_##fn -#endif - -#include "vpx_ports/mem.h" -struct postproc_state { - int last_q; - int last_noise; - char noise[3072]; - DECLARE_ALIGNED(16, char, 
blackclamp[16]); - DECLARE_ALIGNED(16, char, whiteclamp[16]); - DECLARE_ALIGNED(16, char, bothclamp[16]); -}; -#include "onyxc_int.h" -#include "ppflags.h" -int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); - - -void vp9_de_noise(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, - int q, - int low_var_thresh, - int flag, - vp9_postproc_rtcd_vtable_t *rtcd); - -void vp9_deblock(YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *post, - int q, - int low_var_thresh, - int flag, - vp9_postproc_rtcd_vtable_t *rtcd); -#endif diff --git a/vp9/common/ppc/copy_altivec.asm b/vp9/common/ppc/copy_altivec.asm deleted file mode 100644 index a4ce91583..000000000 --- a/vp9/common/ppc/copy_altivec.asm +++ /dev/null @@ -1,47 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl copy_mem16x16_ppc - -;# r3 unsigned char *src -;# r4 int src_stride -;# r5 unsigned char *dst -;# r6 int dst_stride - -;# Make the assumption that input will not be aligned, -;# but the output will be. So two reads and a perm -;# for the input, but only one store for the output. -copy_mem16x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xe000 - mtspr 256, r12 ;# set VRSAVE - - li r10, 16 - mtctr r10 - -cp_16x16_loop: - lvsl v0, 0, r3 ;# permutate value for alignment - - lvx v1, 0, r3 - lvx v2, r10, r3 - - vperm v1, v1, v2, v0 - - stvx v1, 0, r5 - - add r3, r3, r4 ;# increment source pointer - add r5, r5, r6 ;# increment destination pointer - - bdnz cp_16x16_loop - - mtspr 256, r11 ;# reset old VRSAVE - - blr diff --git a/vp9/common/ppc/filter_altivec.asm b/vp9/common/ppc/filter_altivec.asm deleted file mode 100644 index 4da2e94f9..000000000 --- a/vp9/common/ppc/filter_altivec.asm +++ /dev/null @@ -1,1013 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
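copy_mem16x16_ppc above is built on the classic AltiVec idiom for reading from an arbitrarily aligned pointer: lvsl derives a permute control from the pointer's low four bits, two lvx loads fetch the two aligned quadwords straddling the data, and vperm splices out the 16 wanted bytes. The same pattern recurs in every Read8x8 / load_and_align macro below. A sketch in C intrinsics (hypothetical helper name, assuming a VMX-capable compiler and <altivec.h>):

    #include <altivec.h>

    /* Load 16 bytes from a possibly misaligned address. */
    static vector unsigned char load_misaligned(const unsigned char *p) {
      vector unsigned char perm = vec_lvsl(0, p); /* permute control from p's low bits */
      vector unsigned char lo   = vec_ld(0, p);   /* aligned quadword containing p     */
      vector unsigned char hi   = vec_ld(15, p);  /* next aligned quadword             */
      return vec_perm(lo, hi, perm);              /* splice the two halves             */
    }

When p is already 16-byte aligned, both loads hit the same quadword and the permute is the identity, so the idiom is safe in all cases.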
-; - - - .globl sixtap_predict_ppc - .globl sixtap_predict8x4_ppc - .globl sixtap_predict8x8_ppc - .globl sixtap_predict16x16_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -.macro load_hfilter V0, V1 - load_c \V0, HFilter, r5, r9, r10 - - addi r5, r5, 16 - lvx \V1, r5, r10 -.endm - -;# Vertical filtering -.macro Vprolog - load_c v0, VFilter, r6, r3, r10 - - vspltish v5, 8 - vspltish v6, 3 - vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v1, v0, 1 - vspltb v2, v0, 2 - vspltb v3, v0, 3 - vspltb v4, v0, 4 - vspltb v5, v0, 5 - vspltb v0, v0, 0 -.endm - -.macro vpre_load - Vprolog - li r10, 16 - lvx v10, 0, r9 ;# v10..v14 = first 5 rows - lvx v11, r10, r9 - addi r9, r9, 32 - lvx v12, 0, r9 - lvx v13, r10, r9 - addi r9, r9, 32 - lvx v14, 0, r9 -.endm - -.macro Msum Re, Ro, V, T, TMP - ;# (Re,Ro) += (V*T) - vmuleub \TMP, \V, \T ;# trashes v8 - vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary - vmuloub \TMP, \V, \T - vadduhm \Ro, \Ro, \TMP ;# Ro = odds -.endm - -.macro vinterp_no_store P0 P1 P2 P3 P4 P5 - vmuleub v8, \P0, v0 ;# 64 + 4 positive taps - vadduhm v16, v6, v8 - vmuloub v8, \P0, v0 - vadduhm v17, v6, v8 - Msum v16, v17, \P2, v2, v8 - Msum v16, v17, \P3, v3, v8 - Msum v16, v17, \P5, v5, v8 - - vmuleub v18, \P1, v1 ;# 2 negative taps - vmuloub v19, \P1, v1 - Msum v18, v19, \P4, v4, v8 - - vsubuhs v16, v16, v18 ;# subtract neg from pos - vsubuhs v17, v17, v19 - vsrh v16, v16, v7 ;# divide by 128 - vsrh v17, v17, v7 ;# v16 v17 = evens, odds - vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order - vmrglh v19, v16, v17 - vpkuhus \P0, v18, v19 ;# P0 = 8-bit result -.endm - -.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5 - vmuleub v24, \P0, v13 ;# 64 + 4 positive taps - vadduhm v21, v20, v24 - vmuloub v24, \P0, v13 - vadduhm v22, v20, v24 - Msum v21, v22, \P2, v15, v25 - Msum v21, v22, \P3, v16, v25 - Msum v21, v22, \P5, v18, v25 - - vmuleub v23, \P1, v14 ;# 2 negative taps - vmuloub v24, \P1, v14 - Msum v23, v24, \P4, v17, v25 - - vsubuhs v21, v21, v23 ;# subtract neg from pos - vsubuhs v22, v22, v24 - vsrh v21, v21, v19 ;# divide by 128 - vsrh v22, v22, v19 ;# v16 v17 = evens, odds - vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order - vmrglh v24, v21, v22 - vpkuhus \P0, v23, v24 ;# P0 = 8-bit result -.endm - - -.macro Vinterp P0 P1 P2 P3 P4 P5 - vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5 - stvx \P0, 0, r7 - add r7, r7, r8 ;# 33 ops per 16 pels -.endm - - -.macro luma_v P0, P1, P2, P3, P4, P5 - addi r9, r9, 16 ;# P5 = newest input row - lvx \P5, 0, r9 - Vinterp \P0, \P1, \P2, \P3, \P4, \P5 -.endm - -.macro luma_vtwo - luma_v v10, v11, v12, v13, v14, v15 - luma_v v11, v12, v13, v14, v15, v10 -.endm - -.macro luma_vfour - luma_vtwo - luma_v v12, v13, v14, v15, v10, v11 - luma_v v13, v14, v15, v10, v11, v12 -.endm - -.macro luma_vsix - luma_vfour - luma_v v14, v15, v10, v11, v12, v13 - luma_v v15, v10, v11, v12, v13, v14 -.endm - -.macro Interp4 R I I4 - vmsummbm \R, v13, \I, v15 - vmsummbm \R, v14, \I4, \R -.endm - -.macro Read8x8 VD, RS, RP, increment_counter - lvsl v21, 0, \RS ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. 
- lvx \VD, 0, \RS - lvx v20, r10, \RS - -.if \increment_counter - add \RS, \RS, \RP -.endif - - vperm \VD, \VD, v20, v21 -.endm - -.macro interp_8x8 R - vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456 - vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A - Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3 - vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx - Interp4 v21, v21, \R ;# v21 = result 4 5 6 7 - - vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7 - vsrh \R, \R, v19 - - vpkuhus \R, \R, \R ;# saturate and pack - -.endm - -.macro Read4x4 VD, RS, RP, increment_counter - lvsl v21, 0, \RS ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. - lvx v20, 0, \RS - -.if \increment_counter - add \RS, \RS, \RP -.endif - - vperm \VD, v20, v20, v21 -.endm - .text - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -sixtap_predict_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xff87 - ori r12, r12, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - slwi. r5, r5, 5 ;# index into horizontal filter array - - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq- vertical_only_4x4 - - ;# load up horizontal filter - load_hfilter v13, v14 - - ;# rounding added in on the multiply - vspltisw v16, 8 - vspltisw v15, 3 - vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 - - ;# Load up permutation constants - load_c v16, B_0123, 0, r9, r10 - load_c v17, B_4567, 0, r9, r10 - load_c v18, B_89AB, 0, r9, r10 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - addi r9, r3, 0 - li r10, 16 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - - slwi. r6, r6, 4 ;# index into vertical filter array - - ;# filter a line - interp_8x8 v2 - interp_8x8 v3 - interp_8x8 v4 - interp_8x8 v5 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional 5 lines that are needed - ;# for the vertical filter. - beq- store_4x4 - - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r9, r9, r4 - sub r9, r9, r4 - - Read8x8 v0, r9, r4, 1 - Read8x8 v1, r9, r4, 0 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 0 - - interp_8x8 v0 - interp_8x8 v1 - interp_8x8 v6 - interp_8x8 v7 - interp_8x8 v8 - - b second_pass_4x4 - -vertical_only_4x4: - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - li r10, 16 - - Read8x8 v0, r3, r4, 1 - Read8x8 v1, r3, r4, 1 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 0 - - slwi r6, r6, 4 ;# index into vertical filter array - -second_pass_4x4: - load_c v20, b_hilo_4x4, 0, r9, r10 - load_c v21, b_hilo, 0, r9, r10 - - ;# reposition input so that it can go through the - ;# filtering phase with one pass. 
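The repositioning is what lets the whole 4x4 block go through one six-tap evaluation: rows 0-7 are packed four-per-vector into v0 and v4 (row 8 stays in v8), and each vsldoi below slides that 36-byte row buffer by four bytes, so shifted vector k holds rows k..k+3. The six "tap" vectors v0..v5 handed to vinterp_no_store_8x8 are thus six vertically adjacent 4-pel rows for all four output rows at once. A scalar sketch of the windowing (hypothetical layout):

    /* rows[36]: nine 4-pel rows stored contiguously (v0:v4:v8 in the asm).
     * Tap vector k is the 16-byte window at offset 4k, i.e. rows k..k+3,
     * which is exactly what each vsldoi below produces. */
    static const unsigned char *row_window(const unsigned char rows[36], int k) {
      return rows + 4 * k;   /* k in 0..5 */
    }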
- vperm v0, v0, v1, v20 ;# 0 1 x x - vperm v2, v2, v3, v20 ;# 2 3 x x - vperm v4, v4, v5, v20 ;# 4 5 x x - vperm v6, v6, v7, v20 ;# 6 7 x x - - vperm v0, v0, v2, v21 ;# 0 1 2 3 - vperm v4, v4, v6, v21 ;# 4 5 6 7 - - vsldoi v1, v0, v4, 4 - vsldoi v2, v0, v4, 8 - vsldoi v3, v0, v4, 12 - - vsldoi v5, v4, v8, 4 - - load_c v13, VFilter, r6, r9, r10 - - vspltish v15, 8 - vspltish v20, 3 - vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v14, v13, 1 - vspltb v15, v13, 2 - vspltb v16, v13, 3 - vspltb v17, v13, 4 - vspltb v18, v13, 5 - vspltb v13, v13, 0 - - vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 - - stvx v0, 0, r1 - - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - lwz r0, 4(r1) - stw r0, 0(r7) - add r7, r7, r8 - - lwz r0, 8(r1) - stw r0, 0(r7) - add r7, r7, r8 - - lwz r0, 12(r1) - stw r0, 0(r7) - - b exit_4x4 - -store_4x4: - - stvx v2, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v3, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v4, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v5, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - -exit_4x4: - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro w_8x8 V, D, R, P - stvx \V, 0, r1 - lwz \R, 0(r1) - stw \R, 0(r7) - lwz \R, 4(r1) - stw \R, 4(r7) - add \D, \D, \P -.endm - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch - -sixtap_predict8x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - slwi. r5, r5, 5 ;# index into horizontal filter array - - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq- second_pass_pre_copy_8x4 - - load_hfilter v13, v14 - - ;# rounding added in on the multiply - vspltisw v16, 8 - vspltisw v15, 3 - vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 - - ;# Load up permutation constants - load_c v16, B_0123, 0, r9, r10 - load_c v17, B_4567, 0, r9, r10 - load_c v18, B_89AB, 0, r9, r10 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - addi r9, r3, 0 - li r10, 16 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - - slwi. r6, r6, 4 ;# index into vertical filter array - - ;# filter a line - interp_8x8 v2 - interp_8x8 v3 - interp_8x8 v4 - interp_8x8 v5 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional 5 lines that are needed - ;# for the vertical filter. 
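The count of five extra lines follows directly from the tap length: a six-tap vertical filter producing H output rows consumes H+5 input rows, two above and three below the block. A scalar reference for the two-pass scheme these routines implement (names and the rounding placement are illustrative; the real code keeps the intermediate rows in vector registers or on the stack):

    static unsigned char clamp255(int v) {
      return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Separable six-tap prediction: H output rows need H+5 source rows. */
    static void sixtap_2pass(const unsigned char *src, int src_pitch,
                             unsigned char *dst, int dst_pitch,
                             int W, int H, const int hf[6], const int vf[6]) {
      unsigned char tmp[21][16];          /* (16+5) x 16 covers every size */
      int r, c, k, sum;
      src -= 2 * src_pitch + 2;           /* back off 2 rows and 2 pels    */
      for (r = 0; r < H + 5; r++)         /* pass 1: horizontal            */
        for (c = 0; c < W; c++) {
          for (sum = 64, k = 0; k < 6; k++)   /* +64 rounds; taps sum 128  */
            sum += hf[k] * src[r * src_pitch + c + k];
          tmp[r][c] = clamp255(sum >> 7);
        }
      for (r = 0; r < H; r++)             /* pass 2: vertical              */
        for (c = 0; c < W; c++) {
          for (sum = 64, k = 0; k < 6; k++)
            sum += vf[k] * tmp[r + k][c];
          dst[r * dst_pitch + c] = clamp255(sum >> 7);
        }
    }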
- beq- store_8x4 - - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r9, r9, r4 - sub r9, r9, r4 - - Read8x8 v0, r9, r4, 1 - Read8x8 v1, r9, r4, 0 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 0 - - interp_8x8 v0 - interp_8x8 v1 - interp_8x8 v6 - interp_8x8 v7 - interp_8x8 v8 - - b second_pass_8x4 - -second_pass_pre_copy_8x4: - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - li r10, 16 - - Read8x8 v0, r3, r4, 1 - Read8x8 v1, r3, r4, 1 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 1 - - slwi r6, r6, 4 ;# index into vertical filter array - -second_pass_8x4: - load_c v13, VFilter, r6, r9, r10 - - vspltish v15, 8 - vspltish v20, 3 - vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v14, v13, 1 - vspltb v15, v13, 2 - vspltb v16, v13, 3 - vspltb v17, v13, 4 - vspltb v18, v13, 5 - vspltb v13, v13, 0 - - vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 - vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 - vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 - vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x4 - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - - b exit_8x4 - -store_aligned_8x4: - - load_c v10, b_hilo, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - - b exit_8x4 - -store_8x4: - cmpi cr0, r8, 8 - beq cr0, store_aligned2_8x4 - - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - - b exit_8x4 - -store_aligned2_8x4: - load_c v10, b_hilo, 0, r9, r10 - - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - -exit_8x4: - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch - -;# Because the width that needs to be filtered will fit in a single altivec -;# register there is no need to loop. Everything can stay in registers. -sixtap_predict8x8_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - slwi. r5, r5, 5 ;# index into horizontal filter array - - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq- second_pass_pre_copy_8x8 - - load_hfilter v13, v14 - - ;# rounding added in on the multiply - vspltisw v16, 8 - vspltisw v15, 3 - vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 - - ;# Load up permutation constants - load_c v16, B_0123, 0, r9, r10 - load_c v17, B_4567, 0, r9, r10 - load_c v18, B_89AB, 0, r9, r10 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - addi r9, r3, 0 - li r10, 16 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 1 - Read8x8 v9, r3, r4, 1 - - slwi. 
r6, r6, 4 ;# index into vertical filter array - - ;# filter a line - interp_8x8 v2 - interp_8x8 v3 - interp_8x8 v4 - interp_8x8 v5 - interp_8x8 v6 - interp_8x8 v7 - interp_8x8 v8 - interp_8x8 v9 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional 5 lines that are needed - ;# for the vertical filter. - beq- store_8x8 - - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r9, r9, r4 - sub r9, r9, r4 - - Read8x8 v0, r9, r4, 1 - Read8x8 v1, r9, r4, 0 - Read8x8 v10, r3, r4, 1 - Read8x8 v11, r3, r4, 1 - Read8x8 v12, r3, r4, 0 - - interp_8x8 v0 - interp_8x8 v1 - interp_8x8 v10 - interp_8x8 v11 - interp_8x8 v12 - - b second_pass_8x8 - -second_pass_pre_copy_8x8: - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - li r10, 16 - - Read8x8 v0, r3, r4, 1 - Read8x8 v1, r3, r4, 1 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 1 - Read8x8 v9, r3, r4, 1 - Read8x8 v10, r3, r4, 1 - Read8x8 v11, r3, r4, 1 - Read8x8 v12, r3, r4, 0 - - slwi r6, r6, 4 ;# index into vertical filter array - -second_pass_8x8: - load_c v13, VFilter, r6, r9, r10 - - vspltish v15, 8 - vspltish v20, 3 - vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v14, v13, 1 - vspltb v15, v13, 2 - vspltb v16, v13, 3 - vspltb v17, v13, 4 - vspltb v18, v13, 5 - vspltb v13, v13, 0 - - vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 - vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 - vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 - vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 - vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9 - vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10 - vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11 - vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12 - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x8 - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - w_8x8 v6, r7, r0, r8 - w_8x8 v7, r7, r0, r8 - - b exit_8x8 - -store_aligned_8x8: - - load_c v10, b_hilo, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - vperm v6, v6, v7, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - addi r7, r7, 16 - stvx v6, 0, r7 - - b exit_8x8 - -store_8x8: - cmpi cr0, r8, 8 - beq cr0, store_aligned2_8x8 - - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - w_8x8 v6, r7, r0, r8 - w_8x8 v7, r7, r0, r8 - w_8x8 v8, r7, r0, r8 - w_8x8 v9, r7, r0, r8 - - b exit_8x8 - -store_aligned2_8x8: - load_c v10, b_hilo, 0, r9, r10 - - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - vperm v6, v6, v7, v10 - vperm v8, v8, v9, v10 - - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - addi r7, r7, 16 - stvx v6, 0, r7 - addi r7, r7, 16 - stvx v8, 0, r7 - -exit_8x8: - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch - -;# Two pass filtering. First pass is Horizontal edges, second pass is vertical -;# edges. One of the filters can be null, but both won't be. 
Needs to use a -;# temporary buffer because the source buffer can't be modified and the buffer -;# for the destination is not large enough to hold the temporary data. -sixtap_predict16x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xf000 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-416(r1) ;# create space on the stack - - ;# Three possiblities - ;# 1. First filter is null. Don't use a temp buffer. - ;# 2. Second filter is null. Don't use a temp buffer. - ;# 3. Neither are null, use temp buffer. - - ;# First Pass (horizontal edge) - ;# setup pointers for src - ;# if possiblity (1) then setup the src pointer to be the orginal and jump - ;# to second pass. this is based on if x_offset is 0. - - ;# load up horizontal filter - slwi. r5, r5, 5 ;# index into horizontal filter array - - load_hfilter v4, v5 - - beq- copy_horizontal_16x21 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - slwi. r6, r6, 4 ;# index into vertical filter array - - ;# setup constants - ;# v14 permutation value for alignment - load_c v14, b_hperm, 0, r9, r10 - - ;# These statements are guessing that there won't be a second pass, - ;# but if there is then inside the bypass they need to be set - li r0, 16 ;# prepare for no vertical filter - - ;# Change the output pointer and pitch to be the actual - ;# desination instead of a temporary buffer. - addi r9, r7, 0 - addi r5, r8, 0 - - ;# no vertical filter, so write the output from the first pass - ;# directly into the output buffer. - beq- no_vertical_filter_bypass - - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - - ;# setup counter for the number of lines that are going to be filtered - li r0, 21 - - ;# use the stack as temporary storage - la r9, 48(r1) - li r5, 16 - -no_vertical_filter_bypass: - - mtctr r0 - - ;# rounding added in on the multiply - vspltisw v10, 8 - vspltisw v12, 3 - vslw v12, v10, v12 ;# 0x00000040000000400000004000000040 - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v13, 7 - - ;# index to the next set of vectors in the row. - li r10, 16 - li r12, 32 - -horizontal_loop_16x16: - - lvsl v15, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. - lvx v1, 0, r3 - lvx v2, r10, r3 - lvx v3, r12, r3 - - vperm v8, v1, v2, v15 - vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified - - vsldoi v11, v8, v9, 4 - - ;# set 0 - vmsummbm v6, v4, v8, v12 ;# taps times elements - vmsummbm v0, v5, v11, v6 - - ;# set 1 - vsldoi v10, v8, v9, 1 - vsldoi v11, v8, v9, 5 - - vmsummbm v6, v4, v10, v12 - vmsummbm v1, v5, v11, v6 - - ;# set 2 - vsldoi v10, v8, v9, 2 - vsldoi v11, v8, v9, 6 - - vmsummbm v6, v4, v10, v12 - vmsummbm v2, v5, v11, v6 - - ;# set 3 - vsldoi v10, v8, v9, 3 - vsldoi v11, v8, v9, 7 - - vmsummbm v6, v4, v10, v12 - vmsummbm v3, v5, v11, v6 - - vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit) - vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F - - vsrh v0, v0, v13 ;# divide v0, v1 by 128 - vsrh v1, v1, v13 - - vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result - vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result - - stvx v0, 0, r9 - add r9, r9, r5 - - add r3, r3, r4 - - bdnz horizontal_loop_16x16 - - ;# check again to see if vertical filter needs to be done. 
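In control-flow terms, the three possibilities enumerated at the top of the routine reduce to the sketch below: a vertical-only filter copies the source rows to the stack first, a horizontal-only filter writes the destination directly, and the mixed case routes 21 rows (16 plus 5 for the six taps) through the stack buffer at 48(r1). All names here are hypothetical, for orientation only:

    void copy_rows(const unsigned char *s, int sp, unsigned char *d, int dp, int n);
    void horizontal_pass(const unsigned char *s, int sp, unsigned char *d, int dp,
                         int x_off, int rows);
    void vertical_pass(const unsigned char *s, int sp, unsigned char *d, int dp,
                       int y_off);

    void sixtap16x16(const unsigned char *src, int pitch,
                     unsigned char *dst, int dst_pitch,
                     int x_offset, int y_offset) {
      unsigned char tmp[21 * 16];         /* 21 rows, like 48(r1) on the stack */
      if (x_offset == 0) {                /* case 1: vertical filter only      */
        copy_rows(src - 2 * pitch, pitch, tmp, 16, 21);
        vertical_pass(tmp, 16, dst, dst_pitch, y_offset);
      } else if (y_offset == 0) {         /* case 2: horizontal filter only    */
        horizontal_pass(src - 2, pitch, dst, dst_pitch, x_offset, 16);
      } else {                            /* case 3: both, via the temp buffer */
        horizontal_pass(src - 2 * pitch - 2, pitch, tmp, 16, x_offset, 21);
        vertical_pass(tmp, 16, dst, dst_pitch, y_offset);
      }
    }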
- cmpi cr0, r6, 0 - beq cr0, end_16x16 - - ;# yes there is, so go to the second pass - b second_pass_16x16 - -copy_horizontal_16x21: - li r10, 21 - mtctr r10 - - li r10, 16 - - sub r3, r3, r4 - sub r3, r3, r4 - - ;# this is done above if there is a horizontal filter, - ;# if not it needs to be done down here. - slwi r6, r6, 4 ;# index into vertical filter array - - ;# always write to the stack when doing a horizontal copy - la r9, 48(r1) - -copy_horizontal_loop_16x21: - lvsl v15, 0, r3 ;# permutate value for alignment - - lvx v1, 0, r3 - lvx v2, r10, r3 - - vperm v8, v1, v2, v15 - - stvx v8, 0, r9 - addi r9, r9, 16 - - add r3, r3, r4 - - bdnz copy_horizontal_loop_16x21 - -second_pass_16x16: - - ;# always read from the stack when doing a vertical filter - la r9, 48(r1) - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v7, 7 - - vpre_load - - luma_vsix - luma_vsix - luma_vfour - -end_16x16: - - addi r1, r1, 416 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .data - - .align 4 -HFilter: - .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0 - .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12 - .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0 - .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36 - .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0 - .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50 - .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 - .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77 - .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0 - .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93 - .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0 - .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108 - .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0 - .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123 - .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 - - .align 4 -VFilter: - .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - - .align 4 -b_hperm: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - - .align 4 -B_0123: - .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - - .align 4 -B_4567: - .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - - .align 4 -B_89AB: - .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - - .align 4 -b_hilo: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 - - .align 4 -b_hilo_4x4: - .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 diff --git a/vp9/common/ppc/filter_bilinear_altivec.asm b/vp9/common/ppc/filter_bilinear_altivec.asm deleted file mode 100644 index fd8aa665f..000000000 --- a/vp9/common/ppc/filter_bilinear_altivec.asm +++ /dev/null @@ -1,677 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. 
All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl bilinear_predict4x4_ppc - .globl bilinear_predict8x4_ppc - .globl bilinear_predict8x8_ppc - .globl bilinear_predict16x16_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -.macro load_vfilter V0, V1 - load_c \V0, vfilter_b, r6, r9, r10 - - addi r6, r6, 16 - lvx \V1, r6, r10 -.endm - -.macro HProlog jump_label - ;# load up horizontal filter - slwi. r5, r5, 4 ;# index into horizontal filter array - - ;# index to the next set of vectors in the row. - li r10, 16 - li r12, 32 - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq \jump_label - - load_c v20, hfilter_b, r5, r9, r0 - - ;# setup constants - ;# v14 permutation value for alignment - load_c v28, b_hperm_b, 0, r9, r0 - - ;# rounding added in on the multiply - vspltisw v21, 8 - vspltisw v18, 3 - vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 - - slwi. r6, r6, 5 ;# index into vertical filter array -.endm - -;# Filters a horizontal line -;# expects: -;# r3 src_ptr -;# r4 pitch -;# r10 16 -;# r12 32 -;# v17 perm intput -;# v18 rounding -;# v19 shift -;# v20 filter taps -;# v21 tmp -;# v22 tmp -;# v23 tmp -;# v24 tmp -;# v25 tmp -;# v26 tmp -;# v27 tmp -;# v28 perm output -;# -.macro HFilter V - vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456 - vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A - - vmsummbm v24, v20, v24, v18 - vmsummbm v25, v20, v25, v18 - - vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) - - vsrh v24, v24, v19 ;# divide v0, v1 by 128 - - vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result -.endm - -.macro hfilter_8 V, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 9 bytes wide, output is 8 bytes. - lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - vperm v21, v21, v22, v17 - - HFilter \V -.endm - - -.macro load_and_align_8 V, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. 
- lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - - vperm \V, v21, v22, v17 -.endm - -.macro write_aligned_8 V, increment_counter - stvx \V, 0, r7 - -.if \increment_counter - add r7, r7, r8 -.endif -.endm - -.macro vfilter_16 P0 P1 - vmuleub v22, \P0, v20 ;# 64 + 4 positive taps - vadduhm v22, v18, v22 - vmuloub v23, \P0, v20 - vadduhm v23, v18, v23 - - vmuleub v24, \P1, v21 - vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary - vmuloub v25, \P1, v21 - vadduhm v23, v23, v25 ;# Ro = odds - - vsrh v22, v22, v19 ;# divide by 128 - vsrh v23, v23, v19 ;# v16 v17 = evens, odds - vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order - vmrglh v23, v22, v23 - vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result -.endm - - -.macro w_8x8 V, D, R, P - stvx \V, 0, r1 - lwz \R, 0(r1) - stw \R, 0(r7) - lwz \R, 4(r1) - stw \R, 4(r7) - add \D, \D, \P -.endm - - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict4x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf830 - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_4x4_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r9, r12 - load_c v11, b_4567_b, 0, r9, r12 - - hfilter_8 v0, 1 - hfilter_8 v1, 1 - hfilter_8 v2, 1 - hfilter_8 v3, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq store_out_4x4_b - - hfilter_8 v4, 0 - - b second_pass_4x4_b - -second_pass_4x4_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_8 v0, 1 - load_and_align_8 v1, 1 - load_and_align_8 v2, 1 - load_and_align_8 v3, 1 - load_and_align_8 v4, 1 - -second_pass_4x4_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - -store_out_4x4_b: - - stvx v0, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v1, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v2, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v3, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - -exit_4x4: - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict8x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf830 - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_8x4_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r9, r12 - load_c v11, b_4567_b, 0, r9, r12 - - hfilter_8 v0, 1 - hfilter_8 v1, 1 - hfilter_8 v2, 1 - hfilter_8 v3, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. 
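Only one extra line here, against five in the six-tap case, because the bilinear kernel is two taps. The hfilter_b / vfilter_b tables at the end of the file hold the weight pairs (128 - 16*frac, 16*frac), which sum to 128, and both the horizontal and vertical passes add the 0x40 bias before shifting right by 7. The scalar kernel, for reference:

    /* 2-tap bilinear kernel; frac is the 1/8-pel offset (0..7).  The pair
     * (w0, w1) = (128 - 16*frac, 16*frac) matches hfilter_b / vfilter_b. */
    static unsigned char bilinear_tap(unsigned char a, unsigned char b, int frac) {
      const int w1 = frac << 4;
      const int w0 = 128 - w1;
      return (unsigned char)((a * w0 + b * w1 + 64) >> 7); /* round, /128 */
    }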
- beq store_out_8x4_b - - hfilter_8 v4, 0 - - b second_pass_8x4_b - -second_pass_8x4_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_8 v0, 1 - load_and_align_8 v1, 1 - load_and_align_8 v2, 1 - load_and_align_8 v3, 1 - load_and_align_8 v4, 1 - -second_pass_8x4_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - -store_out_8x4_b: - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x4_b - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - - b exit_8x4 - -store_aligned_8x4_b: - load_c v10, b_hilo_b, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - -exit_8x4: - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict8x8_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xfff0 - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_8x8_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r9, r12 - load_c v11, b_4567_b, 0, r9, r12 - - hfilter_8 v0, 1 - hfilter_8 v1, 1 - hfilter_8 v2, 1 - hfilter_8 v3, 1 - hfilter_8 v4, 1 - hfilter_8 v5, 1 - hfilter_8 v6, 1 - hfilter_8 v7, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq store_out_8x8_b - - hfilter_8 v8, 0 - - b second_pass_8x8_b - -second_pass_8x8_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_8 v0, 1 - load_and_align_8 v1, 1 - load_and_align_8 v2, 1 - load_and_align_8 v3, 1 - load_and_align_8 v4, 1 - load_and_align_8 v5, 1 - load_and_align_8 v6, 1 - load_and_align_8 v7, 1 - load_and_align_8 v8, 0 - -second_pass_8x8_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - -store_out_8x8_b: - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x8_b - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - w_8x8 v6, r7, r0, r8 - w_8x8 v7, r7, r0, r8 - - b exit_8x8 - -store_aligned_8x8_b: - load_c v10, b_hilo_b, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - vperm v6, v6, v7, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - addi r7, r7, 16 - stvx v6, 0, r7 - -exit_8x8: - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# Filters a horizontal line -;# expects: -;# r3 src_ptr -;# r4 pitch -;# r10 16 -;# r12 32 -;# v17 perm intput -;# v18 rounding -;# v19 shift -;# v20 filter taps -;# v21 tmp -;# v22 tmp -;# v23 tmp -;# v24 tmp -;# v25 tmp -;# v26 tmp -;# v27 tmp -;# v28 perm output -;# -.macro hfilter_16 V, increment_counter - - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. 
- ;# input will can span three vectors if not aligned correctly. - lvx v21, 0, r3 - lvx v22, r10, r3 - lvx v23, r12, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - vperm v21, v21, v22, v17 - vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified - - ;# set 0 - vmsummbm v24, v20, v21, v18 ;# taps times elements - - ;# set 1 - vsldoi v23, v21, v22, 1 - vmsummbm v25, v20, v23, v18 - - ;# set 2 - vsldoi v23, v21, v22, 2 - vmsummbm v26, v20, v23, v18 - - ;# set 3 - vsldoi v23, v21, v22, 3 - vmsummbm v27, v20, v23, v18 - - vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) - vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F - - vsrh v24, v24, v19 ;# divide v0, v1 by 128 - vsrh v25, v25, v19 - - vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result - vperm \V, \V, v0, v28 ;# \V = correctly-ordered result -.endm - -.macro load_and_align_16 V, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. - lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - - vperm \V, v21, v22, v17 -.endm - -.macro write_16 V, increment_counter - stvx \V, 0, r7 - -.if \increment_counter - add r7, r7, r8 -.endif -.endm - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict16x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - HProlog second_pass_16x16_pre_copy_b - - hfilter_16 v0, 1 - hfilter_16 v1, 1 - hfilter_16 v2, 1 - hfilter_16 v3, 1 - hfilter_16 v4, 1 - hfilter_16 v5, 1 - hfilter_16 v6, 1 - hfilter_16 v7, 1 - hfilter_16 v8, 1 - hfilter_16 v9, 1 - hfilter_16 v10, 1 - hfilter_16 v11, 1 - hfilter_16 v12, 1 - hfilter_16 v13, 1 - hfilter_16 v14, 1 - hfilter_16 v15, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. 
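Unlike the six-tap 16x16 predictor, the bilinear version never spills to a stack buffer: the 17 horizontally filtered rows fit in v0..v16, and the vertical pass then folds them pairwise in place (each vfilter_16 v_r, v_{r+1} overwrites row r). A scalar analogue of that in-place fold (illustrative; forward order is safe because row r is consumed before row r+1 is rewritten):

    /* In-place 2-tap vertical fold: rows[0..16] -> outputs in rows[0..15]. */
    static void vfold(unsigned char rows[17][16], int w0, int w1) {
      int r, c;
      for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
          rows[r][c] = (unsigned char)
              ((rows[r][c] * w0 + rows[r + 1][c] * w1 + 64) >> 7);
    }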
- beq store_out_16x16_b - - hfilter_16 v16, 0 - - b second_pass_16x16_b - -second_pass_16x16_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_16 v0, 1 - load_and_align_16 v1, 1 - load_and_align_16 v2, 1 - load_and_align_16 v3, 1 - load_and_align_16 v4, 1 - load_and_align_16 v5, 1 - load_and_align_16 v6, 1 - load_and_align_16 v7, 1 - load_and_align_16 v8, 1 - load_and_align_16 v9, 1 - load_and_align_16 v10, 1 - load_and_align_16 v11, 1 - load_and_align_16 v12, 1 - load_and_align_16 v13, 1 - load_and_align_16 v14, 1 - load_and_align_16 v15, 1 - load_and_align_16 v16, 0 - -second_pass_16x16_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - vfilter_16 v8, v9 - vfilter_16 v9, v10 - vfilter_16 v10, v11 - vfilter_16 v11, v12 - vfilter_16 v12, v13 - vfilter_16 v13, v14 - vfilter_16 v14, v15 - vfilter_16 v15, v16 - -store_out_16x16_b: - - write_16 v0, 1 - write_16 v1, 1 - write_16 v2, 1 - write_16 v3, 1 - write_16 v4, 1 - write_16 v5, 1 - write_16 v6, 1 - write_16 v7, 1 - write_16 v8, 1 - write_16 v9, 1 - write_16 v10, 1 - write_16 v11, 1 - write_16 v12, 1 - write_16 v13, 1 - write_16 v14, 1 - write_16 v15, 0 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .data - - .align 4 -hfilter_b: - .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 - .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 - .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 - .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 - .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 - .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 - .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 - .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 - - .align 4 -vfilter_b: - .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 - .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 - .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 - .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 - .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 - .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 - .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 - .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 - .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 - .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 - .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 - - .align 4 -b_hperm_b: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - - .align 4 -b_0123_b: - .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - - .align 4 -b_4567_b: - .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - -b_hilo_b: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp9/common/ppc/idctllm_altivec.asm b/vp9/common/ppc/idctllm_altivec.asm deleted file mode 
100644 index 117d9cfc8..000000000 --- a/vp9/common/ppc/idctllm_altivec.asm +++ /dev/null @@ -1,189 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl short_idct4x4llm_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -;# r3 short *input -;# r4 short *output -;# r5 int pitch - .align 2 -short_idct4x4llm_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - load_c v8, sinpi8sqrt2, 0, r9, r10 - load_c v9, cospi8sqrt2minus1, 0, r9, r10 - load_c v10, hi_hi, 0, r9, r10 - load_c v11, lo_lo, 0, r9, r10 - load_c v12, shift_16, 0, r9, r10 - - li r10, 16 - lvx v0, 0, r3 ;# input ip[0], ip[ 4] - lvx v1, r10, r3 ;# input ip[8], ip[12] - - ;# first pass - vupkhsh v2, v0 - vupkhsh v3, v1 - vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8] - vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8] - - vupklsh v0, v0 - vmulosh v4, v0, v8 - vsraw v4, v4, v12 - vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2) - - vupklsh v1, v1 - vmulosh v5, v1, v9 - vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v1 - - vsubsws v4, v4, v5 ;# c1 - - vmulosh v3, v1, v8 - vsraw v3, v3, v12 - vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2) - - vmulosh v5, v0, v9 - vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v0 - - vaddsws v3, v3, v5 ;# d1 - - vaddsws v0, v6, v3 ;# a1 + d1 - vsubsws v3, v6, v3 ;# a1 - d1 - - vaddsws v1, v7, v4 ;# b1 + c1 - vsubsws v2, v7, v4 ;# b1 - c1 - - ;# transpose input - vmrghw v4, v0, v1 ;# a0 b0 a1 b1 - vmrghw v5, v2, v3 ;# c0 d0 c1 d1 - - vmrglw v6, v0, v1 ;# a2 b2 a3 b3 - vmrglw v7, v2, v3 ;# c2 d2 c3 d3 - - vperm v0, v4, v5, v10 ;# a0 b0 c0 d0 - vperm v1, v4, v5, v11 ;# a1 b1 c1 d1 - - vperm v2, v6, v7, v10 ;# a2 b2 c2 d2 - vperm v3, v6, v7, v11 ;# a3 b3 c3 d3 - - ;# second pass - vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8] - vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8] - - vmulosh v4, v1, v8 - vsraw v4, v4, v12 - vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2) - - vmulosh v5, v3, v9 - vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v3 - - vsubsws v4, v4, v5 ;# c1 - - vmulosh v2, v3, v8 - vsraw v2, v2, v12 - vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2) - - vmulosh v5, v1, v9 - vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v1 - - vaddsws v3, v2, v5 ;# d1 - - vaddsws v0, v6, v3 ;# a1 + d1 - vsubsws v3, v6, v3 ;# a1 - d1 - - vaddsws v1, v7, v4 ;# b1 + c1 - vsubsws v2, v7, v4 ;# b1 - c1 - - vspltish v6, 4 - vspltish v7, 3 - - vpkswss v0, v0, v1 - vpkswss v1, v2, v3 - - vaddshs v0, v0, v6 - vaddshs v1, v1, v6 - - vsrah v0, v0, v7 - vsrah v1, v1, v7 - - ;# transpose output - vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3 - vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3 - - vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1 - vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3 - - stwu r1,-416(r1) ;# create space on the stack - - stvx v0, 0, r1 - lwz r6, 0(r1) - stw r6, 0(r4) - lwz r6, 4(r1) - stw r6, 4(r4) - - add r4, r4, r5 - - lwz r6, 8(r1) - stw r6, 0(r4) - lwz r6, 12(r1) - stw r6, 4(r4) - - add r4, r4, r5 - - stvx v1, 0, r1 - lwz r6, 0(r1) - stw r6, 0(r4) - lwz r6, 4(r1) - stw r6, 
4(r4) - - add r4, r4, r5 - - lwz r6, 8(r1) - stw r6, 0(r4) - lwz r6, 12(r1) - stw r6, 4(r4) - - addi r1, r1, 416 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 4 -sinpi8sqrt2: - .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468 - - .align 4 -cospi8sqrt2minus1: - .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091 - - .align 4 -shift_16: - .long 16, 16, 16, 16 - - .align 4 -hi_hi: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 - - .align 4 -lo_lo: - .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 diff --git a/vp9/common/ppc/loopfilter_altivec.c b/vp9/common/ppc/loopfilter_altivec.c deleted file mode 100644 index 962da2319..000000000 --- a/vp9/common/ppc/loopfilter_altivec.c +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "loopfilter.h" -#include "onyxc_int.h" - -typedef void loop_filter_function_y_ppc -( - unsigned char *s, // source pointer - int p, // pitch - const signed char *flimit, - const signed char *limit, - const signed char *thresh -); - -typedef void loop_filter_function_uv_ppc -( - unsigned char *u, // source pointer - unsigned char *v, // source pointer - int p, // pitch - const signed char *flimit, - const signed char *limit, - const signed char *thresh -); - -typedef void loop_filter_function_s_ppc -( - unsigned char *s, // source pointer - int p, // pitch - const signed char *flimit -); - -loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc; -loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc; -loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc; -loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc; - -loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc; -loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc; -loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc; -loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc; - -loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc; -loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc; - -// Horizontal MB filtering -void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); - - if (u_ptr) - mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); -} - -void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim); -} - -// Vertical MB Filtering -void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); - - if (u_ptr) - mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); -} - -void loop_filter_mbvs_ppc(unsigned char 
*y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim); -} - -// Horizontal B Filtering -void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - // These should all be done at once with one call, instead of 3 - loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); - loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); - loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); - - if (u_ptr) - loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr); -} - -void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim); - loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim); - loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim); -} - -// Vertical B Filtering -void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr); - - if (u_ptr) - loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr); -} - -void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim); - loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim); - loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim); -} diff --git a/vp9/common/ppc/loopfilter_filters_altivec.asm b/vp9/common/ppc/loopfilter_filters_altivec.asm deleted file mode 100644 index 61df4e976..000000000 --- a/vp9/common/ppc/loopfilter_filters_altivec.asm +++ /dev/null @@ -1,1253 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl mbloop_filter_horizontal_edge_y_ppc - .globl loop_filter_horizontal_edge_y_ppc - .globl mbloop_filter_vertical_edge_y_ppc - .globl loop_filter_vertical_edge_y_ppc - - .globl mbloop_filter_horizontal_edge_uv_ppc - .globl loop_filter_horizontal_edge_uv_ppc - .globl mbloop_filter_vertical_edge_uv_ppc - .globl loop_filter_vertical_edge_uv_ppc - - .globl loop_filter_simple_horizontal_edge_ppc - .globl loop_filter_simple_vertical_edge_ppc - - .text -;# We often need to perform transposes (and other transpose-like operations) -;# on matrices of data. This is simplified by the fact that we usually -;# operate on hunks of data whose dimensions are powers of 2, or at least -;# divisible by highish powers of 2. 
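It may help to see the end result of the address-bit discussion that follows in scalar form: treating the concatenated registers as one 256-byte block, a 16x16 transpose simply swaps the low address nibble (column) with the high nibble (row). A plain C reference for what t16_full below computes (illustrative only):

    /* 16x16 byte transpose as an address-bit swap: bits 0..3 <-> bits 4..7. */
    static void transpose_16x16(unsigned char dst[256], const unsigned char src[256]) {
      int i;
      for (i = 0; i < 256; i++)
        dst[((i & 0x0f) << 4) | (i >> 4)] = src[i];
    }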
-;#
-;# These operations can be very confusing.  They become more straightforward
-;# when we think of them as permutations of address bits: Concatenate a
-;# group of vector registers and think of it as occupying a block of
-;# memory beginning at address zero.  The low four bits 0...3 of the
-;# address then correspond to position within a register, the higher-order
-;# address bits select the register.
-;#
-;# Although register selection, at the code level, is arbitrary, things
-;# are simpler if we use contiguous ranges of register numbers, simpler
-;# still if the low-order bits of the register number correspond to
-;# conceptual address bits.  We do this whenever reasonable.
-;#
-;# A 16x16 transpose can then be thought of as an operation on
-;# a 256-element block of memory.  It takes 8 bits 0...7 to address this
-;# memory and the effect of a transpose is to interchange address bit
-;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7.  Bits 0...3 index the
-;# column, which is interchanged with the row addressed by bits 4..7.
-;#
-;# The altivec merge instructions provide a rapid means of effecting
-;# many of these transforms.  They operate at three widths (8,16,32).
-;# Writing V(x) for vector register #x, paired merges permute address
-;# indices as follows.
-;#
-;# 0->1  1->2  2->3  3->(4+d)  (4+s)->0:
-;#
-;#   vmrghb  V( x),          V( y), V( y + (1<<s))
-;#   vmrglb  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;# =0=  1->2  2->3  3->(4+d)  (4+s)->1:
-;#
-;#   vmrghh  V( x),          V( y), V( y + (1<<s))
-;#   vmrglh  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;# =0=  =1=  2->3  3->(4+d)  (4+s)->2:
-;#
-;#   vmrghw  V( x),          V( y), V( y + (1<<s))
-;#   vmrglw  V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;# =0=  =1=  =2=  3->(4+d)  (4+s)->3 by the sequence:
-;#
-;#   vperm   V( x),          V( y), V( y + (1<<s)), b_hihi
-;#   vperm   V( x + (1<<d)), V( y), V( y + (1<<s)), b_lolo
-;#
-;# where b_hihi selects bytes 0...7 of each input and b_lolo selects
-;# bytes 8...15.  The full transpose is then the interchange
-;# 0<->4 1<->5 2<->6 3<->7, which we accomplish by
-;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;# Except for the fact that the destination registers get written
-;# before we are done referencing the old contents, the cyclic transform
-;# is effected by
-;#
-;#   x = 0;  do {
-;#     vmrghb V(2x),   V(x), V(x+8);
-;#     vmrglb V(2x+1), V(x), V(x+8);
-;#   } while( ++x < 8);
-;#
-;# For clarity, and because we can afford it, we do this transpose
-;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
-;# leaving the final result in 16 .. 31, as the lower registers are
-;# used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
-    vmrghb  \A, \X, \Y
-    vmrglb  \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
-    Tpair v16,v17,  v0,v8
-    Tpair v18,v19,  v1,v9
-    Tpair v20,v21,  v2,v10
-    Tpair v22,v23,  v3,v11
-    Tpair v24,v25,  v4,v12
-    Tpair v26,v27,  v5,v13
-    Tpair v28,v29,  v6,v14
-    Tpair v30,v31,  v7,v15
-.endm
-
-.macro t16_odd
-    Tpair v0,v1,   v16,v24
-    Tpair v2,v3,   v17,v25
-    Tpair v4,v5,   v18,v26
-    Tpair v6,v7,   v19,v27
-    Tpair v8,v9,   v20,v28
-    Tpair v10,v11, v21,v29
-    Tpair v12,v13, v22,v30
-    Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
-    t16_odd
-    t16_even
-    t16_odd
-    t16_even
-.endm
-
-;# Vertical edge filtering requires transposes.  For the simple filter,
-;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;# each.  Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;#  v0 =  0  1 ... 14 15
-;#  v1 = 16 17 ... 30 31
-;#  v2 = 32 33 ... 46 47
-;#  v3 = 48 49 ... 62 63
-;#
-;# In frame-buffer memory, the layout is:
-;#
-;#    0  16  32  48
-;#    1  17  33  49
-;#    ...
-;#   15  31  47  63.
-;# -;# We begin by reading the data 32 bits at a time (using scalar operations) -;# into a temporary array, reading the rows of the array into vector registers, -;# with the following layout: -;# -;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 -;# v1 = 1 17 33 49 5 21 ... 45 61 -;# v2 = 2 18 ... 46 62 -;# v3 = 3 19 ... 47 63 -;# -;# From the "address-bit" perspective discussed above, we simply need to -;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone. -;# In other words, we transpose each of the four 4x4 submatrices. -;# -;# This transformation is its own inverse, and we need to perform it -;# again before writing the pixels back into the frame buffer. -;# -;# It acts in place on registers v0...v3, uses v4...v7 as temporaries, -;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors -;# defined above. We think of both groups of 4 registers as having -;# "addresses" {0,1,2,3} * 16. -;# -.macro Transpose4times4x4 Vlo, Vhi - - ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5= - - vmrghb v4, v0, v1 - vmrglb v5, v0, v1 - vmrghb v6, v2, v3 - vmrglb v7, v2, v3 - - ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1 - - vmrghh v0, v4, v6 - vmrglh v1, v4, v6 - vmrghh v2, v5, v7 - vmrglh v3, v5, v7 - - ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5= - - vmrghw v4, v0, v1 - vmrglw v5, v0, v1 - vmrghw v6, v2, v3 - vmrglw v7, v2, v3 - - ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3 - - vperm v0, v4, v6, \Vlo - vperm v1, v4, v6, \Vhi - vperm v2, v5, v7, \Vlo - vperm v3, v5, v7, \Vhi -.endm -;# end Transpose4times4x4 - - -;# Normal mb vertical edge filter transpose. -;# -;# We read 8 columns of data, initially in the following pattern: -;# -;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1) -;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3) -;# ... -;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15) -;# -;# and wish to convert to: -;# -;# (0,0) ... (0,15) -;# (1,0) ... (1,15) -;# ... -;# (7,0) ... (7,15). -;# -;# In "address bit" language, we wish to map -;# -;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7. -;# -;# This can be accomplished by 4 iterations of the cyclic transform -;# -;# I -> (I+1) mod 7; -;# -;# each iteration can be realized by (d=0, s=2): -;# -;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4); -;# -;# The input/output is in registers v0...v7. We use v10...v17 as mirrors; -;# preserving v8 = sign converter. 
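The arithmetic behind the iteration counts: one application of the cyclic transform rotates the seven address bits up by one position, so four applications move bit I to I+4 mod 7 (the forward transpose) and three more complete the cycle, which is why the inverse uses I -> (I+3) mod 7. As a scalar check (illustrative only):

    /* Rotate a 7-bit address left by one: address bit I moves to I+1 mod 7.
     * Applied 4x it realizes I -> (I+4) mod 7; 3x more undoes it. */
    static int rot7(int i) {
      return ((i << 1) | (i >> 6)) & 0x7f;
    }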
-;# -;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the -;# result lands in the "mirror" registers v10...v17 -;# -.macro t8x16_odd - Tpair v10, v11, v0, v4 - Tpair v12, v13, v1, v5 - Tpair v14, v15, v2, v6 - Tpair v16, v17, v3, v7 -.endm - -.macro t8x16_even - Tpair v0, v1, v10, v14 - Tpair v2, v3, v11, v15 - Tpair v4, v5, v12, v16 - Tpair v6, v7, v13, v17 -.endm - -.macro transpose8x16_fwd - t8x16_odd - t8x16_even - t8x16_odd - t8x16_even -.endm - -.macro transpose8x16_inv - t8x16_odd - t8x16_even - t8x16_odd -.endm - -.macro Transpose16x16 - vmrghb v0, v16, v24 - vmrglb v1, v16, v24 - vmrghb v2, v17, v25 - vmrglb v3, v17, v25 - vmrghb v4, v18, v26 - vmrglb v5, v18, v26 - vmrghb v6, v19, v27 - vmrglb v7, v19, v27 - vmrghb v8, v20, v28 - vmrglb v9, v20, v28 - vmrghb v10, v21, v29 - vmrglb v11, v21, v29 - vmrghb v12, v22, v30 - vmrglb v13, v22, v30 - vmrghb v14, v23, v31 - vmrglb v15, v23, v31 - vmrghb v16, v0, v8 - vmrglb v17, v0, v8 - vmrghb v18, v1, v9 - vmrglb v19, v1, v9 - vmrghb v20, v2, v10 - vmrglb v21, v2, v10 - vmrghb v22, v3, v11 - vmrglb v23, v3, v11 - vmrghb v24, v4, v12 - vmrglb v25, v4, v12 - vmrghb v26, v5, v13 - vmrglb v27, v5, v13 - vmrghb v28, v6, v14 - vmrglb v29, v6, v14 - vmrghb v30, v7, v15 - vmrglb v31, v7, v15 - vmrghb v0, v16, v24 - vmrglb v1, v16, v24 - vmrghb v2, v17, v25 - vmrglb v3, v17, v25 - vmrghb v4, v18, v26 - vmrglb v5, v18, v26 - vmrghb v6, v19, v27 - vmrglb v7, v19, v27 - vmrghb v8, v20, v28 - vmrglb v9, v20, v28 - vmrghb v10, v21, v29 - vmrglb v11, v21, v29 - vmrghb v12, v22, v30 - vmrglb v13, v22, v30 - vmrghb v14, v23, v31 - vmrglb v15, v23, v31 - vmrghb v16, v0, v8 - vmrglb v17, v0, v8 - vmrghb v18, v1, v9 - vmrglb v19, v1, v9 - vmrghb v20, v2, v10 - vmrglb v21, v2, v10 - vmrghb v22, v3, v11 - vmrglb v23, v3, v11 - vmrghb v24, v4, v12 - vmrglb v25, v4, v12 - vmrghb v26, v5, v13 - vmrglb v27, v5, v13 - vmrghb v28, v6, v14 - vmrglb v29, v6, v14 - vmrghb v30, v7, v15 - vmrglb v31, v7, v15 -.endm - -;# load_g loads a global vector (whose address is in the local variable Gptr) -;# into vector register Vreg. Trashes r0 -.macro load_g Vreg, Gptr - lwz r0, \Gptr - lvx \Vreg, 0, r0 -.endm - -;# exploit the saturation here. if the answer is negative -;# it will be clamped to 0. orring 0 with a positive -;# number will be the positive number (abs) -;# RES = abs( A-B), trashes TMP -.macro Abs RES, TMP, A, B - vsububs \RES, \A, \B - vsububs \TMP, \B, \A - vor \RES, \RES, \TMP -.endm - -;# RES = Max( RES, abs( A-B)), trashes TMP -.macro max_abs RES, TMP, A, B - vsububs \TMP, \A, \B - vmaxub \RES, \RES, \TMP - vsububs \TMP, \B, \A - vmaxub \RES, \RES, \TMP -.endm - -.macro Masks - ;# build masks - ;# input is all 8 bit unsigned (0-255). need to - ;# do abs(vala-valb) > limit. but no need to compare each - ;# value to the limit. find the max of the absolute differences - ;# and compare that to the limit. 
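-    ;# (scalar sketch of the trick used by Abs/max_abs above:
-    ;#
-    ;#    usub_sat(a, b) = a > b ? a - b : 0;            /* vsububs */
-    ;#    usub_sat(a, b) | usub_sat(b, a) == |a - b|
-    ;#
-    ;#  since one of the two saturating differences is always zero)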
-    ;# First hev
-    Abs v14, v13, v2, v3 ;# |P1 - P0|
-    max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
-
-    vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
-
-    ;# Next limit
-    max_abs v14, v13, v0, v1 ;# |P3 - P2|
-    max_abs v14, v13, v1, v2 ;# |P2 - P1|
-    max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
-    max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
-
-    vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
-
-    ;# flimit
-    Abs v14, v13, v3, v4 ;# |P0 - Q0|
-
-    vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
-
-    vor v8, v8, v9 ;# R = true if flimit or limit exceeded
-    ;# done building masks
-.endm
-
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
-    ;# build constants
-    lvx \FL, 0, \RFL ;# flimit
-    lvx \LI, 0, \RLI ;# limit
-    lvx \TH, 0, \RTH ;# thresh
-
-    vspltisb v11, 8
-    vspltisb v12, 4
-    vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
-    ;# setup strides/pointers to be able to access
-    ;# all of the data
-    add r5, r4, r4 ;# r5 = 2 * stride
-    sub r6, r3, r5 ;# r6 -> 2 rows back
-    neg r7, r4 ;# r7 = -stride
-
-    ;# load 16 pixels worth of data to work on
-    sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
-    lvx v0, 0, r0 ;# P3 (read only)
-    lvx v1, r7, r6 ;# P2
-    lvx v2, 0, r6 ;# P1
-    lvx v3, r7, r3 ;# P0
-    lvx v4, 0, r3 ;# Q0
-    lvx v5, r4, r3 ;# Q1
-    lvx v6, r5, r3 ;# Q2
-    add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
-    lvx v7, r4, r0 ;# Q3 (read only)
-.endm
-
-;# Expects
-;# v10 == HEV
-;# v13 == tmp
-;# v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
-    vxor \P1, \P1, v11 ;# SP1
-    vxor \P0, \P0, v11 ;# SP0
-    vxor \Q0, \Q0, v11 ;# SQ0
-    vxor \Q1, \Q1, v11 ;# SQ1
-
-    vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
-.if \HEV_PRESENT
-    vand v13, v13, v10 ;# f &= hev
-.endif
-    vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
-    vandc v13, v13, v8 ;# f &= mask
-
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v13, v9 ;# f1 = c (f+4)
-    vaddsbs v15, v13, v8 ;# f2 = c (f+3)
-
-    vsrab v13, v14, v8 ;# f1 >>= 3
-    vsrab v15, v15, v8 ;# f2 >>= 3
-
-    vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
-    vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
-.endm
-
-.macro vp8_mbfilter
-    Masks
-
-    ;# start the filtering here
-    vxor v1, v1, v11 ;# SP2
-    vxor v2, v2, v11 ;# SP1
-    vxor v3, v3, v11 ;# SP0
-    vxor v4, v4, v11 ;# SQ0
-    vxor v5, v5, v11 ;# SQ1
-    vxor v6, v6, v11 ;# SQ2
-
-    ;# add outer taps if we have high edge variance
-    vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
-
-    vsubsbs v14, v4, v3 ;# SQ0-SP0
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14
-    vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
-    vandc v13, v13, v8 ;# f &= mask
-    vand v15, v13, v10 ;# f2 = f & hev
-
-    ;# save bottom 3 bits so that we round one side +4 and the other +3
-    vspltisb v8, 3
-    vspltisb v9, 4
-
-    vaddsbs v14, v15, v9 ;# f1 = c (f+4)
-    vaddsbs v15, v15, v8 ;# f2 = c (f+3)
-
-    vsrab v14, v14, v8 ;# f1 >>= 3
-    vsrab v15, v15, v8 ;# f2 >>= 3
-
-    vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
-    vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
-
-    ;# only apply wider filter if not high edge variance
-    vandc v13, v13, v10 ;# f &= ~hev
-
-    vspltisb v9, 2
-    vnor v8, v8, v8
-    vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
-    vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
-    vspltisb v8, 9
-
-    ;# roughly 1/7th difference across boundary
-    vspltish v10, 7
-    vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-    vmulesb v15, v8, v13
-    vaddshs v14, v14, v9 ;# += 63
-    vaddshs v15, v15, v9
-    vsrah v14, v14, v10 ;# >>= 7
-    vsrah
v15, v15, v10 - vmrglh v10, v15, v14 - vmrghh v15, v15, v14 - - vpkshss v10, v15, v10 ;# X = saturated down to bytes - - vsubsbs v6, v6, v10 ;# subtract from Q and add to P - vaddsbs v1, v1, v10 - - vxor v6, v6, v11 - vxor v1, v1, v11 - - ;# roughly 2/7th difference across boundary - vspltish v10, 7 - vaddubm v12, v8, v8 - vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) - vmulesb v15, v12, v13 - vaddshs v14, v14, v9 - vaddshs v15, v15, v9 - vsrah v14, v14, v10 ;# >>= 7 - vsrah v15, v15, v10 - vmrglh v10, v15, v14 - vmrghh v15, v15, v14 - - vpkshss v10, v15, v10 ;# X = saturated down to bytes - - vsubsbs v5, v5, v10 ;# subtract from Q and add to P - vaddsbs v2, v2, v10 - - vxor v5, v5, v11 - vxor v2, v2, v11 - - ;# roughly 3/7th difference across boundary - vspltish v10, 7 - vaddubm v12, v12, v8 - vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) - vmulesb v15, v12, v13 - vaddshs v14, v14, v9 - vaddshs v15, v15, v9 - vsrah v14, v14, v10 ;# >>= 7 - vsrah v15, v15, v10 - vmrglh v10, v15, v14 - vmrghh v15, v15, v14 - - vpkshss v10, v15, v10 ;# X = saturated down to bytes - - vsubsbs v4, v4, v10 ;# subtract from Q and add to P - vaddsbs v3, v3, v10 - - vxor v4, v4, v11 - vxor v3, v3, v11 -.endm - -.macro SBFilter - Masks - - common_adjust v3, v4, v2, v5, 1 - - ;# outer tap adjustments - vspltisb v8, 1 - - vaddubm v13, v13, v8 ;# f += 1 - vsrab v13, v13, v8 ;# f >>= 1 - - vandc v13, v13, v10 ;# f &= ~hev - - vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f) - vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f) - - vxor v2, v2, v11 - vxor v3, v3, v11 - vxor v4, v4, v11 - vxor v5, v5, v11 -.endm - - .align 2 -mbloop_filter_horizontal_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r5, r6, r7, v8, v9, v10 - - load_data_y - - vp8_mbfilter - - stvx v1, r7, r6 ;# P2 - stvx v2, 0, r6 ;# P1 - stvx v3, r7, r3 ;# P0 - stvx v4, 0, r3 ;# Q0 - stvx v5, r4, r3 ;# Q1 - stvx v6, r5, r3 ;# Q2 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -;# r6 const signed char *limit -;# r7 const signed char *thresh -loop_filter_horizontal_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r5, r6, r7, v8, v9, v10 - - load_data_y - - SBFilter - - stvx v2, 0, r6 ;# P1 - stvx v3, r7, r3 ;# P0 - stvx v4, 0, r3 ;# Q0 - stvx v5, r4, r3 ;# Q1 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary. -;# So we can read in an entire mb aligned. However if we want to filter the mb -;# edge we run into problems. For the loopfilter we require 4 bytes before the mb -;# and 4 after for a total of 8 bytes. Reading 16 bytes inorder to get 4 is a bit -;# of a waste. So this is an even uglier way to get around that. -;# Using the regular register file words are read in and then saved back out to -;# memory to align and order them up. Then they are read in using the -;# vector register file. 
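In scalar C the staging just described looks roughly like this (an illustrative sketch of what the RLVmb macro below does; the names are mine, not the patch's):

    #include <string.h>

    /* Stage two rows of 8 edge pels (4 each side of the vertical edge)
       into an aligned 16-byte buffer; the vector unit then fetches the
       whole buffer with a single aligned lvx. */
    static void stage_two_rows(const unsigned char *edge, int stride,
                               unsigned char buf[16]) {
        memcpy(buf + 0,  edge - 4,          4);  /* row 0, pels before edge */
        memcpy(buf + 4,  edge,              4);  /* row 0, pels after edge  */
        memcpy(buf + 8,  edge + stride - 4, 4);  /* row 1, before           */
        memcpy(buf + 12, edge + stride,     4);  /* row 1, after            */
    }

WLVmb is the exact inverse: store the filtered vector to the buffer, then scatter the four words back into the frame buffer with scalar stores.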
-.macro RLVmb V, R - lwzux r0, r3, r4 - stw r0, 4(\R) - lwz r0,-4(r3) - stw r0, 0(\R) - lwzux r0, r3, r4 - stw r0,12(\R) - lwz r0,-4(r3) - stw r0, 8(\R) - lvx \V, 0, \R -.endm - -.macro WLVmb V, R - stvx \V, 0, \R - lwz r0,12(\R) - stwux r0, r3, r4 - lwz r0, 8(\R) - stw r0,-4(r3) - lwz r0, 4(\R) - stwux r0, r3, r4 - lwz r0, 0(\R) - stw r0,-4(r3) -.endm - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -;# r6 const signed char *limit -;# r7 const signed char *thresh -mbloop_filter_vertical_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - la r9, -48(r1) ;# temporary space for reading in vectors - sub r3, r3, r4 - - RLVmb v0, r9 - RLVmb v1, r9 - RLVmb v2, r9 - RLVmb v3, r9 - RLVmb v4, r9 - RLVmb v5, r9 - RLVmb v6, r9 - RLVmb v7, r9 - - transpose8x16_fwd - - build_constants r5, r6, r7, v8, v9, v10 - - vp8_mbfilter - - transpose8x16_inv - - add r3, r3, r4 - neg r4, r4 - - WLVmb v17, r9 - WLVmb v16, r9 - WLVmb v15, r9 - WLVmb v14, r9 - WLVmb v13, r9 - WLVmb v12, r9 - WLVmb v11, r9 - WLVmb v10, r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro RL V, R, P - lvx \V, 0, \R - add \R, \R, \P -.endm - -.macro WL V, R, P - stvx \V, 0, \R - add \R, \R, \P -.endm - -.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3 - ;# K = |P0-P1| already - Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1| - vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|) - vcmpgtub v10, v14, v0 - - Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1] - - max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|) - max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|) - max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|) - - vmaxub v14, v14, v4 ;# M = max interior abs diff - vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded - - Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0) - vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded - vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded - - ;# replace P1,Q1 w/signed versions - common_adjust \P0, \Q0, \P1, \Q1, 1 - - vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant - vsrab v13, v13, v1 - vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! 
hev - vsubsbs \Q1, \Q1, v13 - vaddsbs \P1, \P1, v13 - - vxor \P1, \P1, v11 ;# P1 - vxor \P0, \P0, v11 ;# P0 - vxor \Q0, \Q0, v11 ;# Q0 - vxor \Q1, \Q1, v11 ;# Q1 -.endm - - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -;# r6 const signed char *limit -;# r7 const signed char *thresh -loop_filter_vertical_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - addi r9, r3, 0 - RL v16, r9, r4 - RL v17, r9, r4 - RL v18, r9, r4 - RL v19, r9, r4 - RL v20, r9, r4 - RL v21, r9, r4 - RL v22, r9, r4 - RL v23, r9, r4 - RL v24, r9, r4 - RL v25, r9, r4 - RL v26, r9, r4 - RL v27, r9, r4 - RL v28, r9, r4 - RL v29, r9, r4 - RL v30, r9, r4 - lvx v31, 0, r9 - - Transpose16x16 - - vspltisb v1, 1 - - build_constants r5, r6, r7, v3, v2, v0 - - Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1| - - Fil v16, v17, v18, v19, v20, v21, v22, v23 - Fil v20, v21, v22, v23, v24, v25, v26, v27 - Fil v24, v25, v26, v27, v28, v29, v30, v31 - - Transpose16x16 - - addi r9, r3, 0 - WL v16, r9, r4 - WL v17, r9, r4 - WL v18, r9, r4 - WL v19, r9, r4 - WL v20, r9, r4 - WL v21, r9, r4 - WL v22, r9, r4 - WL v23, r9, r4 - WL v24, r9, r4 - WL v25, r9, r4 - WL v26, r9, r4 - WL v27, r9, r4 - WL v28, r9, r4 - WL v29, r9, r4 - WL v30, r9, r4 - stvx v31, 0, r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -.macro active_chroma_sel V - andi. r7, r3, 8 ;# row origin modulo 16 - add r7, r7, r7 ;# selects selectors - lis r12, _chromaSelectors@ha - la r0, _chromaSelectors@l(r12) - lwzux r0, r7, r0 ;# leave selector addr in r7 - - lvx \V, 0, r0 ;# mask to concatenate active U,V pels -.endm - -.macro hread_uv Dest, U, V, Offs, VMask - lvx \U, \Offs, r3 - lvx \V, \Offs, r4 - vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V -.endm - -.macro hwrite_uv New, U, V, Offs, Umask, Vmask - vperm \U, \New, \U, \Umask ;# Combine new pels with siblings - vperm \V, \New, \V, \Vmask - stvx \U, \Offs, r3 ;# Write to frame buffer - stvx \V, \Offs, r4 -.endm - -;# Process U,V in parallel. 
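A scalar picture of that parallel U/V handling (illustrative only, not part of the patch): hread_uv above concatenates the active 8 pels of a U row with the active 8 pels of the matching V row, so every filter step runs on both chroma planes at once.

    #include <string.h>

    /* off is 0 or 8, chosen by active_chroma_sel from the row origin:
       chroma rows are 8 pels wide, so only half of each 16-byte vector
       loaded from the frame buffer is live. */
    static void read_uv_row(const unsigned char *u, const unsigned char *v,
                            int off, unsigned char dest[16]) {
        memcpy(dest,     u + off, 8);  /* active half of the U vector */
        memcpy(dest + 8, v + off, 8);  /* active half of the V vector */
    }

hwrite_uv performs the inverse, merging the filtered pels back into their sibling halves before storing each plane.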
-.macro load_chroma_h - neg r9, r5 ;# r9 = -1 * stride - add r8, r9, r9 ;# r8 = -2 * stride - add r10, r5, r5 ;# r10 = 2 * stride - - active_chroma_sel v12 - - ;# P3, Q3 are read-only; need not save addresses or sibling pels - add r6, r8, r8 ;# r6 = -4 * stride - hread_uv v0, v14, v15, r6, v12 - add r6, r10, r5 ;# r6 = 3 * stride - hread_uv v7, v14, v15, r6, v12 - - ;# Others are read/write; save addresses and sibling pels - - add r6, r8, r9 ;# r6 = -3 * stride - hread_uv v1, v16, v17, r6, v12 - hread_uv v2, v18, v19, r8, v12 - hread_uv v3, v20, v21, r9, v12 - hread_uv v4, v22, v23, 0, v12 - hread_uv v5, v24, v25, r5, v12 - hread_uv v6, v26, v27, r10, v12 -.endm - -.macro uresult_sel V - load_g \V, 4(r7) -.endm - -.macro vresult_sel V - load_g \V, 8(r7) -.endm - -;# always write P1,P0,Q0,Q1 -.macro store_chroma_h - uresult_sel v11 - vresult_sel v12 - hwrite_uv v2, v18, v19, r8, v11, v12 - hwrite_uv v3, v20, v21, r9, v11, v12 - hwrite_uv v4, v22, v23, 0, v11, v12 - hwrite_uv v5, v24, v25, r5, v11, v12 -.endm - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed char *thresh -mbloop_filter_horizontal_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r6, r7, r8, v8, v9, v10 - - load_chroma_h - - vp8_mbfilter - - store_chroma_h - - hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2 - hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed char *thresh -loop_filter_horizontal_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r6, r7, r8, v8, v9, v10 - - load_chroma_h - - SBFilter - - store_chroma_h - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro R V, R - lwzux r0, r3, r5 - stw r0, 4(\R) - lwz r0,-4(r3) - stw r0, 0(\R) - lwzux r0, r4, r5 - stw r0,12(\R) - lwz r0,-4(r4) - stw r0, 8(\R) - lvx \V, 0, \R -.endm - - -.macro W V, R - stvx \V, 0, \R - lwz r0,12(\R) - stwux r0, r4, r5 - lwz r0, 8(\R) - stw r0,-4(r4) - lwz r0, 4(\R) - stwux r0, r3, r5 - lwz r0, 0(\R) - stw r0,-4(r3) -.endm - -.macro chroma_vread R - sub r3, r3, r5 ;# back up one line for simplicity - sub r4, r4, r5 - - R v0, \R - R v1, \R - R v2, \R - R v3, \R - R v4, \R - R v5, \R - R v6, \R - R v7, \R - - transpose8x16_fwd -.endm - -.macro chroma_vwrite R - - transpose8x16_inv - - add r3, r3, r5 - add r4, r4, r5 - neg r5, r5 ;# Write rows back in reverse order - - W v17, \R - W v16, \R - W v15, \R - W v14, \R - W v13, \R - W v12, \R - W v11, \R - W v10, \R -.endm - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed char *thresh -mbloop_filter_vertical_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - la r9, -48(r1) ;# temporary space for reading in vectors - - chroma_vread r9 - - build_constants r6, r7, r8, v8, v9, v10 - - vp8_mbfilter - - chroma_vwrite r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed 
char *thresh -loop_filter_vertical_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - la r9, -48(r1) ;# temporary space for reading in vectors - - chroma_vread r9 - - build_constants r6, r7, r8, v8, v9, v10 - - SBFilter - - chroma_vwrite r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=- - -.macro vp8_simple_filter - Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0) - vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit - - ;# preserve unsigned v0 and v3 - common_adjust v1, v2, v0, v3, 0 - - vxor v1, v1, v11 - vxor v2, v2, v11 ;# cvt Q0, P0 back to pels -.endm - -.macro simple_vertical - addi r8, 0, 16 - addi r7, r5, 32 - - lvx v0, 0, r5 - lvx v1, r8, r5 - lvx v2, 0, r7 - lvx v3, r8, r7 - - lis r12, _B_hihi@ha - la r0, _B_hihi@l(r12) - lvx v16, 0, r0 - - lis r12, _B_lolo@ha - la r0, _B_lolo@l(r12) - lvx v17, 0, r0 - - Transpose4times4x4 v16, v17 - vp8_simple_filter - - vxor v0, v0, v11 - vxor v3, v3, v11 ;# cvt Q0, P0 back to pels - - Transpose4times4x4 v16, v17 - - stvx v0, 0, r5 - stvx v1, r8, r5 - stvx v2, 0, r7 - stvx v3, r8, r7 -.endm - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -loop_filter_simple_horizontal_edge_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - mtspr 256, r12 ;# set VRSAVE - - ;# build constants - lvx v8, 0, r5 ;# flimit - - vspltisb v11, 8 - vspltisb v12, 4 - vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 - - neg r5, r4 ;# r5 = -1 * stride - add r6, r5, r5 ;# r6 = -2 * stride - - lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge - lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge - lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge - lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge - - vp8_simple_filter - - stvx v1, r5, r3 ;# store P0 - stvx v2, 0, r3 ;# store Q0 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro RLV Offs - stw r0, (\Offs*4)(r5) - lwzux r0, r7, r4 -.endm - -.macro WLV Offs - lwz r0, (\Offs*4)(r5) - stwux r0, r7, r4 -.endm - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -loop_filter_simple_vertical_edge_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - ;# build constants - lvx v8, 0, r5 ;# flimit - - vspltisb v11, 8 - vspltisb v12, 4 - vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 - - la r5, -96(r1) ;# temporary space for reading in vectors - - ;# Store 4 pels at word "Offs" in temp array, then advance r7 - ;# to next row and read another 4 pels from the frame buffer. - - subi r7, r3, 2 ;# r7 -> 2 pels before start - lwzx r0, 0, r7 ;# read first 4 pels - - ;# 16 unaligned word accesses - RLV 0 - RLV 4 - RLV 8 - RLV 12 - RLV 1 - RLV 5 - RLV 9 - RLV 13 - RLV 2 - RLV 6 - RLV 10 - RLV 14 - RLV 3 - RLV 7 - RLV 11 - - stw r0, (15*4)(r5) ;# write last 4 pels - - simple_vertical - - ;# Read temp array, write frame buffer. 
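-    ;# (sketch: RLV above stored row r at word (r & 3) * 4 + (r >> 2)
-    ;#  of the temp array, i.e.
-    ;#
-    ;#    for (r = 0; r < 16; ++r)
-    ;#      memcpy(buf + 4 * ((r & 3) * 4 + (r >> 2)), src + r * p - 2, 4);
-    ;#
-    ;#  so the four vectors read back already hold the 4x4 sub-blocks
-    ;#  that Transpose4times4x4 expects; WLV below is the inverse)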
- subi r7, r3, 2 ;# r7 -> 2 pels before start - lwzx r0, 0, r5 ;# read/write first 4 pels - stwx r0, 0, r7 - - WLV 4 - WLV 8 - WLV 12 - WLV 1 - WLV 5 - WLV 9 - WLV 13 - WLV 2 - WLV 6 - WLV 10 - WLV 14 - WLV 3 - WLV 7 - WLV 11 - WLV 15 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .data - -_chromaSelectors: - .long _B_hihi - .long _B_Ures0 - .long _B_Vres0 - .long 0 - .long _B_lolo - .long _B_Ures8 - .long _B_Vres8 - .long 0 - - .align 4 -_B_Vres8: - .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 - - .align 4 -_B_Ures8: - .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 - - .align 4 -_B_lolo: - .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 - - .align 4 -_B_Vres0: - .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 - .align 4 -_B_Ures0: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 - - .align 4 -_B_hihi: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp9/common/ppc/platform_altivec.asm b/vp9/common/ppc/platform_altivec.asm deleted file mode 100644 index f81d86f74..000000000 --- a/vp9/common/ppc/platform_altivec.asm +++ /dev/null @@ -1,59 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl save_platform_context - .globl restore_platform_context - -.macro W V P - stvx \V, 0, \P - addi \P, \P, 16 -.endm - -.macro R V P - lvx \V, 0, \P - addi \P, \P, 16 -.endm - -;# r3 context_ptr - .align 2 -save_platform_contex: - W v20, r3 - W v21, r3 - W v22, r3 - W v23, r3 - W v24, r3 - W v25, r3 - W v26, r3 - W v27, r3 - W v28, r3 - W v29, r3 - W v30, r3 - W v31, r3 - - blr - -;# r3 context_ptr - .align 2 -restore_platform_context: - R v20, r3 - R v21, r3 - R v22, r3 - R v23, r3 - R v24, r3 - R v25, r3 - R v26, r3 - R v27, r3 - R v28, r3 - R v29, r3 - R v30, r3 - R v31, r3 - - blr diff --git a/vp9/common/ppc/recon_altivec.asm b/vp9/common/ppc/recon_altivec.asm deleted file mode 100644 index dd39e05a8..000000000 --- a/vp9/common/ppc/recon_altivec.asm +++ /dev/null @@ -1,175 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - .globl recon4b_ppc - .globl recon2b_ppc - .globl recon_b_ppc - -.macro row_of16 Diff Pred Dst Stride - lvx v1, 0, \Pred ;# v1 = pred = p0..p15 - addi \Pred, \Pred, 16 ;# next pred - vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 - lvx v3, 0, \Diff ;# v3 = d0..d7 - vaddshs v2, v2, v3 ;# v2 = r0..r7 - vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 - lvx v3, r8, \Diff ;# v3 = d8..d15 - addi \Diff, \Diff, 32 ;# next diff - vaddshs v3, v3, v1 ;# v3 = r8..r15 - vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15 - stvx v2, 0, \Dst ;# to dst - add \Dst, \Dst, \Stride ;# next dst -.endm - - .text - .align 2 -;# r3 = short *diff_ptr, -;# r4 = unsigned char *pred_ptr, -;# r5 = unsigned char *dst_ptr, -;# r6 = int stride -recon4b_ppc: - mfspr r0, 256 ;# get old VRSAVE - stw r0, -8(r1) ;# save old VRSAVE to stack - oris r0, r0, 0xf000 - mtspr 256,r0 ;# set VRSAVE - - vxor v0, v0, v0 - li r8, 16 - - row_of16 r3, r4, r5, r6 - row_of16 r3, r4, r5, r6 - row_of16 r3, r4, r5, r6 - row_of16 r3, r4, r5, r6 - - lwz r12, -8(r1) ;# restore old VRSAVE from stack - mtspr 256, r12 ;# reset old VRSAVE - - blr - -.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels - lvx v1, 0, \Pred ;# v1 = pred = p0..p15 - vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 - lvx v3, 0, \Diff ;# v3 = d0..d7 - vaddshs v2, v2, v3 ;# v2 = r0..r7 - vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 - lvx v3, r8, \Diff ;# v2 = d8..d15 - vaddshs v3, v3, v1 ;# v3 = r8..r15 - vpkshus v2, v2, v3 ;# v3 = 8-bit r0..r15 - stvx v2, 0, r10 ;# 2 rows to dst from buf - lwz r0, 0(r10) -.if \write_first_four_pels - stw r0, 0(\Dst) - .else - stwux r0, \Dst, \Stride -.endif - lwz r0, 4(r10) - stw r0, 4(\Dst) - lwz r0, 8(r10) - stwux r0, \Dst, \Stride ;# advance dst to next row - lwz r0, 12(r10) - stw r0, 4(\Dst) -.endm - - .align 2 -;# r3 = short *diff_ptr, -;# r4 = unsigned char *pred_ptr, -;# r5 = unsigned char *dst_ptr, -;# r6 = int stride - -recon2b_ppc: - mfspr r0, 256 ;# get old VRSAVE - stw r0, -8(r1) ;# save old VRSAVE to stack - oris r0, r0, 0xf000 - mtspr 256,r0 ;# set VRSAVE - - vxor v0, v0, v0 - li r8, 16 - - la r10, -48(r1) ;# buf - - two_rows_of8 r3, r4, r5, r6, 1 - - addi r4, r4, 16; ;# next pred - addi r3, r3, 32; ;# next diff - - two_rows_of8 r3, r4, r5, r6, 0 - - lwz r12, -8(r1) ;# restore old VRSAVE from stack - mtspr 256, r12 ;# reset old VRSAVE - - blr - -.macro get_two_diff_rows - stw r0, 0(r10) - lwz r0, 4(r3) - stw r0, 4(r10) - lwzu r0, 32(r3) - stw r0, 8(r10) - lwz r0, 4(r3) - stw r0, 12(r10) - lvx v3, 0, r10 -.endm - - .align 2 -;# r3 = short *diff_ptr, -;# r4 = unsigned char *pred_ptr, -;# r5 = unsigned char *dst_ptr, -;# r6 = int stride -recon_b_ppc: - mfspr r0, 256 ;# get old VRSAVE - stw r0, -8(r1) ;# save old VRSAVE to stack - oris r0, r0, 0xf000 - mtspr 256,r0 ;# set VRSAVE - - vxor v0, v0, v0 - - la r10, -48(r1) ;# buf - - lwz r0, 0(r4) - stw r0, 0(r10) - lwz r0, 16(r4) - stw r0, 4(r10) - lwz r0, 32(r4) - stw r0, 8(r10) - lwz r0, 48(r4) - stw r0, 12(r10) - - lvx v1, 0, r10; ;# v1 = pred = p0..p15 - - lwz r0, 0(r3) ;# v3 = d0..d7 - - get_two_diff_rows - - vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7 - vaddshs v2, v2, v3; ;# v2 = r0..r7 - - lwzu r0, 32(r3) ;# v3 = d8..d15 - - get_two_diff_rows - - vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15 - vaddshs v3, v3, v1; ;# v3 = r8..r15 - - vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15 - stvx v2, 0, r10; ;# 16 pels to dst from buf - - lwz r0, 0(r10) - stw r0, 0(r5) - lwz r0, 4(r10) - stwux r0, r5, r6 - lwz r0, 8(r10) - stwux r0, r5, r6 - lwz r0, 12(r10) - stwx r0, r5, r6 - - lwz r12, -8(r1) ;# restore old VRSAVE 
from stack - mtspr 256, r12 ;# reset old VRSAVE - - blr diff --git a/vp9/common/ppc/systemdependent.c b/vp9/common/ppc/systemdependent.c deleted file mode 100644 index 941c11bd6..000000000 --- a/vp9/common/ppc/systemdependent.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "subpixel.h" -#include "loopfilter.h" -#include "recon.h" -#include "onyxc_int.h" - -void (*vp8_short_idct4x4)(short *input, short *output, int pitch); -void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch); -void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch); - -extern void (*vp9_post_proc_down_and_across)( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit -); - -extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit); -extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit); -extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit); -extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit); - -extern void vp9_post_proc_down_and_across_c -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int rows, - int cols, - int flimit -); -void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a); - -extern copy_mem_block_function *vp9_copy_mem16x16; -extern copy_mem_block_function *vp9_copy_mem8x8; -extern copy_mem_block_function *vp9_copy_mem8x4; - -// PPC -extern subpixel_predict_function sixtap_predict_ppc; -extern subpixel_predict_function sixtap_predict8x4_ppc; -extern subpixel_predict_function sixtap_predict8x8_ppc; -extern subpixel_predict_function sixtap_predict16x16_ppc; -extern subpixel_predict_function bilinear_predict4x4_ppc; -extern subpixel_predict_function bilinear_predict8x4_ppc; -extern subpixel_predict_function bilinear_predict8x8_ppc; -extern subpixel_predict_function bilinear_predict16x16_ppc; - -extern copy_mem_block_function copy_mem16x16_ppc; - -void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); - -extern void short_idct4x4llm_ppc(short *input, short *output, int pitch); - -// Generic C -extern subpixel_predict_function vp9_sixtap_predict_c; -extern subpixel_predict_function vp9_sixtap_predict8x4_c; -extern subpixel_predict_function vp9_sixtap_predict8x8_c; -extern subpixel_predict_function vp9_sixtap_predict16x16_c; -extern subpixel_predict_function vp9_bilinear_predict4x4_c; -extern subpixel_predict_function vp9_bilinear_predict8x4_c; -extern subpixel_predict_function vp9_bilinear_predict8x8_c; -extern subpixel_predict_function vp9_bilinear_predict16x16_c; - -extern copy_mem_block_function vp9_copy_mem16x16_c; -extern copy_mem_block_function vp9_copy_mem8x8_c; 
-extern copy_mem_block_function vp9_copy_mem8x4_c; - -void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); - -extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch); -extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch); -extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); - -// PPC -extern loop_filter_block_function loop_filter_mbv_ppc; -extern loop_filter_block_function loop_filter_bv_ppc; -extern loop_filter_block_function loop_filter_mbh_ppc; -extern loop_filter_block_function loop_filter_bh_ppc; - -extern loop_filter_block_function loop_filter_mbvs_ppc; -extern loop_filter_block_function loop_filter_bvs_ppc; -extern loop_filter_block_function loop_filter_mbhs_ppc; -extern loop_filter_block_function loop_filter_bhs_ppc; - -// Generic C -extern loop_filter_block_function vp9_loop_filter_mbv_c; -extern loop_filter_block_function vp9_loop_filter_bv_c; -extern loop_filter_block_function vp9_loop_filter_mbh_c; -extern loop_filter_block_function vp9_loop_filter_bh_c; - -extern loop_filter_block_function vp9_loop_filter_mbvs_c; -extern loop_filter_block_function vp9_loop_filter_bvs_c; -extern loop_filter_block_function vp9_loop_filter_mbhs_c; -extern loop_filter_block_function vp9_loop_filter_bhs_c; - -extern loop_filter_block_function *vp8_lf_mbvfull; -extern loop_filter_block_function *vp8_lf_mbhfull; -extern loop_filter_block_function *vp8_lf_bvfull; -extern loop_filter_block_function *vp8_lf_bhfull; - -extern loop_filter_block_function *vp8_lf_mbvsimple; -extern loop_filter_block_function *vp8_lf_mbhsimple; -extern loop_filter_block_function *vp8_lf_bvsimple; -extern loop_filter_block_function *vp8_lf_bhsimple; - -void vp9_clear_c(void) { -} - -void vp9_machine_specific_config(void) { - // Pure C: - vp9_clear_system_state = vp9_clear_c; - vp9_recon_b = vp9_recon_b_c; - vp9_recon4b = vp9_recon4b_c; - vp9_recon2b = vp9_recon2b_c; - - vp9_bilinear_predict16x16 = bilinear_predict16x16_ppc; - vp9_bilinear_predict8x8 = bilinear_predict8x8_ppc; - vp9_bilinear_predict8x4 = bilinear_predict8x4_ppc; - vp8_bilinear_predict = bilinear_predict4x4_ppc; - - vp9_sixtap_predict16x16 = sixtap_predict16x16_ppc; - vp9_sixtap_predict8x8 = sixtap_predict8x8_ppc; - vp9_sixtap_predict8x4 = sixtap_predict8x4_ppc; - vp9_sixtap_predict = sixtap_predict_ppc; - - vp8_short_idct4x4_1 = vp9_short_idct4x4llm_1_c; - vp8_short_idct4x4 = short_idct4x4llm_ppc; - vp8_dc_only_idct = vp8_dc_only_idct_c; - - vp8_lf_mbvfull = loop_filter_mbv_ppc; - vp8_lf_bvfull = loop_filter_bv_ppc; - vp8_lf_mbhfull = loop_filter_mbh_ppc; - vp8_lf_bhfull = loop_filter_bh_ppc; - - vp8_lf_mbvsimple = loop_filter_mbvs_ppc; - vp8_lf_bvsimple = loop_filter_bvs_ppc; - vp8_lf_mbhsimple = loop_filter_mbhs_ppc; - vp8_lf_bhsimple = loop_filter_bhs_ppc; - - vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c; - vp9_mbpost_proc_down = vp9_mbpost_proc_down_c; - vp9_mbpost_proc_across_ip = vp9_mbpost_proc_across_ip_c; - vp9_plane_add_noise = vp9_plane_add_noise_c; - - vp9_copy_mem16x16 = copy_mem16x16_ppc; - vp9_copy_mem8x8 = vp9_copy_mem8x8_c; - vp9_copy_mem8x4 = vp9_copy_mem8x4_c; - -} diff --git a/vp9/common/ppc/vp9_copy_altivec.asm b/vp9/common/ppc/vp9_copy_altivec.asm new file mode 100644 index 000000000..a4ce91583 --- /dev/null +++ 
b/vp9/common/ppc/vp9_copy_altivec.asm @@ -0,0 +1,47 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl copy_mem16x16_ppc + +;# r3 unsigned char *src +;# r4 int src_stride +;# r5 unsigned char *dst +;# r6 int dst_stride + +;# Make the assumption that input will not be aligned, +;# but the output will be. So two reads and a perm +;# for the input, but only one store for the output. +copy_mem16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xe000 + mtspr 256, r12 ;# set VRSAVE + + li r10, 16 + mtctr r10 + +cp_16x16_loop: + lvsl v0, 0, r3 ;# permutate value for alignment + + lvx v1, 0, r3 + lvx v2, r10, r3 + + vperm v1, v1, v2, v0 + + stvx v1, 0, r5 + + add r3, r3, r4 ;# increment source pointer + add r5, r5, r6 ;# increment destination pointer + + bdnz cp_16x16_loop + + mtspr 256, r11 ;# reset old VRSAVE + + blr diff --git a/vp9/common/ppc/vp9_filter_altivec.asm b/vp9/common/ppc/vp9_filter_altivec.asm new file mode 100644 index 000000000..4da2e94f9 --- /dev/null +++ b/vp9/common/ppc/vp9_filter_altivec.asm @@ -0,0 +1,1013 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + + .globl sixtap_predict_ppc + .globl sixtap_predict8x4_ppc + .globl sixtap_predict8x8_ppc + .globl sixtap_predict16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_hfilter V0, V1 + load_c \V0, HFilter, r5, r9, r10 + + addi r5, r5, 16 + lvx \V1, r5, r10 +.endm + +;# Vertical filtering +.macro Vprolog + load_c v0, VFilter, r6, r3, r10 + + vspltish v5, 8 + vspltish v6, 3 + vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v1, v0, 1 + vspltb v2, v0, 2 + vspltb v3, v0, 3 + vspltb v4, v0, 4 + vspltb v5, v0, 5 + vspltb v0, v0, 0 +.endm + +.macro vpre_load + Vprolog + li r10, 16 + lvx v10, 0, r9 ;# v10..v14 = first 5 rows + lvx v11, r10, r9 + addi r9, r9, 32 + lvx v12, 0, r9 + lvx v13, r10, r9 + addi r9, r9, 32 + lvx v14, 0, r9 +.endm + +.macro Msum Re, Ro, V, T, TMP + ;# (Re,Ro) += (V*T) + vmuleub \TMP, \V, \T ;# trashes v8 + vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary + vmuloub \TMP, \V, \T + vadduhm \Ro, \Ro, \TMP ;# Ro = odds +.endm + +.macro vinterp_no_store P0 P1 P2 P3 P4 P5 + vmuleub v8, \P0, v0 ;# 64 + 4 positive taps + vadduhm v16, v6, v8 + vmuloub v8, \P0, v0 + vadduhm v17, v6, v8 + Msum v16, v17, \P2, v2, v8 + Msum v16, v17, \P3, v3, v8 + Msum v16, v17, \P5, v5, v8 + + vmuleub v18, \P1, v1 ;# 2 negative taps + vmuloub v19, \P1, v1 + Msum v18, v19, \P4, v4, v8 + + vsubuhs v16, v16, v18 ;# subtract neg from pos + vsubuhs v17, v17, v19 + vsrh v16, v16, v7 ;# divide by 128 + vsrh v17, v17, v7 ;# v16 v17 = evens, odds + vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order + vmrglh v19, v16, v17 + vpkuhus \P0, v18, v19 ;# P0 = 8-bit result +.endm + +.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5 + vmuleub v24, \P0, v13 ;# 64 + 4 positive taps + vadduhm v21, v20, v24 + vmuloub v24, \P0, v13 + vadduhm v22, v20, v24 + Msum v21, v22, \P2, v15, v25 + Msum v21, v22, \P3, v16, v25 + Msum v21, v22, \P5, v18, v25 + + vmuleub v23, \P1, v14 ;# 2 negative taps + vmuloub v24, \P1, v14 + Msum v23, v24, \P4, v17, v25 + + vsubuhs v21, v21, v23 ;# subtract neg from pos + vsubuhs v22, v22, v24 + vsrh v21, v21, v19 ;# divide by 128 + vsrh v22, v22, v19 ;# v16 v17 = evens, odds + vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order + vmrglh v24, v21, v22 + vpkuhus \P0, v23, v24 ;# P0 = 8-bit result +.endm + + +.macro Vinterp P0 P1 P2 P3 P4 P5 + vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5 + stvx \P0, 0, r7 + add r7, r7, r8 ;# 33 ops per 16 pels +.endm + + +.macro luma_v P0, P1, P2, P3, P4, P5 + addi r9, r9, 16 ;# P5 = newest input row + lvx \P5, 0, r9 + Vinterp \P0, \P1, \P2, \P3, \P4, \P5 +.endm + +.macro luma_vtwo + luma_v v10, v11, v12, v13, v14, v15 + luma_v v11, v12, v13, v14, v15, v10 +.endm + +.macro luma_vfour + luma_vtwo + luma_v v12, v13, v14, v15, v10, v11 + luma_v v13, v14, v15, v10, v11, v12 +.endm + +.macro luma_vsix + luma_vfour + luma_v v14, v15, v10, v11, v12, v13 + luma_v v15, v10, v11, v12, v13, v14 +.endm + +.macro Interp4 R I I4 + vmsummbm \R, v13, \I, v15 + vmsummbm \R, v14, \I4, \R +.endm + +.macro Read8x8 VD, RS, RP, increment_counter + lvsl v21, 0, \RS ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. 
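+    ;# (sketch of the idiom: lvsl yields a byte-shift selector from the
+    ;#  low address bits, and the two aligned loads plus vperm amount to
+    ;#
+    ;#    k  = rs & 15;
+    ;#    vd = bytes k .. k+15 of { *(rs & ~15), *((rs & ~15) + 16) };
+    ;#
+    ;#  i.e. an unaligned 16-byte load built from aligned accesses)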
+ lvx \VD, 0, \RS + lvx v20, r10, \RS + +.if \increment_counter + add \RS, \RS, \RP +.endif + + vperm \VD, \VD, v20, v21 +.endm + +.macro interp_8x8 R + vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456 + vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A + Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3 + vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx + Interp4 v21, v21, \R ;# v21 = result 4 5 6 7 + + vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7 + vsrh \R, \R, v19 + + vpkuhus \R, \R, \R ;# saturate and pack + +.endm + +.macro Read4x4 VD, RS, RP, increment_counter + lvsl v21, 0, \RS ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v20, 0, \RS + +.if \increment_counter + add \RS, \RS, \RP +.endif + + vperm \VD, v20, v20, v21 +.endm + .text + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +sixtap_predict_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xff87 + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- vertical_only_4x4 + + ;# load up horizontal filter + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. + beq- store_4x4 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + + b second_pass_4x4 + +vertical_only_4x4: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_4x4: + load_c v20, b_hilo_4x4, 0, r9, r10 + load_c v21, b_hilo, 0, r9, r10 + + ;# reposition input so that it can go through the + ;# filtering phase with one pass. 
+ vperm v0, v0, v1, v20 ;# 0 1 x x + vperm v2, v2, v3, v20 ;# 2 3 x x + vperm v4, v4, v5, v20 ;# 4 5 x x + vperm v6, v6, v7, v20 ;# 6 7 x x + + vperm v0, v0, v2, v21 ;# 0 1 2 3 + vperm v4, v4, v6, v21 ;# 4 5 6 7 + + vsldoi v1, v0, v4, 4 + vsldoi v2, v0, v4, 8 + vsldoi v3, v0, v4, 12 + + vsldoi v5, v4, v8, 4 + + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + + stvx v0, 0, r1 + + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 4(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 8(r1) + stw r0, 0(r7) + add r7, r7, r8 + + lwz r0, 12(r1) + stw r0, 0(r7) + + b exit_4x4 + +store_4x4: + + stvx v2, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v3, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v4, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v5, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + +exit_4x4: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro w_8x8 V, D, R, P + stvx \V, 0, r1 + lwz \R, 0(r1) + stw \R, 0(r7) + lwz \R, 4(r1) + stw \R, 4(r7) + add \D, \D, \P +.endm + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +sixtap_predict8x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- second_pass_pre_copy_8x4 + + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. 
+ beq- store_8x4 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + + b second_pass_8x4 + +second_pass_pre_copy_8x4: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_8x4: + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 + vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 + vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x4 + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + + b exit_8x4 + +store_aligned_8x4: + + load_c v10, b_hilo, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + + b exit_8x4 + +store_8x4: + cmpi cr0, r8, 8 + beq cr0, store_aligned2_8x4 + + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + + b exit_8x4 + +store_aligned2_8x4: + load_c v10, b_hilo, 0, r9, r10 + + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + +exit_8x4: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +;# Because the width that needs to be filtered will fit in a single altivec +;# register there is no need to loop. Everything can stay in registers. +sixtap_predict8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffc0 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + slwi. r5, r5, 5 ;# index into horizontal filter array + + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq- second_pass_pre_copy_8x8 + + load_hfilter v13, v14 + + ;# rounding added in on the multiply + vspltisw v16, 8 + vspltisw v15, 3 + vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 + + ;# Load up permutation constants + load_c v16, B_0123, 0, r9, r10 + load_c v17, B_4567, 0, r9, r10 + load_c v18, B_89AB, 0, r9, r10 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + addi r9, r3, 0 + li r10, 16 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + Read8x8 v9, r3, r4, 1 + + slwi. 
r6, r6, 4 ;# index into vertical filter array + + ;# filter a line + interp_8x8 v2 + interp_8x8 v3 + interp_8x8 v4 + interp_8x8 v5 + interp_8x8 v6 + interp_8x8 v7 + interp_8x8 v8 + interp_8x8 v9 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional 5 lines that are needed + ;# for the vertical filter. + beq- store_8x8 + + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r9, r9, r4 + sub r9, r9, r4 + + Read8x8 v0, r9, r4, 1 + Read8x8 v1, r9, r4, 0 + Read8x8 v10, r3, r4, 1 + Read8x8 v11, r3, r4, 1 + Read8x8 v12, r3, r4, 0 + + interp_8x8 v0 + interp_8x8 v1 + interp_8x8 v10 + interp_8x8 v11 + interp_8x8 v12 + + b second_pass_8x8 + +second_pass_pre_copy_8x8: + ;# only needed if there is a vertical filter present + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + li r10, 16 + + Read8x8 v0, r3, r4, 1 + Read8x8 v1, r3, r4, 1 + Read8x8 v2, r3, r4, 1 + Read8x8 v3, r3, r4, 1 + Read8x8 v4, r3, r4, 1 + Read8x8 v5, r3, r4, 1 + Read8x8 v6, r3, r4, 1 + Read8x8 v7, r3, r4, 1 + Read8x8 v8, r3, r4, 1 + Read8x8 v9, r3, r4, 1 + Read8x8 v10, r3, r4, 1 + Read8x8 v11, r3, r4, 1 + Read8x8 v12, r3, r4, 0 + + slwi r6, r6, 4 ;# index into vertical filter array + +second_pass_8x8: + load_c v13, VFilter, r6, r9, r10 + + vspltish v15, 8 + vspltish v20, 3 + vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + vspltb v14, v13, 1 + vspltb v15, v13, 2 + vspltb v16, v13, 3 + vspltb v17, v13, 4 + vspltb v18, v13, 5 + vspltb v13, v13, 0 + + vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 + vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 + vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 + vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 + vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9 + vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10 + vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11 + vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12 + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x8 + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + + b exit_8x8 + +store_aligned_8x8: + + load_c v10, b_hilo, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + + b exit_8x8 + +store_8x8: + cmpi cr0, r8, 8 + beq cr0, store_aligned2_8x8 + + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + w_8x8 v8, r7, r0, r8 + w_8x8 v9, r7, r0, r8 + + b exit_8x8 + +store_aligned2_8x8: + load_c v10, b_hilo, 0, r9, r10 + + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + vperm v8, v8, v9, v10 + + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + addi r7, r7, 16 + stvx v8, 0, r7 + +exit_8x8: + + addi r1, r1, 32 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch + +;# Two pass filtering. First pass is Horizontal edges, second pass is vertical +;# edges. One of the filters can be null, but both won't be. 
Needs to use a +;# temporary buffer because the source buffer can't be modified and the buffer +;# for the destination is not large enough to hold the temporary data. +sixtap_predict16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xf000 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-416(r1) ;# create space on the stack + + ;# Three possiblities + ;# 1. First filter is null. Don't use a temp buffer. + ;# 2. Second filter is null. Don't use a temp buffer. + ;# 3. Neither are null, use temp buffer. + + ;# First Pass (horizontal edge) + ;# setup pointers for src + ;# if possiblity (1) then setup the src pointer to be the orginal and jump + ;# to second pass. this is based on if x_offset is 0. + + ;# load up horizontal filter + slwi. r5, r5, 5 ;# index into horizontal filter array + + load_hfilter v4, v5 + + beq- copy_horizontal_16x21 + + ;# Back off input buffer by 2 bytes. Need 2 before and 3 after + addi r3, r3, -2 + + slwi. r6, r6, 4 ;# index into vertical filter array + + ;# setup constants + ;# v14 permutation value for alignment + load_c v14, b_hperm, 0, r9, r10 + + ;# These statements are guessing that there won't be a second pass, + ;# but if there is then inside the bypass they need to be set + li r0, 16 ;# prepare for no vertical filter + + ;# Change the output pointer and pitch to be the actual + ;# desination instead of a temporary buffer. + addi r9, r7, 0 + addi r5, r8, 0 + + ;# no vertical filter, so write the output from the first pass + ;# directly into the output buffer. + beq- no_vertical_filter_bypass + + ;# if the second filter is not null then need to back off by 2*pitch + sub r3, r3, r4 + sub r3, r3, r4 + + ;# setup counter for the number of lines that are going to be filtered + li r0, 21 + + ;# use the stack as temporary storage + la r9, 48(r1) + li r5, 16 + +no_vertical_filter_bypass: + + mtctr r0 + + ;# rounding added in on the multiply + vspltisw v10, 8 + vspltisw v12, 3 + vslw v12, v10, v12 ;# 0x00000040000000400000004000000040 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v13, 7 + + ;# index to the next set of vectors in the row. + li r10, 16 + li r12, 32 + +horizontal_loop_16x16: + + lvsl v15, 0, r3 ;# permutate value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes. + ;# input will can span three vectors if not aligned correctly. + lvx v1, 0, r3 + lvx v2, r10, r3 + lvx v3, r12, r3 + + vperm v8, v1, v2, v15 + vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified + + vsldoi v11, v8, v9, 4 + + ;# set 0 + vmsummbm v6, v4, v8, v12 ;# taps times elements + vmsummbm v0, v5, v11, v6 + + ;# set 1 + vsldoi v10, v8, v9, 1 + vsldoi v11, v8, v9, 5 + + vmsummbm v6, v4, v10, v12 + vmsummbm v1, v5, v11, v6 + + ;# set 2 + vsldoi v10, v8, v9, 2 + vsldoi v11, v8, v9, 6 + + vmsummbm v6, v4, v10, v12 + vmsummbm v2, v5, v11, v6 + + ;# set 3 + vsldoi v10, v8, v9, 3 + vsldoi v11, v8, v9, 7 + + vmsummbm v6, v4, v10, v12 + vmsummbm v3, v5, v11, v6 + + vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F + + vsrh v0, v0, v13 ;# divide v0, v1 by 128 + vsrh v1, v1, v13 + + vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result + vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result + + stvx v0, 0, r9 + add r9, r9, r5 + + add r3, r3, r4 + + bdnz horizontal_loop_16x16 + + ;# check again to see if vertical filter needs to be done. 
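+    ;# (scalar sketch of this routine's control flow:
+    ;#
+    ;#    if (x_offset == 0) { copy 21 rows to tmp; vfilter tmp -> dst; }
+    ;#    else if (y_offset == 0) hfilter 16 rows directly -> dst;
+    ;#    else { hfilter 21 rows, starting 2 above, -> tmp;
+    ;#           vfilter tmp -> dst; }
+    ;#
+    ;#  the 21 = 16 + 5 rows cover the 2-before/3-after reach of the
+    ;#  vertical six-tap filter)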
+ cmpi cr0, r6, 0 + beq cr0, end_16x16 + + ;# yes there is, so go to the second pass + b second_pass_16x16 + +copy_horizontal_16x21: + li r10, 21 + mtctr r10 + + li r10, 16 + + sub r3, r3, r4 + sub r3, r3, r4 + + ;# this is done above if there is a horizontal filter, + ;# if not it needs to be done down here. + slwi r6, r6, 4 ;# index into vertical filter array + + ;# always write to the stack when doing a horizontal copy + la r9, 48(r1) + +copy_horizontal_loop_16x21: + lvsl v15, 0, r3 ;# permutate value for alignment + + lvx v1, 0, r3 + lvx v2, r10, r3 + + vperm v8, v1, v2, v15 + + stvx v8, 0, r9 + addi r9, r9, 16 + + add r3, r3, r4 + + bdnz copy_horizontal_loop_16x21 + +second_pass_16x16: + + ;# always read from the stack when doing a vertical filter + la r9, 48(r1) + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v7, 7 + + vpre_load + + luma_vsix + luma_vsix + luma_vfour + +end_16x16: + + addi r1, r1, 416 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +HFilter: + .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12 + .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0 + .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36 + .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0 + .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50 + .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 + .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77 + .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0 + .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93 + .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0 + .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108 + .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0 + .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123 + .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 + + .align 4 +VFilter: + .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + + .align 4 +b_hperm: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + + .align 4 +B_0123: + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + + .align 4 +B_4567: + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + + .align 4 +B_89AB: + .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + + .align 4 +b_hilo: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + + .align 4 +b_hilo_4x4: + .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 diff --git a/vp9/common/ppc/vp9_filter_bilinear_altivec.asm b/vp9/common/ppc/vp9_filter_bilinear_altivec.asm new file mode 100644 index 000000000..fd8aa665f --- /dev/null +++ b/vp9/common/ppc/vp9_filter_bilinear_altivec.asm @@ -0,0 +1,677 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. 
All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl bilinear_predict4x4_ppc + .globl bilinear_predict8x4_ppc + .globl bilinear_predict8x8_ppc + .globl bilinear_predict16x16_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +.macro load_vfilter V0, V1 + load_c \V0, vfilter_b, r6, r9, r10 + + addi r6, r6, 16 + lvx \V1, r6, r10 +.endm + +.macro HProlog jump_label + ;# load up horizontal filter + slwi. r5, r5, 4 ;# index into horizontal filter array + + ;# index to the next set of vectors in the row. + li r10, 16 + li r12, 32 + + ;# downshift by 7 ( divide by 128 ) at the end + vspltish v19, 7 + + ;# If there isn't any filtering to be done for the horizontal, then + ;# just skip to the second pass. + beq \jump_label + + load_c v20, hfilter_b, r5, r9, r0 + + ;# setup constants + ;# v28 permutation value for alignment + load_c v28, b_hperm_b, 0, r9, r0 + + ;# rounding added in on the multiply + vspltisw v21, 8 + vspltisw v18, 3 + vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 + + slwi. r6, r6, 5 ;# index into vertical filter array +.endm + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm input +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro HFilter V + vperm v24, v21, v21, v10 ;# v24 = 0123 1234 2345 3456 + vperm v25, v21, v21, v11 ;# v25 = 4567 5678 6789 789A + + vmsummbm v24, v20, v24, v18 + vmsummbm v25, v20, v25, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + + vsrh v24, v24, v19 ;# divide v24 by 128 + + vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result +.endm + +.macro hfilter_8 V, increment_counter + lvsl v17, 0, r3 ;# permute value for alignment + + ;# input to filter is 9 bytes wide, output is 8 bytes. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + + HFilter \V +.endm + + +.macro load_and_align_8 V, increment_counter + lvsl v17, 0, r3 ;# permute value for alignment + + ;# the input may be unaligned and can span two vectors, + ;# so load both and permute.
+ lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + + vperm \V, v21, v22, v17 +.endm + +.macro write_aligned_8 V, increment_counter + stvx \V, 0, r7 + +.if \increment_counter + add r7, r7, r8 +.endif +.endm + +.macro vfilter_16 P0 P1 + vmuleub v22, \P0, v20 ;# 64 + 4 positive taps + vadduhm v22, v18, v22 + vmuloub v23, \P0, v20 + vadduhm v23, v18, v23 + + vmuleub v24, \P1, v21 + vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary + vmuloub v25, \P1, v21 + vadduhm v23, v23, v25 ;# Ro = odds + + vsrh v22, v22, v19 ;# divide by 128 + vsrh v23, v23, v19 ;# v16 v17 = evens, odds + vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order + vmrglh v23, v22, v23 + vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result +.endm + + +.macro w_8x8 V, D, R, P + stvx \V, 0, r1 + lwz \R, 0(r1) + stw \R, 0(r7) + lwz \R, 4(r1) + stw \R, 4(r7) + add \D, \D, \P +.endm + + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict4x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_4x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq store_out_4x4_b + + hfilter_8 v4, 0 + + b second_pass_4x4_b + +second_pass_4x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + +second_pass_4x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +store_out_4x4_b: + + stvx v0, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v1, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v2, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + add r7, r7, r8 + + stvx v3, 0, r1 + lwz r0, 0(r1) + stw r0, 0(r7) + +exit_4x4: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict8x4_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xf830 + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x4_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. 
+ beq store_out_8x4_b + + hfilter_8 v4, 0 + + b second_pass_8x4_b + +second_pass_8x4_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + +second_pass_8x4_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + +store_out_8x4_b: + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x4_b + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + + b exit_8x4 + +store_aligned_8x4_b: + load_c v10, b_hilo_b, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + +exit_8x4: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict8x8_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff0 + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + stwu r1,-32(r1) ;# create space on the stack + + HProlog second_pass_8x8_pre_copy_b + + ;# Load up permutation constants + load_c v10, b_0123_b, 0, r9, r12 + load_c v11, b_4567_b, 0, r9, r12 + + hfilter_8 v0, 1 + hfilter_8 v1, 1 + hfilter_8 v2, 1 + hfilter_8 v3, 1 + hfilter_8 v4, 1 + hfilter_8 v5, 1 + hfilter_8 v6, 1 + hfilter_8 v7, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter. + beq store_out_8x8_b + + hfilter_8 v8, 0 + + b second_pass_8x8_b + +second_pass_8x8_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_8 v0, 1 + load_and_align_8 v1, 1 + load_and_align_8 v2, 1 + load_and_align_8 v3, 1 + load_and_align_8 v4, 1 + load_and_align_8 v5, 1 + load_and_align_8 v6, 1 + load_and_align_8 v7, 1 + load_and_align_8 v8, 0 + +second_pass_8x8_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + +store_out_8x8_b: + + cmpi cr0, r8, 8 + beq cr0, store_aligned_8x8_b + + w_8x8 v0, r7, r0, r8 + w_8x8 v1, r7, r0, r8 + w_8x8 v2, r7, r0, r8 + w_8x8 v3, r7, r0, r8 + w_8x8 v4, r7, r0, r8 + w_8x8 v5, r7, r0, r8 + w_8x8 v6, r7, r0, r8 + w_8x8 v7, r7, r0, r8 + + b exit_8x8 + +store_aligned_8x8_b: + load_c v10, b_hilo_b, 0, r9, r10 + + vperm v0, v0, v1, v10 + vperm v2, v2, v3, v10 + vperm v4, v4, v5, v10 + vperm v6, v6, v7, v10 + + stvx v0, 0, r7 + addi r7, r7, 16 + stvx v2, 0, r7 + addi r7, r7, 16 + stvx v4, 0, r7 + addi r7, r7, 16 + stvx v6, 0, r7 + +exit_8x8: + + addi r1, r1, 32 ;# recover stack + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# Filters a horizontal line +;# expects: +;# r3 src_ptr +;# r4 pitch +;# r10 16 +;# r12 32 +;# v17 perm input +;# v18 rounding +;# v19 shift +;# v20 filter taps +;# v21 tmp +;# v22 tmp +;# v23 tmp +;# v24 tmp +;# v25 tmp +;# v26 tmp +;# v27 tmp +;# v28 perm output +;# +.macro hfilter_16 V, increment_counter + + lvsl v17, 0, r3 ;# permute value for alignment + + ;# input to filter is 21 bytes wide, output is 16 bytes.
+ ;# input can span three vectors if not aligned correctly. + lvx v21, 0, r3 + lvx v22, r10, r3 + lvx v23, r12, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + vperm v21, v21, v22, v17 + vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified + + ;# set 0 + vmsummbm v24, v20, v21, v18 ;# taps times elements + + ;# set 1 + vsldoi v23, v21, v22, 1 + vmsummbm v25, v20, v23, v18 + + ;# set 2 + vsldoi v23, v21, v22, 2 + vmsummbm v26, v20, v23, v18 + + ;# set 3 + vsldoi v23, v21, v22, 3 + vmsummbm v27, v20, v23, v18 + + vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) + vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F + + vsrh v24, v24, v19 ;# divide v24, v25 by 128 + vsrh v25, v25, v19 + + vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result + vperm \V, \V, v0, v28 ;# \V = correctly-ordered result +.endm + +.macro load_and_align_16 V, increment_counter + lvsl v17, 0, r3 ;# permute value for alignment + + ;# the 16-byte input may be unaligned and can span two vectors, + ;# so load both and permute. + lvx v21, 0, r3 + lvx v22, r10, r3 + +.if \increment_counter + add r3, r3, r4 +.endif + + vperm \V, v21, v22, v17 +.endm + +.macro write_16 V, increment_counter + stvx \V, 0, r7 + +.if \increment_counter + add r7, r7, r8 +.endif +.endm + + .align 2 +;# r3 unsigned char * src +;# r4 int src_pitch +;# r5 int x_offset +;# r6 int y_offset +;# r7 unsigned char * dst +;# r8 int dst_pitch +bilinear_predict16x16_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + HProlog second_pass_16x16_pre_copy_b + + hfilter_16 v0, 1 + hfilter_16 v1, 1 + hfilter_16 v2, 1 + hfilter_16 v3, 1 + hfilter_16 v4, 1 + hfilter_16 v5, 1 + hfilter_16 v6, 1 + hfilter_16 v7, 1 + hfilter_16 v8, 1 + hfilter_16 v9, 1 + hfilter_16 v10, 1 + hfilter_16 v11, 1 + hfilter_16 v12, 1 + hfilter_16 v13, 1 + hfilter_16 v14, 1 + hfilter_16 v15, 1 + + ;# Finished filtering main horizontal block. If there is no + ;# vertical filtering, jump to storing the data. Otherwise + ;# load up and filter the additional line that is needed + ;# for the vertical filter.
+ beq store_out_16x16_b + + hfilter_16 v16, 0 + + b second_pass_16x16_b + +second_pass_16x16_pre_copy_b: + slwi r6, r6, 5 ;# index into vertical filter array + + load_and_align_16 v0, 1 + load_and_align_16 v1, 1 + load_and_align_16 v2, 1 + load_and_align_16 v3, 1 + load_and_align_16 v4, 1 + load_and_align_16 v5, 1 + load_and_align_16 v6, 1 + load_and_align_16 v7, 1 + load_and_align_16 v8, 1 + load_and_align_16 v9, 1 + load_and_align_16 v10, 1 + load_and_align_16 v11, 1 + load_and_align_16 v12, 1 + load_and_align_16 v13, 1 + load_and_align_16 v14, 1 + load_and_align_16 v15, 1 + load_and_align_16 v16, 0 + +second_pass_16x16_b: + vspltish v20, 8 + vspltish v18, 3 + vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 + + load_vfilter v20, v21 + + vfilter_16 v0, v1 + vfilter_16 v1, v2 + vfilter_16 v2, v3 + vfilter_16 v3, v4 + vfilter_16 v4, v5 + vfilter_16 v5, v6 + vfilter_16 v6, v7 + vfilter_16 v7, v8 + vfilter_16 v8, v9 + vfilter_16 v9, v10 + vfilter_16 v10, v11 + vfilter_16 v11, v12 + vfilter_16 v12, v13 + vfilter_16 v13, v14 + vfilter_16 v14, v15 + vfilter_16 v15, v16 + +store_out_16x16_b: + + write_16 v0, 1 + write_16 v1, 1 + write_16 v2, 1 + write_16 v3, 1 + write_16 v4, 1 + write_16 v5, 1 + write_16 v6, 1 + write_16 v7, 1 + write_16 v8, 1 + write_16 v9, 1 + write_16 v10, 1 + write_16 v11, 1 + write_16 v12, 1 + write_16 v13, 1 + write_16 v14, 1 + write_16 v15, 0 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + + .align 4 +hfilter_b: + .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 + .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 + .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 + .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 + .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 + .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 + .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 + .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 + + .align 4 +vfilter_b: + .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 + .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 + .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 + .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 + .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 + .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 + .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 + + .align 4 +b_hperm_b: + .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + + .align 4 +b_0123_b: + .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + + .align 4 +b_4567_b: + .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 + +b_hilo_b: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp9/common/ppc/vp9_idctllm_altivec.asm b/vp9/common/ppc/vp9_idctllm_altivec.asm new file 
mode 100644 index 000000000..117d9cfc8 --- /dev/null +++ b/vp9/common/ppc/vp9_idctllm_altivec.asm @@ -0,0 +1,189 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl short_idct4x4llm_ppc + +.macro load_c V, LABEL, OFF, R0, R1 + lis \R0, \LABEL@ha + la \R1, \LABEL@l(\R0) + lvx \V, \OFF, \R1 +.endm + +;# r3 short *input +;# r4 short *output +;# r5 int pitch + .align 2 +short_idct4x4llm_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xfff8 + mtspr 256, r12 ;# set VRSAVE + + load_c v8, sinpi8sqrt2, 0, r9, r10 + load_c v9, cospi8sqrt2minus1, 0, r9, r10 + load_c v10, hi_hi, 0, r9, r10 + load_c v11, lo_lo, 0, r9, r10 + load_c v12, shift_16, 0, r9, r10 + + li r10, 16 + lvx v0, 0, r3 ;# input ip[0], ip[ 4] + lvx v1, r10, r3 ;# input ip[8], ip[12] + + ;# first pass + vupkhsh v2, v0 + vupkhsh v3, v1 + vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8] + vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8] + + vupklsh v0, v0 + vmulosh v4, v0, v8 + vsraw v4, v4, v12 + vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2) + + vupklsh v1, v1 + vmulosh v5, v1, v9 + vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v1 + + vsubsws v4, v4, v5 ;# c1 + + vmulosh v3, v1, v8 + vsraw v3, v3, v12 + vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2) + + vmulosh v5, v0, v9 + vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v0 + + vaddsws v3, v3, v5 ;# d1 + + vaddsws v0, v6, v3 ;# a1 + d1 + vsubsws v3, v6, v3 ;# a1 - d1 + + vaddsws v1, v7, v4 ;# b1 + c1 + vsubsws v2, v7, v4 ;# b1 - c1 + + ;# transpose input + vmrghw v4, v0, v1 ;# a0 b0 a1 b1 + vmrghw v5, v2, v3 ;# c0 d0 c1 d1 + + vmrglw v6, v0, v1 ;# a2 b2 a3 b3 + vmrglw v7, v2, v3 ;# c2 d2 c3 d3 + + vperm v0, v4, v5, v10 ;# a0 b0 c0 d0 + vperm v1, v4, v5, v11 ;# a1 b1 c1 d1 + + vperm v2, v6, v7, v10 ;# a2 b2 c2 d2 + vperm v3, v6, v7, v11 ;# a3 b3 c3 d3 + + ;# second pass + vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8] + vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8] + + vmulosh v4, v1, v8 + vsraw v4, v4, v12 + vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2) + + vmulosh v5, v3, v9 + vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v3 + + vsubsws v4, v4, v5 ;# c1 + + vmulosh v2, v3, v8 + vsraw v2, v2, v12 + vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2) + + vmulosh v5, v1, v9 + vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) + vaddsws v5, v5, v1 + + vaddsws v3, v2, v5 ;# d1 + + vaddsws v0, v6, v3 ;# a1 + d1 + vsubsws v3, v6, v3 ;# a1 - d1 + + vaddsws v1, v7, v4 ;# b1 + c1 + vsubsws v2, v7, v4 ;# b1 - c1 + + vspltish v6, 4 + vspltish v7, 3 + + vpkswss v0, v0, v1 + vpkswss v1, v2, v3 + + vaddshs v0, v0, v6 + vaddshs v1, v1, v6 + + vsrah v0, v0, v7 + vsrah v1, v1, v7 + + ;# transpose output + vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3 + vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3 + + vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1 + vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3 + + stwu r1,-416(r1) ;# create space on the stack + + stvx v0, 0, r1 + lwz r6, 0(r1) + stw r6, 0(r4) + lwz r6, 4(r1) + stw r6, 4(r4) + + add r4, r4, r5 + + lwz r6, 8(r1) + stw r6, 0(r4) + lwz r6, 12(r1) + stw r6, 4(r4) + + add r4, r4, r5 + + stvx v1, 0, r1 + lwz r6, 0(r1) + stw r6, 0(r4) + lwz r6, 4(r1) + stw 
r6, 4(r4) + + add r4, r4, r5 + + lwz r6, 8(r1) + stw r6, 0(r4) + lwz r6, 12(r1) + stw r6, 4(r4) + + addi r1, r1, 416 ;# recover stack + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 4 +sinpi8sqrt2: + .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468 + + .align 4 +cospi8sqrt2minus1: + .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091 + + .align 4 +shift_16: + .long 16, 16, 16, 16 + + .align 4 +hi_hi: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 + + .align 4 +lo_lo: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 diff --git a/vp9/common/ppc/vp9_loopfilter_altivec.c b/vp9/common/ppc/vp9_loopfilter_altivec.c new file mode 100644 index 000000000..cae171805 --- /dev/null +++ b/vp9/common/ppc/vp9_loopfilter_altivec.c @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9_loopfilter.h" +#include "vp9_onyxc_int.h" + +typedef void loop_filter_function_y_ppc +( + unsigned char *s, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh +); + +typedef void loop_filter_function_uv_ppc +( + unsigned char *u, // source pointer + unsigned char *v, // source pointer + int p, // pitch + const signed char *flimit, + const signed char *limit, + const signed char *thresh +); + +typedef void loop_filter_function_s_ppc +( + unsigned char *s, // source pointer + int p, // pitch + const signed char *flimit +); + +loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc; +loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc; +loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc; +loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc; + +loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc; +loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc; +loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc; +loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc; + +loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc; +loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc; + +// Horizontal MB filtering +void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); + + if (u_ptr) + mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); +} + +void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim); +} + +// Vertical MB Filtering +void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); + + if (u_ptr) + mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); +} + +void 
loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim); +} + +// Horizontal B Filtering +void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + // These should all be done at once with one call, instead of 3 + loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); + + if (u_ptr) + loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr); +} + +void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim); + loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim); + loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim); +} + +// Vertical B Filtering +void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr); + + if (u_ptr) + loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr); +} + +void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, loop_filter_info *lfi) { + (void)u_ptr; + (void)v_ptr; + (void)uv_stride; + loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim); + loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim); + loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim); +} diff --git a/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm b/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm new file mode 100644 index 000000000..61df4e976 --- /dev/null +++ b/vp9/common/ppc/vp9_loopfilter_filters_altivec.asm @@ -0,0 +1,1253 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl mbloop_filter_horizontal_edge_y_ppc + .globl loop_filter_horizontal_edge_y_ppc + .globl mbloop_filter_vertical_edge_y_ppc + .globl loop_filter_vertical_edge_y_ppc + + .globl mbloop_filter_horizontal_edge_uv_ppc + .globl loop_filter_horizontal_edge_uv_ppc + .globl mbloop_filter_vertical_edge_uv_ppc + .globl loop_filter_vertical_edge_uv_ppc + + .globl loop_filter_simple_horizontal_edge_ppc + .globl loop_filter_simple_vertical_edge_ppc + + .text +;# We often need to perform transposes (and other transpose-like operations) +;# on matrices of data. 
This is simplified by the fact that we usually +;# operate on hunks of data whose dimensions are powers of 2, or at least +;# divisible by highish powers of 2. +;# +;# These operations can be very confusing. They become more straightforward +;# when we think of them as permutations of address bits: Concatenate a +;# group of vector registers and think of it as occupying a block of +;# memory beginning at address zero. The low four bits 0...3 of the +;# address then correspond to position within a register; the higher-order +;# address bits select the register. +;# +;# Although register selection, at the code level, is arbitrary, things +;# are simpler if we use contiguous ranges of register numbers, simpler +;# still if the low-order bits of the register number correspond to +;# conceptual address bits. We do this whenever reasonable. +;# +;# A 16x16 transpose can then be thought of as an operation on +;# a 256-element block of memory. It takes 8 bits 0...7 to address this +;# memory and the effect of a transpose is to interchange address bit +;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the +;# column, which is interchanged with the row addressed by bits 4..7. +;# +;# The altivec merge instructions provide a rapid means of effecting +;# many of these transforms. They operate at three widths (8,16,32). +;# Writing V(x) for vector register #x, paired merges permute address +;# indices as follows. +;# +;# 0->1 1->2 2->3 3->(4+d) (4+s)->0: +;# +;# vmrghb V( x), V( y), V( y + (1<<s)) +;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# =0= 1->2 2->3 3->(4+d) (4+s)->1: +;# +;# vmrghh V( x), V( y), V( y + (1<<s)) +;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# =0= =1= 2->3 3->(4+d) (4+s)->2: +;# +;# vmrghw V( x), V( y), V( y + (1<<s)) +;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s)) +;# +;# There is no doubleword merge instruction, but we can effect +;# +;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence: +;# +;# vperm V( x), V( y), V( y + (1<<s)), b_hihi +;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), b_lolo +;# +;# A 16x16 transpose is then the interchange of address bits +;# 0<->4 1<->5 2<->6 3<->7, which we accomplish by +;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0. +;# +;# Except for the fact that the destination registers get written +;# before we are done referencing the old contents, the cyclic transform +;# is effected by +;# +;# x = 0; do { +;# vmrghb V(2x), V(x), V(x+8); +;# vmrglb V(2x+1), V(x), V(x+8); +;# } while( ++x < 8); +;# +;# For clarity, and because we can afford it, we do this transpose +;# using all 32 registers, alternating the banks 0..15 and 16 .. 31, +;# leaving the final result in 16 .. 31, as the lower registers are +;# used in the filtering itself. +;# +.macro Tpair A, B, X, Y + vmrghb \A, \X, \Y + vmrglb \B, \X, \Y +.endm + +;# Each step takes 8*2 = 16 instructions + +.macro t16_even + Tpair v16,v17, v0,v8 + Tpair v18,v19, v1,v9 + Tpair v20,v21, v2,v10 + Tpair v22,v23, v3,v11 + Tpair v24,v25, v4,v12 + Tpair v26,v27, v5,v13 + Tpair v28,v29, v6,v14 + Tpair v30,v31, v7,v15 +.endm + +.macro t16_odd + Tpair v0,v1, v16,v24 + Tpair v2,v3, v17,v25 + Tpair v4,v5, v18,v26 + Tpair v6,v7, v19,v27 + Tpair v8,v9, v20,v28 + Tpair v10,v11, v21,v29 + Tpair v12,v13, v22,v30 + Tpair v14,v15, v23,v31 +.endm + +;# Whole transpose takes 4*16 = 64 instructions + +.macro t16_full + t16_odd + t16_even + t16_odd + t16_even +.endm + +;# Vertical edge filtering requires transposes. For the simple filter, +;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels +;# each. Writing 0 ... 63 for the pixel indices, the desired result is: +;# +;# v0 = 0 1 ... 14 15 +;# v1 = 16 17 ... 30 31 +;# v2 = 32 33 ... 46 47 +;# v3 = 48 49 ... 62 63 +;# +;# In frame-buffer memory, the layout is: +;# +;# 0 16 32 48 +;# 1 17 33 49 +;# ... +;# 15 31 47 63.
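The address-bit picture above can be made concrete with a small scalar model in C. The sketch below is illustrative only and is not part of this patch; all names in it are invented for the example. It models the 16 input registers as a 256-byte block, performs one pass of paired vmrghb/vmrglb merges (one t16_even or t16_odd step), and verifies that the pass rotates the eight address bits left by one, so that four passes interchange bits 0<->4 1<->5 2<->6 3<->7, i.e. transpose the 16x16 byte matrix:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* One merge pass: Tpair V(2x),V(2x+1), V(x),V(x+8) for x = 0..7.
     * In address terms, out[rotl1(a)] = in[a] for every 8-bit address a. */
    static void merge_pass(uint8_t *m) {
      uint8_t out[256];
      for (int x = 0; x < 8; x++)
        for (int k = 0; k < 16; k++) {
          out[(x << 5) | (k << 1) | 0] = m[(0 << 7) | (x << 4) | k]; /* from V(x)   */
          out[(x << 5) | (k << 1) | 1] = m[(1 << 7) | (x << 4) | k]; /* from V(x+8) */
        }
      memcpy(m, out, 256);
    }

    int main(void) {
      uint8_t m[256];
      for (int i = 0; i < 256; i++) m[i] = (uint8_t)i;
      for (int pass = 0; pass < 4; pass++) merge_pass(m);  /* t16_full */
      for (int r = 0; r < 16; r++)        /* four passes == transpose  */
        for (int c = 0; c < 16; c++)
          assert(m[(r << 4) | c] == (uint8_t)((c << 4) | r));
      return 0;
    }

Four rotations by one bit equal one rotation by four bits, which is exactly the bit interchange that defines the transpose; this is the whole content of the cyclic-transform argument above.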
+;# +;# We begin by reading the data 32 bits at a time (using scalar operations) +;# into a temporary array, reading the rows of the array into vector registers, +;# with the following layout: +;# +;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 +;# v1 = 1 17 33 49 5 21 ... 45 61 +;# v2 = 2 18 ... 46 62 +;# v3 = 3 19 ... 47 63 +;# +;# From the "address-bit" perspective discussed above, we simply need to +;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone. +;# In other words, we transpose each of the four 4x4 submatrices. +;# +;# This transformation is its own inverse, and we need to perform it +;# again before writing the pixels back into the frame buffer. +;# +;# It acts in place on registers v0...v3, uses v4...v7 as temporaries, +;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors +;# defined above. We think of both groups of 4 registers as having +;# "addresses" {0,1,2,3} * 16. +;# +.macro Transpose4times4x4 Vlo, Vhi + + ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5= + + vmrghb v4, v0, v1 + vmrglb v5, v0, v1 + vmrghb v6, v2, v3 + vmrglb v7, v2, v3 + + ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1 + + vmrghh v0, v4, v6 + vmrglh v1, v4, v6 + vmrghh v2, v5, v7 + vmrglh v3, v5, v7 + + ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5= + + vmrghw v4, v0, v1 + vmrglw v5, v0, v1 + vmrghw v6, v2, v3 + vmrglw v7, v2, v3 + + ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3 + + vperm v0, v4, v6, \Vlo + vperm v1, v4, v6, \Vhi + vperm v2, v5, v7, \Vlo + vperm v3, v5, v7, \Vhi +.endm +;# end Transpose4times4x4 + + +;# Normal mb vertical edge filter transpose. +;# +;# We read 8 columns of data, initially in the following pattern: +;# +;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1) +;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3) +;# ... +;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15) +;# +;# and wish to convert to: +;# +;# (0,0) ... (0,15) +;# (1,0) ... (1,15) +;# ... +;# (7,0) ... (7,15). +;# +;# In "address bit" language, we wish to map +;# +;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7. +;# +;# This can be accomplished by 4 iterations of the cyclic transform +;# +;# I -> (I+1) mod 7; +;# +;# each iteration can be realized by (d=0, s=2): +;# +;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4); +;# +;# The input/output is in registers v0...v7. We use v10...v17 as mirrors; +;# preserving v8 = sign converter. 
+;# +;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the +;# result lands in the "mirror" registers v10...v17 +;# +.macro t8x16_odd + Tpair v10, v11, v0, v4 + Tpair v12, v13, v1, v5 + Tpair v14, v15, v2, v6 + Tpair v16, v17, v3, v7 +.endm + +.macro t8x16_even + Tpair v0, v1, v10, v14 + Tpair v2, v3, v11, v15 + Tpair v4, v5, v12, v16 + Tpair v6, v7, v13, v17 +.endm + +.macro transpose8x16_fwd + t8x16_odd + t8x16_even + t8x16_odd + t8x16_even +.endm + +.macro transpose8x16_inv + t8x16_odd + t8x16_even + t8x16_odd +.endm + +.macro Transpose16x16 + vmrghb v0, v16, v24 + vmrglb v1, v16, v24 + vmrghb v2, v17, v25 + vmrglb v3, v17, v25 + vmrghb v4, v18, v26 + vmrglb v5, v18, v26 + vmrghb v6, v19, v27 + vmrglb v7, v19, v27 + vmrghb v8, v20, v28 + vmrglb v9, v20, v28 + vmrghb v10, v21, v29 + vmrglb v11, v21, v29 + vmrghb v12, v22, v30 + vmrglb v13, v22, v30 + vmrghb v14, v23, v31 + vmrglb v15, v23, v31 + vmrghb v16, v0, v8 + vmrglb v17, v0, v8 + vmrghb v18, v1, v9 + vmrglb v19, v1, v9 + vmrghb v20, v2, v10 + vmrglb v21, v2, v10 + vmrghb v22, v3, v11 + vmrglb v23, v3, v11 + vmrghb v24, v4, v12 + vmrglb v25, v4, v12 + vmrghb v26, v5, v13 + vmrglb v27, v5, v13 + vmrghb v28, v6, v14 + vmrglb v29, v6, v14 + vmrghb v30, v7, v15 + vmrglb v31, v7, v15 + vmrghb v0, v16, v24 + vmrglb v1, v16, v24 + vmrghb v2, v17, v25 + vmrglb v3, v17, v25 + vmrghb v4, v18, v26 + vmrglb v5, v18, v26 + vmrghb v6, v19, v27 + vmrglb v7, v19, v27 + vmrghb v8, v20, v28 + vmrglb v9, v20, v28 + vmrghb v10, v21, v29 + vmrglb v11, v21, v29 + vmrghb v12, v22, v30 + vmrglb v13, v22, v30 + vmrghb v14, v23, v31 + vmrglb v15, v23, v31 + vmrghb v16, v0, v8 + vmrglb v17, v0, v8 + vmrghb v18, v1, v9 + vmrglb v19, v1, v9 + vmrghb v20, v2, v10 + vmrglb v21, v2, v10 + vmrghb v22, v3, v11 + vmrglb v23, v3, v11 + vmrghb v24, v4, v12 + vmrglb v25, v4, v12 + vmrghb v26, v5, v13 + vmrglb v27, v5, v13 + vmrghb v28, v6, v14 + vmrglb v29, v6, v14 + vmrghb v30, v7, v15 + vmrglb v31, v7, v15 +.endm + +;# load_g loads a global vector (whose address is in the local variable Gptr) +;# into vector register Vreg. Trashes r0 +.macro load_g Vreg, Gptr + lwz r0, \Gptr + lvx \Vreg, 0, r0 +.endm + +;# exploit the saturation here. if the answer is negative +;# it will be clamped to 0. orring 0 with a positive +;# number will be the positive number (abs) +;# RES = abs( A-B), trashes TMP +.macro Abs RES, TMP, A, B + vsububs \RES, \A, \B + vsububs \TMP, \B, \A + vor \RES, \RES, \TMP +.endm + +;# RES = Max( RES, abs( A-B)), trashes TMP +.macro max_abs RES, TMP, A, B + vsububs \TMP, \A, \B + vmaxub \RES, \RES, \TMP + vsububs \TMP, \B, \A + vmaxub \RES, \RES, \TMP +.endm + +.macro Masks + ;# build masks + ;# input is all 8 bit unsigned (0-255). need to + ;# do abs(vala-valb) > limit. but no need to compare each + ;# value to the limit. find the max of the absolute differences + ;# and compare that to the limit. 
+ ;# First hev + Abs v14, v13, v2, v3 ;# |P1 - P0| + max_abs v14, v13, v5, v4 ;# |Q1 - Q0| + + vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded + + ;# Next limit + max_abs v14, v13, v0, v1 ;# |P3 - P2| + max_abs v14, v13, v1, v2 ;# |P2 - P1| + max_abs v14, v13, v6, v5 ;# |Q2 - Q1| + max_abs v14, v13, v7, v6 ;# |Q3 - Q2| + + vcmpgtub v9, v14, v9 ;# R = true if limit exceeded + + ;# flimit + Abs v14, v13, v3, v4 ;# |P0 - Q0| + + vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded + + vor v8, v8, v9 ;# R = true if flimit or limit exceeded + ;# done building masks +.endm + +.macro build_constants RFL, RLI, RTH, FL, LI, TH + ;# build constants + lvx \FL, 0, \RFL ;# flimit + lvx \LI, 0, \RLI ;# limit + lvx \TH, 0, \RTH ;# thresh + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 +.endm + +.macro load_data_y + ;# setup strides/pointers to be able to access + ;# all of the data + add r5, r4, r4 ;# r5 = 2 * stride + sub r6, r3, r5 ;# r6 -> 2 rows back + neg r7, r4 ;# r7 = -stride + + ;# load 16 pixels worth of data to work on + sub r0, r6, r5 ;# r0 -> 4 rows back (temp) + lvx v0, 0, r0 ;# P3 (read only) + lvx v1, r7, r6 ;# P2 + lvx v2, 0, r6 ;# P1 + lvx v3, r7, r3 ;# P0 + lvx v4, 0, r3 ;# Q0 + lvx v5, r4, r3 ;# Q1 + lvx v6, r5, r3 ;# Q2 + add r0, r3, r5 ;# r0 -> 2 rows fwd (temp) + lvx v7, r4, r0 ;# Q3 (read only) +.endm + +;# Expects +;# v10 == HEV +;# v13 == tmp +;# v14 == tmp +.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT + vxor \P1, \P1, v11 ;# SP1 + vxor \P0, \P0, v11 ;# SP0 + vxor \Q0, \Q0, v11 ;# SQ0 + vxor \Q1, \Q1, v11 ;# SQ1 + + vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1) +.if \HEV_PRESENT + vand v13, v13, v10 ;# f &= hev +.endif + vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + + vandc v13, v13, v8 ;# f &= mask + + vspltisb v8, 3 + vspltisb v9, 4 + + vaddsbs v14, v13, v9 ;# f1 = c (f+4) + vaddsbs v15, v13, v8 ;# f2 = c (f+3) + + vsrab v13, v14, v8 ;# f1 >>= 3 + vsrab v15, v15, v8 ;# f2 >>= 3 + + vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1) + vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2) +.endm + +.macro vp8_mbfilter + Masks + + ;# start the filtering here + vxor v1, v1, v11 ;# SP2 + vxor v2, v2, v11 ;# SP1 + vxor v3, v3, v11 ;# SP0 + vxor v4, v4, v11 ;# SQ0 + vxor v5, v5, v11 ;# SQ1 + vxor v6, v6, v11 ;# SQ2 + + ;# add outer taps if we have high edge variance + vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1) + + vsubsbs v14, v4, v3 ;# SQ0-SP0 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 + vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0)) + + vandc v13, v13, v8 ;# f &= mask + vand v15, v13, v10 ;# f2 = f & hev + + ;# save bottom 3 bits so that we round one side +4 and the other +3 + vspltisb v8, 3 + vspltisb v9, 4 + + vaddsbs v14, v15, v9 ;# f1 = c (f+4) + vaddsbs v15, v15, v8 ;# f2 = c (f+3) + + vsrab v14, v14, v8 ;# f1 >>= 3 + vsrab v15, v15, v8 ;# f2 >>= 3 + + vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1) + vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2) + + ;# only apply wider filter if not high edge variance + vandc v13, v13, v10 ;# f &= ~hev + + vspltisb v9, 2 + vnor v8, v8, v8 + vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f + vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f + vspltisb v8, 9 + + ;# roughly 1/7th difference across boundary + vspltish v10, 7 + vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v8, v13 + vaddshs v14, v14, v9 ;# += 63 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah
v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v6, v6, v10 ;# subtract from Q and add to P + vaddsbs v1, v1, v10 + + vxor v6, v6, v11 + vxor v1, v1, v11 + + ;# roughly 2/7th difference across boundary + vspltish v10, 7 + vaddubm v12, v8, v8 + vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v12, v13 + vaddshs v14, v14, v9 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v5, v5, v10 ;# subtract from Q and add to P + vaddsbs v2, v2, v10 + + vxor v5, v5, v11 + vxor v2, v2, v11 + + ;# roughly 3/7th difference across boundary + vspltish v10, 7 + vaddubm v12, v12, v8 + vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) + vmulesb v15, v12, v13 + vaddshs v14, v14, v9 + vaddshs v15, v15, v9 + vsrah v14, v14, v10 ;# >>= 7 + vsrah v15, v15, v10 + vmrglh v10, v15, v14 + vmrghh v15, v15, v14 + + vpkshss v10, v15, v10 ;# X = saturated down to bytes + + vsubsbs v4, v4, v10 ;# subtract from Q and add to P + vaddsbs v3, v3, v10 + + vxor v4, v4, v11 + vxor v3, v3, v11 +.endm + +.macro SBFilter + Masks + + common_adjust v3, v4, v2, v5, 1 + + ;# outer tap adjustments + vspltisb v8, 1 + + vaddubm v13, v13, v8 ;# f += 1 + vsrab v13, v13, v8 ;# f >>= 1 + + vandc v13, v13, v10 ;# f &= ~hev + + vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f) + vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f) + + vxor v2, v2, v11 + vxor v3, v3, v11 + vxor v4, v4, v11 + vxor v5, v5, v11 +.endm + + .align 2 +mbloop_filter_horizontal_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r5, r6, r7, v8, v9, v10 + + load_data_y + + vp8_mbfilter + + stvx v1, r7, r6 ;# P2 + stvx v2, 0, r6 ;# P1 + stvx v3, r7, r3 ;# P0 + stvx v4, 0, r3 ;# Q0 + stvx v5, r4, r3 ;# Q1 + stvx v6, r5, r3 ;# Q2 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +loop_filter_horizontal_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r5, r6, r7, v8, v9, v10 + + load_data_y + + SBFilter + + stvx v2, 0, r6 ;# P1 + stvx v3, r7, r3 ;# P0 + stvx v4, 0, r3 ;# Q0 + stvx v5, r4, r3 ;# Q1 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# Filtering a vertical mb. Each mb is aligned on a 16-byte boundary, +;# so we can read in an entire mb aligned. However, if we want to filter the mb +;# edge we run into problems. For the loopfilter we require 4 bytes before the mb +;# and 4 after for a total of 8 bytes. Reading 16 bytes in order to get 4 is a bit +;# of a waste, so this is an even uglier way to get around that. +;# Using the regular register file, words are read in and then saved back out to +;# memory to align and order them. Then they are read in using the +;# vector register file.
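In C terms, the gather trick the comment describes looks roughly like the sketch below. This is an illustrative model only, not part of the patch; the function name is invented, pels are assumed to be 8-bit, and the byte ordering shown matches the big-endian scalar word loads the assembly relies on:

    #include <stdint.h>
    #include <string.h>

    /* Model of the RLVmb macro that follows: gather the 4 pels before and the
     * 4 pels after the vertical mb edge for two consecutive rows into one
     * aligned 16-byte scratch buffer, advancing the row pointer by the pitch
     * before each row (as lwzux does, with r3 starting one row above). */
    static void rlv_mb(const uint8_t **s, int pitch, uint8_t buf[16]) {
      for (int half = 0; half < 2; half++) {
        *s += pitch;                            /* lwzux: update, then load     */
        memcpy(buf + 8 * half + 4, *s, 4);      /* 4 pels at and after the edge */
        memcpy(buf + 8 * half + 0, *s - 4, 4);  /* 4 pels before the edge       */
      }
      /* the assembly now issues a single aligned lvx from buf */
    }

The point of the detour through memory is that lvx can only load 16-byte-aligned data; two scalar word loads per row are cheaper than reading and permuting 16 unaligned bytes just to keep 8. The RLVmb/WLVmb macros below are the assembly realization of this gather and the matching scatter.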
+.macro RLVmb V, R + lwzux r0, r3, r4 + stw r0, 4(\R) + lwz r0,-4(r3) + stw r0, 0(\R) + lwzux r0, r3, r4 + stw r0,12(\R) + lwz r0,-4(r3) + stw r0, 8(\R) + lvx \V, 0, \R +.endm + +.macro WLVmb V, R + stvx \V, 0, \R + lwz r0,12(\R) + stwux r0, r3, r4 + lwz r0, 8(\R) + stw r0,-4(r3) + lwz r0, 4(\R) + stwux r0, r3, r4 + lwz r0, 0(\R) + stw r0,-4(r3) +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +mbloop_filter_vertical_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + sub r3, r3, r4 + + RLVmb v0, r9 + RLVmb v1, r9 + RLVmb v2, r9 + RLVmb v3, r9 + RLVmb v4, r9 + RLVmb v5, r9 + RLVmb v6, r9 + RLVmb v7, r9 + + transpose8x16_fwd + + build_constants r5, r6, r7, v8, v9, v10 + + vp8_mbfilter + + transpose8x16_inv + + add r3, r3, r4 + neg r4, r4 + + WLVmb v17, r9 + WLVmb v16, r9 + WLVmb v15, r9 + WLVmb v14, r9 + WLVmb v13, r9 + WLVmb v12, r9 + WLVmb v11, r9 + WLVmb v10, r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro RL V, R, P + lvx \V, 0, \R + add \R, \R, \P +.endm + +.macro WL V, R, P + stvx \V, 0, \R + add \R, \R, \P +.endm + +.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3 + ;# K = |P0-P1| already + Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1| + vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|) + vcmpgtub v10, v14, v0 + + Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1] + + max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|) + max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|) + max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|) + + vmaxub v14, v14, v4 ;# M = max interior abs diff + vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded + + Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0) + vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded + vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded + + ;# replace P1,Q1 w/signed versions + common_adjust \P0, \Q0, \P1, \Q1, 1 + + vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant + vsrab v13, v13, v1 + vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! 
hev + vsubsbs \Q1, \Q1, v13 + vaddsbs \P1, \P1, v13 + + vxor \P1, \P1, v11 ;# P1 + vxor \P0, \P0, v11 ;# P0 + vxor \Q0, \Q0, v11 ;# Q0 + vxor \Q1, \Q1, v11 ;# Q1 +.endm + + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +;# r6 const signed char *limit +;# r7 const signed char *thresh +loop_filter_vertical_edge_y_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + addi r9, r3, 0 + RL v16, r9, r4 + RL v17, r9, r4 + RL v18, r9, r4 + RL v19, r9, r4 + RL v20, r9, r4 + RL v21, r9, r4 + RL v22, r9, r4 + RL v23, r9, r4 + RL v24, r9, r4 + RL v25, r9, r4 + RL v26, r9, r4 + RL v27, r9, r4 + RL v28, r9, r4 + RL v29, r9, r4 + RL v30, r9, r4 + lvx v31, 0, r9 + + Transpose16x16 + + vspltisb v1, 1 + + build_constants r5, r6, r7, v3, v2, v0 + + Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1| + + Fil v16, v17, v18, v19, v20, v21, v22, v23 + Fil v20, v21, v22, v23, v24, v25, v26, v27 + Fil v24, v25, v26, v27, v28, v29, v30, v31 + + Transpose16x16 + + addi r9, r3, 0 + WL v16, r9, r4 + WL v17, r9, r4 + WL v18, r9, r4 + WL v19, r9, r4 + WL v20, r9, r4 + WL v21, r9, r4 + WL v22, r9, r4 + WL v23, r9, r4 + WL v24, r9, r4 + WL v25, r9, r4 + WL v26, r9, r4 + WL v27, r9, r4 + WL v28, r9, r4 + WL v29, r9, r4 + WL v30, r9, r4 + stvx v31, 0, r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- +.macro active_chroma_sel V + andi. r7, r3, 8 ;# row origin modulo 16 + add r7, r7, r7 ;# selects selectors + lis r12, _chromaSelectors@ha + la r0, _chromaSelectors@l(r12) + lwzux r0, r7, r0 ;# leave selector addr in r7 + + lvx \V, 0, r0 ;# mask to concatenate active U,V pels +.endm + +.macro hread_uv Dest, U, V, Offs, VMask + lvx \U, \Offs, r3 + lvx \V, \Offs, r4 + vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V +.endm + +.macro hwrite_uv New, U, V, Offs, Umask, Vmask + vperm \U, \New, \U, \Umask ;# Combine new pels with siblings + vperm \V, \New, \V, \Vmask + stvx \U, \Offs, r3 ;# Write to frame buffer + stvx \V, \Offs, r4 +.endm + +;# Process U,V in parallel. 
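The chroma path packs the active halves of a U row and a V row into one 16-byte vector so that both planes go through the filter in a single pass. The C sketch below is illustrative only and not part of the patch; pack_uv, unpack_uv, and active_off are invented names, where active_off (0 or 8) stands in for the selector chosen by active_chroma_sel from the row origin modulo 16:

    #include <stdint.h>
    #include <string.h>

    /* Model of hread_uv: concatenate the 8 active U pels and 8 active V pels
     * of a row into one 16-byte lane (what the vperm with _B_hihi/_B_lolo does). */
    static void pack_uv(const uint8_t u_row[16], const uint8_t v_row[16],
                        int active_off, uint8_t lane[16]) {
      memcpy(lane + 0, u_row + active_off, 8);  /* active half of the U row */
      memcpy(lane + 8, v_row + active_off, 8);  /* active half of the V row */
    }

    /* Model of hwrite_uv: scatter the filtered pels back to their home rows,
     * leaving the inactive halves untouched. */
    static void unpack_uv(const uint8_t lane[16], uint8_t u_row[16],
                          uint8_t v_row[16], int active_off) {
      memcpy(u_row + active_off, lane + 0, 8);
      memcpy(v_row + active_off, lane + 8, 8);
    }

Filtering U and V together this way halves the number of filter invocations for chroma, at the cost of the extra permutes on load and store.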
+.macro load_chroma_h + neg r9, r5 ;# r9 = -1 * stride + add r8, r9, r9 ;# r8 = -2 * stride + add r10, r5, r5 ;# r10 = 2 * stride + + active_chroma_sel v12 + + ;# P3, Q3 are read-only; need not save addresses or sibling pels + add r6, r8, r8 ;# r6 = -4 * stride + hread_uv v0, v14, v15, r6, v12 + add r6, r10, r5 ;# r6 = 3 * stride + hread_uv v7, v14, v15, r6, v12 + + ;# Others are read/write; save addresses and sibling pels + + add r6, r8, r9 ;# r6 = -3 * stride + hread_uv v1, v16, v17, r6, v12 + hread_uv v2, v18, v19, r8, v12 + hread_uv v3, v20, v21, r9, v12 + hread_uv v4, v22, v23, 0, v12 + hread_uv v5, v24, v25, r5, v12 + hread_uv v6, v26, v27, r10, v12 +.endm + +.macro uresult_sel V + load_g \V, 4(r7) +.endm + +.macro vresult_sel V + load_g \V, 8(r7) +.endm + +;# always write P1,P0,Q0,Q1 +.macro store_chroma_h + uresult_sel v11 + vresult_sel v12 + hwrite_uv v2, v18, v19, r8, v11, v12 + hwrite_uv v3, v20, v21, r9, v11, v12 + hwrite_uv v4, v22, v23, 0, v11, v12 + hwrite_uv v5, v24, v25, r5, v11, v12 +.endm + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +mbloop_filter_horizontal_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r6, r7, r8, v8, v9, v10 + + load_chroma_h + + vp8_mbfilter + + store_chroma_h + + hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2 + hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +loop_filter_horizontal_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xffff + mtspr 256, r12 ;# set VRSAVE + + build_constants r6, r7, r8, v8, v9, v10 + + load_chroma_h + + SBFilter + + store_chroma_h + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro R V, R + lwzux r0, r3, r5 + stw r0, 4(\R) + lwz r0,-4(r3) + stw r0, 0(\R) + lwzux r0, r4, r5 + stw r0,12(\R) + lwz r0,-4(r4) + stw r0, 8(\R) + lvx \V, 0, \R +.endm + + +.macro W V, R + stvx \V, 0, \R + lwz r0,12(\R) + stwux r0, r4, r5 + lwz r0, 8(\R) + stw r0,-4(r4) + lwz r0, 4(\R) + stwux r0, r3, r5 + lwz r0, 0(\R) + stw r0,-4(r3) +.endm + +.macro chroma_vread R + sub r3, r3, r5 ;# back up one line for simplicity + sub r4, r4, r5 + + R v0, \R + R v1, \R + R v2, \R + R v3, \R + R v4, \R + R v5, \R + R v6, \R + R v7, \R + + transpose8x16_fwd +.endm + +.macro chroma_vwrite R + + transpose8x16_inv + + add r3, r3, r5 + add r4, r4, r5 + neg r5, r5 ;# Write rows back in reverse order + + W v17, \R + W v16, \R + W v15, \R + W v14, \R + W v13, \R + W v12, \R + W v11, \R + W v10, \R +.endm + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed char *thresh +mbloop_filter_vertical_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + + chroma_vread r9 + + build_constants r6, r7, r8, v8, v9, v10 + + vp8_mbfilter + + chroma_vwrite r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .align 2 +;# r3 unsigned char *u +;# r4 unsigned char *v +;# r5 int p +;# r6 const signed char *flimit +;# r7 const signed char *limit +;# r8 const signed 
char *thresh +loop_filter_vertical_edge_uv_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + la r9, -48(r1) ;# temporary space for reading in vectors + + chroma_vread r9 + + build_constants r6, r7, r8, v8, v9, v10 + + SBFilter + + chroma_vwrite r9 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=- + +.macro vp8_simple_filter + Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0) + vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit + + ;# preserve unsigned v0 and v3 + common_adjust v1, v2, v0, v3, 0 + + vxor v1, v1, v11 + vxor v2, v2, v11 ;# cvt Q0, P0 back to pels +.endm + +.macro simple_vertical + addi r8, 0, 16 + addi r7, r5, 32 + + lvx v0, 0, r5 + lvx v1, r8, r5 + lvx v2, 0, r7 + lvx v3, r8, r7 + + lis r12, _B_hihi@ha + la r0, _B_hihi@l(r12) + lvx v16, 0, r0 + + lis r12, _B_lolo@ha + la r0, _B_lolo@l(r12) + lvx v17, 0, r0 + + Transpose4times4x4 v16, v17 + vp8_simple_filter + + vxor v0, v0, v11 + vxor v3, v3, v11 ;# cvt Q0, P0 back to pels + + Transpose4times4x4 v16, v17 + + stvx v0, 0, r5 + stvx v1, r8, r5 + stvx v2, 0, r7 + stvx v3, r8, r7 +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +loop_filter_simple_horizontal_edge_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + mtspr 256, r12 ;# set VRSAVE + + ;# build constants + lvx v8, 0, r5 ;# flimit + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 + + neg r5, r4 ;# r5 = -1 * stride + add r6, r5, r5 ;# r6 = -2 * stride + + lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge + lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge + lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge + lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge + + vp8_simple_filter + + stvx v1, r5, r3 ;# store P0 + stvx v2, 0, r3 ;# store Q0 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + +.macro RLV Offs + stw r0, (\Offs*4)(r5) + lwzux r0, r7, r4 +.endm + +.macro WLV Offs + lwz r0, (\Offs*4)(r5) + stwux r0, r7, r4 +.endm + + .align 2 +;# r3 unsigned char *s +;# r4 int p +;# r5 const signed char *flimit +loop_filter_simple_vertical_edge_ppc: + mfspr r11, 256 ;# get old VRSAVE + oris r12, r11, 0xffff + ori r12, r12, 0xc000 + mtspr 256, r12 ;# set VRSAVE + + ;# build constants + lvx v8, 0, r5 ;# flimit + + vspltisb v11, 8 + vspltisb v12, 4 + vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 + + la r5, -96(r1) ;# temporary space for reading in vectors + + ;# Store 4 pels at word "Offs" in temp array, then advance r7 + ;# to next row and read another 4 pels from the frame buffer. + + subi r7, r3, 2 ;# r7 -> 2 pels before start + lwzx r0, 0, r7 ;# read first 4 pels + + ;# 16 unaligned word accesses + RLV 0 + RLV 4 + RLV 8 + RLV 12 + RLV 1 + RLV 5 + RLV 9 + RLV 13 + RLV 2 + RLV 6 + RLV 10 + RLV 14 + RLV 3 + RLV 7 + RLV 11 + + stw r0, (15*4)(r5) ;# write last 4 pels + + simple_vertical + + ;# Read temp array, write frame buffer. 
+ subi r7, r3, 2 ;# r7 -> 2 pels before start + lwzx r0, 0, r5 ;# read/write first 4 pels + stwx r0, 0, r7 + + WLV 4 + WLV 8 + WLV 12 + WLV 1 + WLV 5 + WLV 9 + WLV 13 + WLV 2 + WLV 6 + WLV 10 + WLV 14 + WLV 3 + WLV 7 + WLV 11 + WLV 15 + + mtspr 256, r11 ;# reset old VRSAVE + + blr + + .data + +_chromaSelectors: + .long _B_hihi + .long _B_Ures0 + .long _B_Vres0 + .long 0 + .long _B_lolo + .long _B_Ures8 + .long _B_Vres8 + .long 0 + + .align 4 +_B_Vres8: + .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 + + .align 4 +_B_Ures8: + .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 + + .align 4 +_B_lolo: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 + + .align 4 +_B_Vres0: + .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 + + .align 4 +_B_Ures0: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 + + .align 4 +_B_hihi: + .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp9/common/ppc/vp9_platform_altivec.asm b/vp9/common/ppc/vp9_platform_altivec.asm new file mode 100644 index 000000000..f81d86f74 --- /dev/null +++ b/vp9/common/ppc/vp9_platform_altivec.asm @@ -0,0 +1,59 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + .globl save_platform_context + .globl restore_platform_context + +.macro W V P + stvx \V, 0, \P + addi \P, \P, 16 +.endm + +.macro R V P + lvx \V, 0, \P + addi \P, \P, 16 +.endm + +;# r3 context_ptr + .align 2 +save_platform_context: + W v20, r3 + W v21, r3 + W v22, r3 + W v23, r3 + W v24, r3 + W v25, r3 + W v26, r3 + W v27, r3 + W v28, r3 + W v29, r3 + W v30, r3 + W v31, r3 + + blr + +;# r3 context_ptr + .align 2 +restore_platform_context: + R v20, r3 + R v21, r3 + R v22, r3 + R v23, r3 + R v24, r3 + R v25, r3 + R v26, r3 + R v27, r3 + R v28, r3 + R v29, r3 + R v30, r3 + R v31, r3 + + blr diff --git a/vp9/common/ppc/vp9_recon_altivec.asm b/vp9/common/ppc/vp9_recon_altivec.asm new file mode 100644 index 000000000..dd39e05a8 --- /dev/null +++ b/vp9/common/ppc/vp9_recon_altivec.asm @@ -0,0 +1,175 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree.
+; + + + .globl recon4b_ppc + .globl recon2b_ppc + .globl recon_b_ppc + +.macro row_of16 Diff Pred Dst Stride + lvx v1, 0, \Pred ;# v1 = pred = p0..p15 + addi \Pred, \Pred, 16 ;# next pred + vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 + lvx v3, 0, \Diff ;# v3 = d0..d7 + vaddshs v2, v2, v3 ;# v2 = r0..r7 + vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 + lvx v3, r8, \Diff ;# v3 = d8..d15 + addi \Diff, \Diff, 32 ;# next diff + vaddshs v3, v3, v1 ;# v3 = r8..r15 + vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15 + stvx v2, 0, \Dst ;# to dst + add \Dst, \Dst, \Stride ;# next dst +.endm + + .text + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride +recon4b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + li r8, 16 + + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + row_of16 r3, r4, r5, r6 + + lwz r12, -8(r1) ;# restore old VRSAVE from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr + +.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels + lvx v1, 0, \Pred ;# v1 = pred = p0..p15 + vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 + lvx v3, 0, \Diff ;# v3 = d0..d7 + vaddshs v2, v2, v3 ;# v2 = r0..r7 + vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 + lvx v3, r8, \Diff ;# v2 = d8..d15 + vaddshs v3, v3, v1 ;# v3 = r8..r15 + vpkshus v2, v2, v3 ;# v3 = 8-bit r0..r15 + stvx v2, 0, r10 ;# 2 rows to dst from buf + lwz r0, 0(r10) +.if \write_first_four_pels + stw r0, 0(\Dst) + .else + stwux r0, \Dst, \Stride +.endif + lwz r0, 4(r10) + stw r0, 4(\Dst) + lwz r0, 8(r10) + stwux r0, \Dst, \Stride ;# advance dst to next row + lwz r0, 12(r10) + stw r0, 4(\Dst) +.endm + + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride + +recon2b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + li r8, 16 + + la r10, -48(r1) ;# buf + + two_rows_of8 r3, r4, r5, r6, 1 + + addi r4, r4, 16; ;# next pred + addi r3, r3, 32; ;# next diff + + two_rows_of8 r3, r4, r5, r6, 0 + + lwz r12, -8(r1) ;# restore old VRSAVE from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr + +.macro get_two_diff_rows + stw r0, 0(r10) + lwz r0, 4(r3) + stw r0, 4(r10) + lwzu r0, 32(r3) + stw r0, 8(r10) + lwz r0, 4(r3) + stw r0, 12(r10) + lvx v3, 0, r10 +.endm + + .align 2 +;# r3 = short *diff_ptr, +;# r4 = unsigned char *pred_ptr, +;# r5 = unsigned char *dst_ptr, +;# r6 = int stride +recon_b_ppc: + mfspr r0, 256 ;# get old VRSAVE + stw r0, -8(r1) ;# save old VRSAVE to stack + oris r0, r0, 0xf000 + mtspr 256,r0 ;# set VRSAVE + + vxor v0, v0, v0 + + la r10, -48(r1) ;# buf + + lwz r0, 0(r4) + stw r0, 0(r10) + lwz r0, 16(r4) + stw r0, 4(r10) + lwz r0, 32(r4) + stw r0, 8(r10) + lwz r0, 48(r4) + stw r0, 12(r10) + + lvx v1, 0, r10; ;# v1 = pred = p0..p15 + + lwz r0, 0(r3) ;# v3 = d0..d7 + + get_two_diff_rows + + vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7 + vaddshs v2, v2, v3; ;# v2 = r0..r7 + + lwzu r0, 32(r3) ;# v3 = d8..d15 + + get_two_diff_rows + + vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15 + vaddshs v3, v3, v1; ;# v3 = r8..r15 + + vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15 + stvx v2, 0, r10; ;# 16 pels to dst from buf + + lwz r0, 0(r10) + stw r0, 0(r5) + lwz r0, 4(r10) + stwux r0, r5, r6 + lwz r0, 8(r10) + stwux r0, r5, r6 + lwz r0, 12(r10) + stwx r0, r5, r6 + + lwz r12, -8(r1) ;# restore old VRSAVE 
from stack + mtspr 256, r12 ;# reset old VRSAVE + + blr diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c new file mode 100644 index 000000000..2137dee9c --- /dev/null +++ b/vp9/common/ppc/vp9_systemdependent.c @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9_subpixel.h" +#include "vp9_loopfilter.h" +#include "recon.h" +#include "vp9_onyxc_int.h" + +void (*vp8_short_idct4x4)(short *input, short *output, int pitch); +void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch); +void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch); + +extern void (*vp9_post_proc_down_and_across)( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, + int cols, + int flimit +); + +extern void (*vp9_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit); +extern void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit); +extern void (*vp9_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit); +extern void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit); + +extern void vp9_post_proc_down_and_across_c +( + unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, + int cols, + int flimit +); +void vp9_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a); + +extern copy_mem_block_function *vp9_copy_mem16x16; +extern copy_mem_block_function *vp9_copy_mem8x8; +extern copy_mem_block_function *vp9_copy_mem8x4; + +// PPC +extern subpixel_predict_function sixtap_predict_ppc; +extern subpixel_predict_function sixtap_predict8x4_ppc; +extern subpixel_predict_function sixtap_predict8x8_ppc; +extern subpixel_predict_function sixtap_predict16x16_ppc; +extern subpixel_predict_function bilinear_predict4x4_ppc; +extern subpixel_predict_function bilinear_predict8x4_ppc; +extern subpixel_predict_function bilinear_predict8x8_ppc; +extern subpixel_predict_function bilinear_predict16x16_ppc; + +extern copy_mem_block_function copy_mem16x16_ppc; + +void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); + +extern void short_idct4x4llm_ppc(short *input, short *output, int pitch); + +// Generic C +extern subpixel_predict_function vp9_sixtap_predict_c; +extern subpixel_predict_function vp9_sixtap_predict8x4_c; +extern subpixel_predict_function vp9_sixtap_predict8x8_c; +extern subpixel_predict_function vp9_sixtap_predict16x16_c; +extern subpixel_predict_function vp9_bilinear_predict4x4_c; +extern subpixel_predict_function vp9_bilinear_predict8x4_c; +extern subpixel_predict_function vp9_bilinear_predict8x8_c; +extern subpixel_predict_function vp9_bilinear_predict16x16_c; + +extern copy_mem_block_function vp9_copy_mem16x16_c; +extern 
copy_mem_block_function vp9_copy_mem8x8_c; +extern copy_mem_block_function vp9_copy_mem8x4_c; + +void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); +void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); + +extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch); +extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch); +extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); + +// PPC +extern loop_filter_block_function loop_filter_mbv_ppc; +extern loop_filter_block_function loop_filter_bv_ppc; +extern loop_filter_block_function loop_filter_mbh_ppc; +extern loop_filter_block_function loop_filter_bh_ppc; + +extern loop_filter_block_function loop_filter_mbvs_ppc; +extern loop_filter_block_function loop_filter_bvs_ppc; +extern loop_filter_block_function loop_filter_mbhs_ppc; +extern loop_filter_block_function loop_filter_bhs_ppc; + +// Generic C +extern loop_filter_block_function vp9_loop_filter_mbv_c; +extern loop_filter_block_function vp9_loop_filter_bv_c; +extern loop_filter_block_function vp9_loop_filter_mbh_c; +extern loop_filter_block_function vp9_loop_filter_bh_c; + +extern loop_filter_block_function vp9_loop_filter_mbvs_c; +extern loop_filter_block_function vp9_loop_filter_bvs_c; +extern loop_filter_block_function vp9_loop_filter_mbhs_c; +extern loop_filter_block_function vp9_loop_filter_bhs_c; + +extern loop_filter_block_function *vp8_lf_mbvfull; +extern loop_filter_block_function *vp8_lf_mbhfull; +extern loop_filter_block_function *vp8_lf_bvfull; +extern loop_filter_block_function *vp8_lf_bhfull; + +extern loop_filter_block_function *vp8_lf_mbvsimple; +extern loop_filter_block_function *vp8_lf_mbhsimple; +extern loop_filter_block_function *vp8_lf_bvsimple; +extern loop_filter_block_function *vp8_lf_bhsimple; + +void vp9_clear_c(void) { +} + +void vp9_machine_specific_config(void) { + // Pure C: + vp9_clear_system_state = vp9_clear_c; + vp9_recon_b = vp9_recon_b_c; + vp9_recon4b = vp9_recon4b_c; + vp9_recon2b = vp9_recon2b_c; + + vp9_bilinear_predict16x16 = bilinear_predict16x16_ppc; + vp9_bilinear_predict8x8 = bilinear_predict8x8_ppc; + vp9_bilinear_predict8x4 = bilinear_predict8x4_ppc; + vp8_bilinear_predict = bilinear_predict4x4_ppc; + + vp9_sixtap_predict16x16 = sixtap_predict16x16_ppc; + vp9_sixtap_predict8x8 = sixtap_predict8x8_ppc; + vp9_sixtap_predict8x4 = sixtap_predict8x4_ppc; + vp9_sixtap_predict = sixtap_predict_ppc; + + vp8_short_idct4x4_1 = vp9_short_idct4x4llm_1_c; + vp8_short_idct4x4 = short_idct4x4llm_ppc; + vp8_dc_only_idct = vp8_dc_only_idct_c; + + vp8_lf_mbvfull = loop_filter_mbv_ppc; + vp8_lf_bvfull = loop_filter_bv_ppc; + vp8_lf_mbhfull = loop_filter_mbh_ppc; + vp8_lf_bhfull = loop_filter_bh_ppc; + + vp8_lf_mbvsimple = loop_filter_mbvs_ppc; + vp8_lf_bvsimple = loop_filter_bvs_ppc; + vp8_lf_mbhsimple = loop_filter_mbhs_ppc; + vp8_lf_bhsimple = loop_filter_bhs_ppc; + + vp9_post_proc_down_and_across = vp9_post_proc_down_and_across_c; + vp9_mbpost_proc_down = vp9_mbpost_proc_down_c; + vp9_mbpost_proc_across_ip = vp9_mbpost_proc_across_ip_c; + vp9_plane_add_noise = vp9_plane_add_noise_c; + + vp9_copy_mem16x16 = copy_mem16x16_ppc; + vp9_copy_mem8x8 = vp9_copy_mem8x8_c; + vp9_copy_mem8x4 = vp9_copy_mem8x4_c; + +} diff --git a/vp9/common/ppflags.h b/vp9/common/ppflags.h deleted file mode 100644 index fd8371180..000000000 --- 
a/vp9/common/ppflags.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_PPFLAGS_H
-#define __INC_PPFLAGS_H
-enum {
- VP9D_NOFILTERING = 0,
- VP9D_DEBLOCK = 1 << 0,
- VP9D_DEMACROBLOCK = 1 << 1,
- VP9D_ADDNOISE = 1 << 2,
- VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3,
- VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4,
- VP9D_DEBUG_TXT_DC_DIFF = 1 << 5,
- VP9D_DEBUG_TXT_RATE_INFO = 1 << 6,
- VP9D_DEBUG_DRAW_MV = 1 << 7,
- VP9D_DEBUG_CLR_BLK_MODES = 1 << 8,
- VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9
-};
-
-typedef struct {
- int post_proc_flag;
- int deblocking_level;
- int noise_level;
- int display_ref_frame_flag;
- int display_mb_modes_flag;
- int display_b_modes_flag;
- int display_mv_flag;
-} vp9_ppflags_t;
-
-#endif
diff --git a/vp9/common/pragmas.h b/vp9/common/pragmas.h
deleted file mode 100644
index 99fee5ae2..000000000
--- a/vp9/common/pragmas.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-
-
-#ifdef __INTEL_COMPILER
-#pragma warning(disable:997 1011 170)
-#endif
-#ifdef _MSC_VER
-#pragma warning(disable:4799)
-#endif
diff --git a/vp9/common/pred_common.c b/vp9/common/pred_common.c
deleted file mode 100644
index d779c641d..000000000
--- a/vp9/common/pred_common.c
+++ /dev/null
@@ -1,463 +0,0 @@
-
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vp9/common/pred_common.h"
-#include "vp9/common/seg_common.h"
-
-// TBD prediction functions for various bitstream signals
-
-// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- int pred_context;
- MODE_INFO *m = xd->mode_info_context;
-
- // Note:
- // The mode info data structure has a one element border above and to the
- // left of the entries corresponding to real macroblocks.
- // The prediction flags in these dummy entries are initialised to 0.
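- //
- // A worked example of the additive cases below: if both the left and
- // above neighbours have the relevant flag set (say seg_id_predicted),
- // pred_context = 1 + 1 = 2, while the zeroed border entries make the
- // same sums safe for macroblocks in the first row or column.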
- switch (pred_id) {
- case PRED_SEG_ID:
- pred_context = (m - 1)->mbmi.seg_id_predicted +
- (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
- break;
-
-
- case PRED_REF:
- pred_context = (m - 1)->mbmi.ref_predicted +
- (m - cm->mode_info_stride)->mbmi.ref_predicted;
- break;
-
- case PRED_COMP:
- // Context based on use of comp pred flag by neighbours
- // pred_context =
- // ((m - 1)->mbmi.second_ref_frame > INTRA_FRAME) +
- // ((m - cm->mode_info_stride)->mbmi.second_ref_frame > INTRA_FRAME);
-
- // Context based on mode and reference frame
- // if ( m->mbmi.ref_frame == LAST_FRAME )
- // pred_context = 0 + (m->mbmi.mode != ZEROMV);
- // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
- // pred_context = 2 + (m->mbmi.mode != ZEROMV);
- // else
- // pred_context = 4 + (m->mbmi.mode != ZEROMV);
-
- if (m->mbmi.ref_frame == LAST_FRAME)
- pred_context = 0;
- else
- pred_context = 1;
-
- break;
-
- case PRED_MBSKIP:
- pred_context = (m - 1)->mbmi.mb_skip_coeff +
- (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
- break;
-
- case PRED_SWITCHABLE_INTERP:
- {
- int left_in_image = (m - 1)->mbmi.mb_in_image;
- int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
- int left_mode = (m - 1)->mbmi.mode;
- int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
- int left_interp, above_interp;
- if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
- left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
- else
- left_interp = VP9_SWITCHABLE_FILTERS;
- if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
- above_interp = vp9_switchable_interp_map[
- (m - cm->mode_info_stride)->mbmi.interp_filter];
- else
- above_interp = VP9_SWITCHABLE_FILTERS;
-
- if (left_interp == above_interp)
- pred_context = left_interp;
- else if (left_interp == VP9_SWITCHABLE_FILTERS &&
- above_interp != VP9_SWITCHABLE_FILTERS)
- pred_context = above_interp;
- else if (left_interp != VP9_SWITCHABLE_FILTERS &&
- above_interp == VP9_SWITCHABLE_FILTERS)
- pred_context = left_interp;
- else
- pred_context = VP9_SWITCHABLE_FILTERS;
- }
- break;
-
- default:
- // TODO *** add error trap code.
- pred_context = 0;
- break;
- }
-
- return pred_context;
-}
-
-// This function returns a context probability for coding a given
-// prediction signal
-vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- vp9_prob pred_probability;
- int pred_context;
-
- // Get the appropriate prediction context
- pred_context = vp9_get_pred_context(cm, xd, pred_id);
-
- switch (pred_id) {
- case PRED_SEG_ID:
- pred_probability = cm->segment_pred_probs[pred_context];
- break;
-
- case PRED_REF:
- pred_probability = cm->ref_pred_probs[pred_context];
- break;
-
- case PRED_COMP:
- // In keeping with convention elsewhere, the probability returned is
- // the probability of a "0" outcome which in this case means the
- // probability of comp pred off.
- pred_probability = cm->prob_comppred[pred_context];
- break;
-
- case PRED_MBSKIP:
- pred_probability = cm->mbskip_pred_probs[pred_context];
- break;
-
- default:
- // TODO *** add error trap code.
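- // 128 is the midpoint of the 8-bit probability range, i.e. an
- // uninformative prior of one half.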
- pred_probability = 128;
- break;
- }
-
- return pred_probability;
-}
-
-// This function returns a context probability ptr for coding a given
-// prediction signal
-const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- const vp9_prob *pred_probability;
- int pred_context;
-
- // Get the appropriate prediction context
- pred_context = vp9_get_pred_context(cm, xd, pred_id);
-
- switch (pred_id) {
- case PRED_SEG_ID:
- pred_probability = &cm->segment_pred_probs[pred_context];
- break;
-
- case PRED_REF:
- pred_probability = &cm->ref_pred_probs[pred_context];
- break;
-
- case PRED_COMP:
- // In keeping with convention elsewhere, the probability returned is
- // the probability of a "0" outcome which in this case means the
- // probability of comp pred off.
- pred_probability = &cm->prob_comppred[pred_context];
- break;
-
- case PRED_MBSKIP:
- pred_probability = &cm->mbskip_pred_probs[pred_context];
- break;
-
- case PRED_SWITCHABLE_INTERP:
- pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
- break;
-
- default:
- // TODO *** add error trap code.
- pred_probability = NULL;
- break;
- }
-
- return pred_probability;
-}
-
-// This function returns the status of the given prediction signal.
-// I.e. is the predicted value for the given signal correct.
-unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
- PRED_ID pred_id) {
- unsigned char pred_flag = 0;
-
- switch (pred_id) {
- case PRED_SEG_ID:
- pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;
- break;
-
- case PRED_REF:
- pred_flag = xd->mode_info_context->mbmi.ref_predicted;
- break;
-
- case PRED_MBSKIP:
- pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
- break;
-
- default:
- // TODO *** add error trap code.
- pred_flag = 0;
- break;
- }
-
- return pred_flag;
-}
-
-// This function sets the status of the given prediction signal.
-// I.e. is the predicted value for the given signal correct.
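-// (With CONFIG_SUPERBLOCKS, the flag is also replicated into the other
-// in-image macroblocks covered by the superblock, as seen below.)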
-void vp9_set_pred_flag(MACROBLOCKD *const xd,
- PRED_ID pred_id,
- unsigned char pred_flag) {
-#if CONFIG_SUPERBLOCKS
- const int mis = xd->mode_info_stride;
-#endif
-
- switch (pred_id) {
- case PRED_SEG_ID:
- xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- if (xd->mb_to_right_edge >= 0)
- xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
- if (xd->mb_to_bottom_edge >= 0) {
- xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
- if (xd->mb_to_right_edge >= 0)
- xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
- }
- }
-#endif
- break;
-
- case PRED_REF:
- xd->mode_info_context->mbmi.ref_predicted = pred_flag;
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- if (xd->mb_to_right_edge >= 0)
- xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
- if (xd->mb_to_bottom_edge >= 0) {
- xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
- if (xd->mb_to_right_edge >= 0)
- xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
- }
- }
-#endif
- break;
-
- case PRED_MBSKIP:
- xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
-#if CONFIG_SUPERBLOCKS
- if (xd->mode_info_context->mbmi.encoded_as_sb) {
- if (xd->mb_to_right_edge >= 0)
- xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
- if (xd->mb_to_bottom_edge >= 0) {
- xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
- if (xd->mb_to_right_edge >= 0)
- xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
- }
- }
-#endif
- break;
-
- default:
- // TODO *** add error trap code.
- break;
- }
-}
-
-
-// The following contain the guts of the prediction code used to
-// predict various bitstream signals.
-
-// Macroblock segment id prediction function
-unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd, int MbIndex) {
- // Currently the prediction for the macroblock segment ID is
- // the value stored for this macroblock in the previous frame.
-#if CONFIG_SUPERBLOCKS
- if (!xd->mode_info_context->mbmi.encoded_as_sb) {
-#endif
- return cm->last_frame_seg_map[MbIndex];
-#if CONFIG_SUPERBLOCKS
- } else {
- int seg_id = cm->last_frame_seg_map[MbIndex];
- int mb_col = MbIndex % cm->mb_cols;
- int mb_row = MbIndex / cm->mb_cols;
- if (mb_col + 1 < cm->mb_cols)
- seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
- if (mb_row + 1 < cm->mb_rows) {
- seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
- if (mb_col + 1 < cm->mb_cols)
- seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
- }
- return seg_id;
- }
-#endif
-}
-
-MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
- const MACROBLOCKD *const xd) {
- MODE_INFO *m = xd->mode_info_context;
-
- MV_REFERENCE_FRAME left;
- MV_REFERENCE_FRAME above;
- MV_REFERENCE_FRAME above_left;
- MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
-
- int segment_id = xd->mode_info_context->mbmi.segment_id;
- int seg_ref_active;
- int i;
-
- unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
- unsigned char ref_score[MAX_REF_FRAMES];
- unsigned char best_score = 0;
- unsigned char left_in_image;
- unsigned char above_in_image;
- unsigned char above_left_in_image;
-
- // Is segment coding enabled
- seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
-
- // Special case treatment if segment coding is enabled.
- // Don't allow prediction of a reference frame that the segment
- // does not allow
- if (seg_ref_active) {
- for (i = 0; i < MAX_REF_FRAMES; i++) {
- frame_allowed[i] =
- vp9_check_segref(xd, segment_id, i);
-
- // Score set to 0 if ref frame not allowed
- ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
- }
- } else
- vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
-
- // Reference frames used by neighbours
- left = (m - 1)->mbmi.ref_frame;
- above = (m - cm->mode_info_stride)->mbmi.ref_frame;
- above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
-
- // Are neighbours in image
- left_in_image = (m - 1)->mbmi.mb_in_image;
- above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
- above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;
-
- // Adjust scores for candidate reference frames based on neighbours
- if (frame_allowed[left] && left_in_image) {
- ref_score[left] += 16;
- if (above_left_in_image && (left == above_left))
- ref_score[left] += 4;
- }
- if (frame_allowed[above] && above_in_image) {
- ref_score[above] += 16;
- if (above_left_in_image && (above == above_left))
- ref_score[above] += 4;
- }
-
- // Now choose the candidate with the highest score
- for (i = 0; i < MAX_REF_FRAMES; i++) {
- if (ref_score[i] > best_score) {
- pred_ref = i;
- best_score = ref_score[i];
- }
- }
-
- return pred_ref;
-}
-
-// Functions to compute a set of modified reference frame probabilities
-// to use when the prediction of the reference frame value fails
-void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
- int tot_count;
-
- tot_count = count[0] + count[1] + count[2] + count[3];
- if (tot_count) {
- probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
- probs[0] += !probs[0];
- } else
- probs[0] = 128;
-
- tot_count -= count[0];
- if (tot_count) {
- probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
- probs[1] += !probs[1];
- } else
- probs[1] = 128;
-
- tot_count -= count[1];
- if (tot_count) {
- probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
- probs[2] += !probs[2];
- } else
- probs[2] = 128;
-
-}
-
-// Computes a set of modified conditional probabilities for the reference frame
-// Values will be set to 0 for reference frame options that are not possible
-// because either they were predicted and prediction has failed or because
-// they are not allowed for a given segment.
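-//
-// For example, vp9_calc_ref_probs() maps counts of {10, 20, 30, 40} to
-// probs of {26, 57, 109}: each entry is the rounded probability, scaled
-// to 255, of stopping at that branch given that the earlier branches
-// were not taken (10/100, 20/90 and 30/70 respectively).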
-void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
- int norm_cnt[MAX_REF_FRAMES];
- int intra_count;
- int inter_count;
- int last_count;
- int gfarf_count;
- int gf_count;
- int arf_count;
-
- intra_count = cm->prob_intra_coded;
- inter_count = (255 - intra_count);
- last_count = (inter_count * cm->prob_last_coded) / 255;
- gfarf_count = inter_count - last_count;
- gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
- arf_count = gfarf_count - gf_count;
-
- // Work out modified reference frame probabilities to use where prediction
- // of the reference frame fails
- norm_cnt[0] = 0;
- norm_cnt[1] = last_count;
- norm_cnt[2] = gf_count;
- norm_cnt[3] = arf_count;
- vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
- cm->mod_refprobs[INTRA_FRAME][0] = 0; // This branch implicit
-
- norm_cnt[0] = intra_count;
- norm_cnt[1] = 0;
- norm_cnt[2] = gf_count;
- norm_cnt[3] = arf_count;
- vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
- cm->mod_refprobs[LAST_FRAME][1] = 0; // This branch implicit
-
- norm_cnt[0] = intra_count;
- norm_cnt[1] = last_count;
- norm_cnt[2] = 0;
- norm_cnt[3] = arf_count;
- vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
- cm->mod_refprobs[GOLDEN_FRAME][2] = 0; // This branch implicit
-
- norm_cnt[0] = intra_count;
- norm_cnt[1] = last_count;
- norm_cnt[2] = gf_count;
- norm_cnt[3] = 0;
- vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
- cm->mod_refprobs[ALTREF_FRAME][2] = 0; // This branch implicit
-
- // Score the reference frames based on overall frequency.
- // These scores contribute to the prediction choices.
- // Max score 17, min 1
- cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
- cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
- cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
- cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
-}
diff --git a/vp9/common/pred_common.h b/vp9/common/pred_common.h
deleted file mode 100644
index 33656f23a..000000000
--- a/vp9/common/pred_common.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */ - -#include "type_aliases.h" -#include "onyxc_int.h" -#include "vp9/common/blockd.h" - -#ifndef __INC_PRED_COMMON_H__ -#define __INC_PRED_COMMON_H__ 1 - - -// Predicted items -typedef enum { - PRED_SEG_ID = 0, // Segment identifier - PRED_REF = 1, - PRED_COMP = 2, - PRED_MBSKIP = 3, - PRED_SWITCHABLE_INTERP = 4 -} PRED_ID; - -extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); - -extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); - -extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - PRED_ID pred_id); - -extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, - PRED_ID pred_id); - -extern void vp9_set_pred_flag(MACROBLOCKD *const xd, - PRED_ID pred_id, - unsigned char pred_flag); - - -extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd, - int MbIndex); - -extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm, - const MACROBLOCKD *const xd); -extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm); - -#endif /* __INC_PRED_COMMON_H__ */ diff --git a/vp9/common/quant_common.c b/vp9/common/quant_common.c deleted file mode 100644 index 720b27113..000000000 --- a/vp9/common/quant_common.c +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "quant_common.h" - -static int dc_qlookup[QINDEX_RANGE]; -static int ac_qlookup[QINDEX_RANGE]; - -#define ACDC_MIN 4 - -void vp9_init_quant_tables() { - int i; - int current_val = 4; - int last_val = 4; - int ac_val; - - for (i = 0; i < QINDEX_RANGE; i++) { - ac_qlookup[i] = current_val; - current_val = (int)((double)current_val * 1.02); - if (current_val == last_val) - current_val++; - last_val = current_val; - - ac_val = ac_qlookup[i]; - dc_qlookup[i] = (int)((0.000000305 * ac_val * ac_val * ac_val) + - (-0.00065 * ac_val * ac_val) + - (0.9 * ac_val) + 0.5); - if (dc_qlookup[i] < ACDC_MIN) - dc_qlookup[i] = ACDC_MIN; - } -} - -int vp9_dc_quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = dc_qlookup[ QIndex ]; - return retval; -} - -int vp9_dc2quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = dc_qlookup[ QIndex ]; - - return retval; - -} -int vp9_dc_uv_quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = dc_qlookup[ QIndex ]; - - return retval; -} - -int vp9_ac_yquant(int QIndex) { - int retval; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = ac_qlookup[ QIndex ]; - return retval; -} - -int vp9_ac2quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = (ac_qlookup[ QIndex ] * 775) / 1000; - if (retval < 4) - retval = 4; - - return retval; -} -int vp9_ac_uv_quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = ac_qlookup[ QIndex ]; - return retval; -} diff --git a/vp9/common/quant_common.h b/vp9/common/quant_common.h deleted file mode 100644 index 2978dd8a2..000000000 --- a/vp9/common/quant_common.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "string.h" -#include "blockd.h" -#include "onyxc_int.h" - -extern void vp9_init_quant_tables(); -extern int vp9_ac_yquant(int QIndex); -extern int vp9_dc_quant(int QIndex, int Delta); -extern int vp9_dc2quant(int QIndex, int Delta); -extern int vp9_ac2quant(int QIndex, int Delta); -extern int vp9_dc_uv_quant(int QIndex, int Delta); -extern int vp9_ac_uv_quant(int QIndex, int Delta); diff --git a/vp9/common/recon.c b/vp9/common/recon.c deleted file mode 100644 index 77035668e..000000000 --- a/vp9/common/recon.c +++ /dev/null @@ -1,197 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include "vp9_rtcd.h" -#include "blockd.h" - -void vp9_recon_b_c -( - unsigned char *pred_ptr, - short *diff_ptr, - unsigned char *dst_ptr, - int stride -) { - int r, c; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int a = diff_ptr[c] + pred_ptr[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dst_ptr[c] = (unsigned char) a; - } - - dst_ptr += stride; - diff_ptr += 16; - pred_ptr += 16; - } -} - -void vp9_recon_uv_b_c -( - unsigned char *pred_ptr, - short *diff_ptr, - unsigned char *dst_ptr, - int stride -) { - int r, c; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int a = diff_ptr[c] + pred_ptr[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dst_ptr[c] = (unsigned char) a; - } - - dst_ptr += stride; - diff_ptr += 8; - pred_ptr += 8; - } -} -void vp9_recon4b_c -( - unsigned char *pred_ptr, - short *diff_ptr, - unsigned char *dst_ptr, - int stride -) { - int r, c; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 16; c++) { - int a = diff_ptr[c] + pred_ptr[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dst_ptr[c] = (unsigned char) a; - } - - dst_ptr += stride; - diff_ptr += 16; - pred_ptr += 16; - } -} - -void vp9_recon2b_c -( - unsigned char *pred_ptr, - short *diff_ptr, - unsigned char *dst_ptr, - int stride -) { - int r, c; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 8; c++) { - int a = diff_ptr[c] + pred_ptr[c]; - - if (a < 0) - a = 0; - - if (a > 255) - a = 255; - - dst_ptr[c] = (unsigned char) a; - } - - dst_ptr += stride; - diff_ptr += 8; - pred_ptr += 8; - } -} - -#if CONFIG_SUPERBLOCKS -void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) { - int x, y; - BLOCKD *b = &xd->block[0]; - int stride = b->dst_stride; - short *diff = b->diff; - - for (y = 0; y < 16; y++) { - for (x = 0; x < 16; x++) { - int a = dst[x] + diff[x]; - if (a < 0) - a = 0; - else if (a > 255) - a = 255; - dst[x] = a; - } - dst += stride; - diff += 16; - } -} - -void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) { - int x, y, i; - uint8_t *dst = udst; - - for (i = 0; i < 2; i++, dst = vdst) { - BLOCKD *b = &xd->block[16 + 4 * i]; - int stride = b->dst_stride; - short *diff = b->diff; - - for (y = 0; y < 8; y++) { - for (x = 0; x < 8; x++) { - int a = dst[x] + diff[x]; - if (a < 0) - a = 0; - else if (a > 255) - a = 255; - dst[x] = a; - } - dst += stride; - diff += 8; - } - } -} -#endif - -void vp9_recon_mby_c(MACROBLOCKD *xd) { - int i; - - for (i = 0; i < 16; i += 4) { - BLOCKD *b = &xd->block[i]; - - vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); - } -} - -void vp9_recon_mb_c(MACROBLOCKD *xd) { - int i; - - for (i = 0; i < 16; i += 4) { - BLOCKD *b = &xd->block[i]; - - vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); - } - - for (i = 16; i < 24; i += 2) { - BLOCKD *b = &xd->block[i]; - - vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); - } -} diff --git a/vp9/common/reconinter.c b/vp9/common/reconinter.c deleted file mode 100644 index 37478f71e..000000000 --- a/vp9/common/reconinter.c +++ /dev/null @@ -1,1140 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include "vpx/vpx_integer.h" -#include "blockd.h" -#include "reconinter.h" -#include "vp9/common/reconintra.h" -#if CONFIG_RUNTIME_CPU_DETECT -#include "onyxc_int.h" -#endif - -void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATIONFILTERTYPE mcomp_filter_type, - VP9_COMMON *cm) { - if (mcomp_filter_type == SIXTAP) { - xd->subpixel_predict = vp9_sixtap_predict; - xd->subpixel_predict8x4 = vp9_sixtap_predict8x4; - xd->subpixel_predict8x8 = vp9_sixtap_predict8x8; - xd->subpixel_predict16x16 = vp9_sixtap_predict16x16; - xd->subpixel_predict_avg = vp9_sixtap_predict_avg; - xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16; - } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) { - xd->subpixel_predict = vp9_eighttap_predict; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16; - xd->subpixel_predict_avg = vp9_eighttap_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16; - } else if (mcomp_filter_type == EIGHTTAP_SHARP) { - xd->subpixel_predict = vp9_eighttap_predict_sharp; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_sharp; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_sharp; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_sharp; - xd->subpixel_predict_avg = vp9_eighttap_predict_avg4x4_sharp; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c; - } - else { - xd->subpixel_predict = vp9_bilinear_predict4x4; - xd->subpixel_predict8x4 = vp9_bilinear_predict8x4; - xd->subpixel_predict8x8 = vp9_bilinear_predict8x8; - xd->subpixel_predict16x16 = vp9_bilinear_predict16x16; - xd->subpixel_predict_avg = vp9_bilinear_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16; - } -} - -void vp9_copy_mem16x16_c(unsigned char *src, - int src_stride, - unsigned char *dst, - int dst_stride) { - int r; - - for (r = 0; r < 16; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; - dst[8] = src[8]; - dst[9] = src[9]; - dst[10] = src[10]; - dst[11] = src[11]; - dst[12] = src[12]; - dst[13] = src[13]; - dst[14] = src[14]; - dst[15] = src[15]; - -#else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; - ((uint32_t *)dst)[2] = ((uint32_t *)src)[2]; - ((uint32_t *)dst)[3] = ((uint32_t *)src)[3]; - -#endif - src += src_stride; - dst += dst_stride; - } -} - -void vp9_avg_mem16x16_c(unsigned char *src, - int src_stride, - unsigned char *dst, - int dst_stride) { - int r; - - for (r = 0; r < 16; r++) { - int n; - - for (n = 0; n < 16; n++) { - dst[n] = (dst[n] + src[n] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x8_c(unsigned char *src, - int src_stride, - unsigned char *dst, - int dst_stride) { - int r; - - for (r = 0; r < 8; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = 
src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; -#else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; -#endif - src += src_stride; - dst += dst_stride; - } -} - -void vp9_avg_mem8x8_c(unsigned char *src, - int src_stride, - unsigned char *dst, - int dst_stride) { - int r; - - for (r = 0; r < 8; r++) { - int n; - - for (n = 0; n < 8; n++) { - dst[n] = (dst[n] + src[n] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x4_c(unsigned char *src, - int src_stride, - unsigned char *dst, - int dst_stride) { - int r; - - for (r = 0; r < 4; r++) { -#if !(CONFIG_FAST_UNALIGNED) - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; -#else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; -#endif - src += src_stride; - dst += dst_stride; - } -} - -void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) { - int r; - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; - int_mv mv; - - ptr_base = *(d->base_pre); - mv.as_int = d->bmi.as_mv.first.as_int; - - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1, - pred_ptr, pitch); - } else { - ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - ptr = ptr_base; - - for (r = 0; r < 4; r++) { -#if !(CONFIG_FAST_UNALIGNED) - pred_ptr[0] = ptr[0]; - pred_ptr[1] = ptr[1]; - pred_ptr[2] = ptr[2]; - pred_ptr[3] = ptr[3]; -#else - *(uint32_t *)pred_ptr = *(uint32_t *)ptr; -#endif - pred_ptr += pitch; - ptr += d->pre_stride; - } - } -} - -/* - * Similar to vp9_build_inter_predictors_b(), but instead of storing the - * results in d->predictor, we average the contents of d->predictor (which - * come from an earlier call to vp9_build_inter_predictors_b()) with the - * predictor of the second reference frame / motion vector. 
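- *
- * The blend is a rounded per-pixel average, pred = (first + second + 1) >> 1,
- * in the full-pel branch below; for sub-pel motion the averaging variant of
- * the subpixel predictor passed in as sppf performs the blend instead.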
- */ -void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf) { - int r; - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; - int_mv mv; - - ptr_base = *(d->base_second_pre); - mv.as_int = d->bmi.as_mv.second.as_int; - - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1, - pred_ptr, pitch); - } else { - ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - ptr = ptr_base; - - for (r = 0; r < 4; r++) { - pred_ptr[0] = (pred_ptr[0] + ptr[0] + 1) >> 1; - pred_ptr[1] = (pred_ptr[1] + ptr[1] + 1) >> 1; - pred_ptr[2] = (pred_ptr[2] + ptr[2] + 1) >> 1; - pred_ptr[3] = (pred_ptr[3] + ptr[3] + 1) >> 1; - pred_ptr += pitch; - ptr += d->pre_stride; - } - } -} - -void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; - int_mv mv; - - ptr_base = *(d->base_pre); - mv.as_int = d->bmi.as_mv.first.as_int; - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); - } -} - -/* - * Similar to build_inter_predictors_4b(), but instead of storing the - * results in d->predictor, we average the contents of d->predictor (which - * come from an earlier call to build_inter_predictors_4b()) with the - * predictor of the second reference frame / motion vector. 
- */ -void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd, - BLOCKD *d, int pitch) { - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; - int_mv mv; - - ptr_base = *(d->base_second_pre); - mv.as_int = d->bmi.as_mv.second.as_int; - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); - } -} - -static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; - int_mv mv; - - ptr_base = *(d->base_pre); - mv.as_int = d->bmi.as_mv.first.as_int; - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch); - } -} - - -/*encoder only*/ -#if CONFIG_PRED_FILTER - -// Select the thresholded or non-thresholded filter -#define USE_THRESH_FILTER 0 - -#define PRED_FILT_LEN 5 - -static const int filt_shift = 4; -static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1}; -// Alternative filter {1, 1, 4, 1, 1} - -#if !USE_THRESH_FILTER -void filter_mb(unsigned char *src, int src_stride, - unsigned char *dst, int dst_stride, - int width, int height) { - int i, j, k; - unsigned int Temp[32 * 32]; - unsigned int *pTmp = Temp; - unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2); - - // Horizontal - for (i = 0; i < height + PRED_FILT_LEN - 1; i++) { - for (j = 0; j < width; j++) { - int sum = 0; - for (k = 0; k < PRED_FILT_LEN; k++) - sum += pSrc[j + k] * pred_filter[k]; - pTmp[j] = sum; - } - - pSrc += src_stride; - pTmp += width; - } - - // Vertical - pTmp = Temp; - for (i = 0; i < width; i++) { - unsigned char *pDst = dst + i; - for (j = 0; j < height; j++) { - int sum = 0; - for (k = 0; k < PRED_FILT_LEN; k++) - sum += pTmp[(j + k) * width] * pred_filter[k]; - // Round - sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1); - pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 
255 : sum); - } - ++pTmp; - } -} -#else -// Based on vp9_post_proc_down_and_across_c (postproc.c) -void filter_mb(unsigned char *src, int src_stride, - unsigned char *dst, int dst_stride, - int width, int height) { - unsigned char *pSrc, *pDst; - int row; - int col; - int i; - int v; - unsigned char d[8]; - - /* TODO flimit should be linked to the quantizer value */ - int flimit = 7; - - for (row = 0; row < height; row++) { - /* post_proc_down for one row */ - pSrc = src; - pDst = dst; - - for (col = 0; col < width; col++) { - int kernel = (1 << (filt_shift - 1)); - int v = pSrc[col]; - - for (i = -2; i <= 2; i++) { - if (abs(v - pSrc[col + i * src_stride]) > flimit) - goto down_skip_convolve; - - kernel += pred_filter[2 + i] * pSrc[col + i * src_stride]; - } - - v = (kernel >> filt_shift); - down_skip_convolve: - pDst[col] = v; - } - - /* now post_proc_across */ - pSrc = dst; - pDst = dst; - - for (i = 0; i < 8; i++) - d[i] = pSrc[i]; - - for (col = 0; col < width; col++) { - int kernel = (1 << (filt_shift - 1)); - v = pSrc[col]; - - d[col & 7] = v; - - for (i = -2; i <= 2; i++) { - if (abs(v - pSrc[col + i]) > flimit) - goto across_skip_convolve; - - kernel += pred_filter[2 + i] * pSrc[col + i]; - } - - d[col & 7] = (kernel >> filt_shift); - across_skip_convolve: - - if (col >= 2) - pDst[col - 2] = d[(col - 2) & 7]; - } - - /* handle the last two pixels */ - pDst[col - 2] = d[(col - 2) & 7]; - pDst[col - 1] = d[(col - 1) & 7]; - - /* next row */ - src += src_stride; - dst += dst_stride; - } -} -#endif // !USE_THRESH_FILTER - -#endif // CONFIG_PRED_FILTER - -/*encoder only*/ -void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) { - int i, j; - BLOCKD *blockd = xd->block; - - /* build uv mvs */ - for (i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) { - int yoffset = i * 8 + j * 2; - int uoffset = 16 + i * 2 + j; - int voffset = 20 + i * 2 + j; - int temp; - - temp = blockd[yoffset ].bmi.as_mv.first.as_mv.row - + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row - + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row - + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row; - - if (temp < 0) temp -= 4; - else temp += 4; - - xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) & - xd->fullpixel_mask; - - temp = blockd[yoffset ].bmi.as_mv.first.as_mv.col - + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col - + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col - + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col; - - if (temp < 0) temp -= 4; - else temp += 4; - - blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) & - xd->fullpixel_mask; - - blockd[voffset].bmi.as_mv.first.as_mv.row = - blockd[uoffset].bmi.as_mv.first.as_mv.row; - blockd[voffset].bmi.as_mv.first.as_mv.col = - blockd[uoffset].bmi.as_mv.first.as_mv.col; - - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - temp = blockd[yoffset ].bmi.as_mv.second.as_mv.row - + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row - + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row - + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row; - - if (temp < 0) { - temp -= 4; - } else { - temp += 4; - } - - blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) & - xd->fullpixel_mask; - - temp = blockd[yoffset ].bmi.as_mv.second.as_mv.col - + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col - + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col - + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col; - - if (temp < 0) { - temp -= 4; - } else { - temp += 4; - } - - blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) & - xd->fullpixel_mask; - - 
blockd[voffset].bmi.as_mv.second.as_mv.row = - blockd[uoffset].bmi.as_mv.second.as_mv.row; - blockd[voffset].bmi.as_mv.second.as_mv.col = - blockd[uoffset].bmi.as_mv.second.as_mv.col; - } - } - } - - for (i = 16; i < 24; i += 2) { - BLOCKD *d0 = &blockd[i]; - BLOCKD *d1 = &blockd[i + 1]; - - if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) - build_inter_predictors2b(xd, d0, 8); - else { - vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict); - vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict); - } - - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg); - vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg); - } - } -} - -static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) { - /* If the MV points so far into the UMV border that no visible pixels - * are used for reconstruction, the subpel part of the MV can be - * discarded and the MV limited to 16 pixels with equivalent results. - * - * This limit kicks in at 19 pixels for the top and left edges, for - * the 16 pixels plus 3 taps right of the central pixel when subpel - * filtering. The bottom and right edges use 16 pixels plus 2 pixels - * left of the central pixel when filtering. - */ - if (mv->col < (xd->mb_to_left_edge - ((16 + VP9_INTERP_EXTEND) << 3))) - mv->col = xd->mb_to_left_edge - (16 << 3); - else if (mv->col > xd->mb_to_right_edge + ((15 + VP9_INTERP_EXTEND) << 3)) - mv->col = xd->mb_to_right_edge + (16 << 3); - - if (mv->row < (xd->mb_to_top_edge - ((16 + VP9_INTERP_EXTEND) << 3))) - mv->row = xd->mb_to_top_edge - (16 << 3); - else if (mv->row > xd->mb_to_bottom_edge + ((15 + VP9_INTERP_EXTEND) << 3)) - mv->row = xd->mb_to_bottom_edge + (16 << 3); -} - -/* A version of the above function for chroma block MVs.*/ -static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) { - const int extend = VP9_INTERP_EXTEND; - - mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + extend) << 3))) ? - (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col; - mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + extend) << 3)) ? - (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col; - - mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + extend) << 3))) ? - (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row; - mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + extend) << 3)) ? 
- (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row; -} - -/*encoder only*/ -void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, - unsigned char *dst_y, - int dst_ystride, - int clamp_mvs) { - unsigned char *ptr_base = xd->pre.y_buffer; - unsigned char *ptr; - int pre_stride = xd->block[0].pre_stride; - int_mv ymv; - - ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int; - - if (clamp_mvs) - clamp_mv_to_umv_border(&ymv.as_mv, xd); - - ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3); - -#if CONFIG_PRED_FILTER - if (xd->mode_info_context->mbmi.pred_filter_enabled) { - if ((ymv.as_mv.row | ymv.as_mv.col) & 7) { - // Sub-pel filter needs extended input - int len = 15 + (VP9_INTERP_EXTEND << 1); - unsigned char Temp[32 * 32]; // Data required by sub-pel filter - unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1); - - // Copy extended MB into Temp array, applying the spatial filter - filter_mb(ptr - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride, - Temp, len, len, len); - - // Sub-pel interpolation - xd->subpixel_predict16x16(pTemp, len, - (ymv.as_mv.col & 7) << 1, - (ymv.as_mv.row & 7) << 1, - dst_y, dst_ystride); - } else { - // Apply spatial filter to create the prediction directly - filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16); - } - } else -#endif - if ((ymv.as_mv.row | ymv.as_mv.col) & 7) { - xd->subpixel_predict16x16(ptr, pre_stride, - (ymv.as_mv.col & 7) << 1, - (ymv.as_mv.row & 7) << 1, - dst_y, dst_ystride); - } else { - vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride); - } -} - -void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_uvstride) { - int offset; - unsigned char *uptr, *vptr; - int pre_stride = xd->block[0].pre_stride; - int_mv _o16x16mv; - int_mv _16x16mv; - - _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int; - - if (xd->mode_info_context->mbmi.need_to_clamp_mvs) - clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); - - _o16x16mv = _16x16mv; - /* calc uv motion vectors */ - if (_16x16mv.as_mv.row < 0) - _16x16mv.as_mv.row -= 1; - else - _16x16mv.as_mv.row += 1; - - if (_16x16mv.as_mv.col < 0) - _16x16mv.as_mv.col -= 1; - else - _16x16mv.as_mv.col += 1; - - _16x16mv.as_mv.row /= 2; - _16x16mv.as_mv.col /= 2; - - _16x16mv.as_mv.row &= xd->fullpixel_mask; - _16x16mv.as_mv.col &= xd->fullpixel_mask; - - pre_stride >>= 1; - offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); - uptr = xd->pre.u_buffer + offset; - vptr = xd->pre.v_buffer + offset; - -#if CONFIG_PRED_FILTER - if (xd->mode_info_context->mbmi.pred_filter_enabled) { - int i; - unsigned char *pSrc = uptr; - unsigned char *pDst = dst_u; - int len = 7 + (VP9_INTERP_EXTEND << 1); - unsigned char Temp[32 * 32]; // Data required by the sub-pel filter - unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1); - - // U & V - for (i = 0; i < 2; i++) { - if (_o16x16mv.as_int & 0x000f000f) { - // Copy extended MB into Temp array, applying the spatial filter - filter_mb(pSrc - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride, - Temp, len, len, len); - - // Sub-pel filter - xd->subpixel_predict8x8(pTemp, len, - _o16x16mv.as_mv.col & 15, - _o16x16mv.as_mv.row & 15, - pDst, dst_uvstride); - } else { - filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8); - } - - // V - pSrc = vptr; - pDst = dst_v; - } - } else -#endif - if (_o16x16mv.as_int & 0x000f000f) { - xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15, - 
_o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
- xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
- _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
- } else {
- vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
- vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
- }
-}
-
-
-void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
- unsigned char *dst_y,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_ystride, int dst_uvstride) {
- vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
- xd->mode_info_context->mbmi.need_to_clamp_mvs);
- vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
-}
-
-#if CONFIG_SUPERBLOCKS
-void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
- unsigned char *dst_y,
- unsigned char *dst_u,
- unsigned char *dst_v,
- int dst_ystride,
- int dst_uvstride) {
- uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
- uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
- *v2 = x->second_pre.v_buffer;
- int edge[4], n;
-
- edge[0] = x->mb_to_top_edge;
- edge[1] = x->mb_to_bottom_edge;
- edge[2] = x->mb_to_left_edge;
- edge[3] = x->mb_to_right_edge;
-
- for (n = 0; n < 4; n++) {
- const int x_idx = n & 1, y_idx = n >> 1;
-
- x->mb_to_top_edge = edge[0] - ((y_idx * 16) << 3);
- x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
- x->mb_to_left_edge = edge[2] - ((x_idx * 16) << 3);
- x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 16) << 3);
-
- x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride + x_idx * 16;
- x->pre.u_buffer = u1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
- x->pre.v_buffer = v1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
-
- vp9_build_1st_inter16x16_predictors_mb(x,
- dst_y + y_idx * 16 * dst_ystride + x_idx * 16,
- dst_u + y_idx * 8 * dst_uvstride + x_idx * 8,
- dst_v + y_idx * 8 * dst_uvstride + x_idx * 8,
- dst_ystride, dst_uvstride);
- if (x->mode_info_context->mbmi.second_ref_frame > 0) {
- x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride + x_idx * 16;
- x->second_pre.u_buffer = u2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
- x->second_pre.v_buffer = v2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
-
- vp9_build_2nd_inter16x16_predictors_mb(x,
- dst_y + y_idx * 16 * dst_ystride + x_idx * 16,
- dst_u + y_idx * 8 * dst_uvstride + x_idx * 8,
- dst_v + y_idx * 8 * dst_uvstride + x_idx * 8,
- dst_ystride, dst_uvstride);
- }
- }
-
- x->mb_to_top_edge = edge[0];
- x->mb_to_bottom_edge = edge[1];
- x->mb_to_left_edge = edge[2];
- x->mb_to_right_edge = edge[3];
-
- x->pre.y_buffer = y1;
- x->pre.u_buffer = u1;
- x->pre.v_buffer = v1;
-
- if (x->mode_info_context->mbmi.second_ref_frame > 0) {
- x->second_pre.y_buffer = y2;
- x->second_pre.u_buffer = u2;
- x->second_pre.v_buffer = v2;
- }
-}
-#endif
-
-/*
- * The following functions should be called after an initial
- * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
- * They will run a second sixtap filter on a (different) ref
- * frame and average the result with the output of the
- * first sixtap filter. The second reference frame is stored
- * in x->second_pre (the reference frame index is in
- * x->mode_info_context->mbmi.second_ref_frame). The second
- * motion vector is x->mode_info_context->mbmi.second_mv.
- *
- * This allows blending prediction from two reference frames
- * which sometimes leads to better prediction than from a
- * single reference frame.
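- *
- * A minimal sketch of the intended call order (stride arguments
- * abbreviated):
- *
- *   vp9_build_1st_inter16x16_predictors_mb(xd, dst_y, dst_u, dst_v,
- *                                          ystride, uvstride);
- *   if (xd->mode_info_context->mbmi.second_ref_frame > 0)
- *     vp9_build_2nd_inter16x16_predictors_mb(xd, dst_y, dst_u, dst_v,
- *                                            ystride, uvstride);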
- */ -void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd, - unsigned char *dst_y, - int dst_ystride) { - unsigned char *ptr; - - int_mv _16x16mv; - int mv_row; - int mv_col; - - unsigned char *ptr_base = xd->second_pre.y_buffer; - int pre_stride = xd->block[0].pre_stride; - - _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int; - - if (xd->mode_info_context->mbmi.need_to_clamp_secondmv) - clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); - - mv_row = _16x16mv.as_mv.row; - mv_col = _16x16mv.as_mv.col; - - ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); - -#if CONFIG_PRED_FILTER - if (xd->mode_info_context->mbmi.pred_filter_enabled) { - if ((mv_row | mv_col) & 7) { - // Sub-pel filter needs extended input - int len = 15 + (VP9_INTERP_EXTEND << 1); - unsigned char Temp[32 * 32]; // Data required by sub-pel filter - unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1); - - // Copy extended MB into Temp array, applying the spatial filter - filter_mb(ptr - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride, - Temp, len, len, len); - - // Sub-pel filter - xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1, - (mv_row & 7) << 1, dst_y, dst_ystride); - } else { - // TODO Needs to AVERAGE with the dst_y - // For now, do not apply the prediction filter in these cases! - vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride); - } - } else -#endif // CONFIG_PRED_FILTER - { - if ((mv_row | mv_col) & 7) { - xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1, - (mv_row & 7) << 1, dst_y, dst_ystride); - } else { - vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride); - } - } -} - -void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_uvstride) { - int offset; - unsigned char *uptr, *vptr; - - int_mv _16x16mv; - int mv_row; - int mv_col; - int omv_row, omv_col; - - int pre_stride = xd->block[0].pre_stride; - - _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int; - - if (xd->mode_info_context->mbmi.need_to_clamp_secondmv) - clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); - - mv_row = _16x16mv.as_mv.row; - mv_col = _16x16mv.as_mv.col; - - /* calc uv motion vectors */ - omv_row = mv_row; - omv_col = mv_col; - mv_row = (mv_row + (mv_row > 0)) >> 1; - mv_col = (mv_col + (mv_col > 0)) >> 1; - - mv_row &= xd->fullpixel_mask; - mv_col &= xd->fullpixel_mask; - - pre_stride >>= 1; - offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); - uptr = xd->second_pre.u_buffer + offset; - vptr = xd->second_pre.v_buffer + offset; - -#if CONFIG_PRED_FILTER - if (xd->mode_info_context->mbmi.pred_filter_enabled) { - int i; - int len = 7 + (VP9_INTERP_EXTEND << 1); - unsigned char Temp[32 * 32]; // Data required by sub-pel filter - unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1); - unsigned char *pSrc = uptr; - unsigned char *pDst = dst_u; - - // U & V - for (i = 0; i < 2; i++) { - if ((omv_row | omv_col) & 15) { - // Copy extended MB into Temp array, applying the spatial filter - filter_mb(pSrc - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride, - Temp, len, len, len); - - // Sub-pel filter - xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15, - omv_row & 15, pDst, dst_uvstride); - } else { - // TODO Needs to AVERAGE with the dst_[u|v] - // For now, do not apply the prediction filter here! 
- vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride); - } - - // V - pSrc = vptr; - pDst = dst_v; - } - } else -#endif // CONFIG_PRED_FILTER - if ((omv_row | omv_col) & 15) { - xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15, - omv_row & 15, dst_u, dst_uvstride); - xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15, - omv_row & 15, dst_v, dst_uvstride); - } else { - vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); - vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); - } -} - -void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, - unsigned char *dst_y, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_ystride, - int dst_uvstride) { - vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride); - vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride); -} - -static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { - int i; - MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi; - BLOCKD *blockd = xd->block; - - if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) { - blockd[ 0].bmi = xd->mode_info_context->bmi[ 0]; - blockd[ 2].bmi = xd->mode_info_context->bmi[ 2]; - blockd[ 8].bmi = xd->mode_info_context->bmi[ 8]; - blockd[10].bmi = xd->mode_info_context->bmi[10]; - - if (mbmi->need_to_clamp_mvs) { - clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd); - if (mbmi->second_ref_frame > 0) { - clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd); - } - } - - - vp9_build_inter_predictors4b(xd, &blockd[ 0], 16); - vp9_build_inter_predictors4b(xd, &blockd[ 2], 16); - vp9_build_inter_predictors4b(xd, &blockd[ 8], 16); - vp9_build_inter_predictors4b(xd, &blockd[10], 16); - - if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16); - vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16); - vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16); - vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16); - } - } else { - for (i = 0; i < 16; i += 2) { - BLOCKD *d0 = &blockd[i]; - BLOCKD *d1 = &blockd[i + 1]; - - blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0]; - blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1]; - - if (mbmi->need_to_clamp_mvs) { - clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd); - if (mbmi->second_ref_frame > 0) { - clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd); - } - } - - if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) - build_inter_predictors2b(xd, d0, 16); - else { - vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict); - vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict); - } - - if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg); - vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg); - } - } - } - - for (i = 16; i < 24; i += 2) { - BLOCKD *d0 = &blockd[i]; - BLOCKD *d1 = &blockd[i + 1]; - - if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) - build_inter_predictors2b(xd, d0, 8); 
- else { - vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict); - vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict); - } - - if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg); - vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg); - } - } -} - -static -void build_4x4uvmvs(MACROBLOCKD *xd) { - int i, j; - BLOCKD *blockd = xd->block; - - for (i = 0; i < 2; i++) { - for (j = 0; j < 2; j++) { - int yoffset = i * 8 + j * 2; - int uoffset = 16 + i * 2 + j; - int voffset = 20 + i * 2 + j; - - int temp; - - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row - + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row - + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row - + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row; - - if (temp < 0) temp -= 4; - else temp += 4; - - blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) & - xd->fullpixel_mask; - - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col - + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col - + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col - + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col; - - if (temp < 0) temp -= 4; - else temp += 4; - - blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) & - xd->fullpixel_mask; - - // if (x->mode_info_context->mbmi.need_to_clamp_mvs) - clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd); - - // if (x->mode_info_context->mbmi.need_to_clamp_mvs) - clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd); - - blockd[voffset].bmi.as_mv.first.as_mv.row = - blockd[uoffset].bmi.as_mv.first.as_mv.row; - blockd[voffset].bmi.as_mv.first.as_mv.col = - blockd[uoffset].bmi.as_mv.first.as_mv.col; - - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row - + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row - + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row - + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row; - - if (temp < 0) { - temp -= 4; - } else { - temp += 4; - } - - blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) & - xd->fullpixel_mask; - - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col - + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col - + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col - + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col; - - if (temp < 0) { - temp -= 4; - } else { - temp += 4; - } - - blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) & - xd->fullpixel_mask; - - // if (mbmi->need_to_clamp_mvs) - clamp_uvmv_to_umv_border( - &blockd[uoffset].bmi.as_mv.second.as_mv, xd); - - // if (mbmi->need_to_clamp_mvs) - clamp_uvmv_to_umv_border( - &blockd[uoffset].bmi.as_mv.second.as_mv, xd); - - blockd[voffset].bmi.as_mv.second.as_mv.row = - blockd[uoffset].bmi.as_mv.second.as_mv.row; - blockd[voffset].bmi.as_mv.second.as_mv.col = - blockd[uoffset].bmi.as_mv.second.as_mv.col; - } - } - } -} - -void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) { - if (xd->mode_info_context->mbmi.mode != SPLITMV) { - vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor, - &xd->predictor[256], - &xd->predictor[320], 16, 8); - - if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - /* 256 = offset of U plane in Y+U+V buffer; - * 320 = offset of V plane in Y+U+V buffer. 
- * (256=16x16, 320=16x16+8x8). */ - vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor, - &xd->predictor[256], - &xd->predictor[320], 16, 8); - } -#if CONFIG_COMP_INTERINTRA_PRED - else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { - vp9_build_interintra_16x16_predictors_mb(xd, xd->predictor, - &xd->predictor[256], - &xd->predictor[320], 16, 8); - } -#endif - } else { - build_4x4uvmvs(xd); - build_inter4x4_predictors_mb(xd); - } -} diff --git a/vp9/common/reconinter.h b/vp9/common/reconinter.h deleted file mode 100644 index ad1cf9197..000000000 --- a/vp9/common/reconinter.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef __INC_RECONINTER_H -#define __INC_RECONINTER_H - -#include "onyxc_int.h" - -extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, - unsigned char *dst_y, - int dst_ystride, - int clamp_mvs); - -extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_uvstride); - -extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd, - unsigned char *dst_y, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_ystride, - int dst_uvstride); - -extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd, - unsigned char *dst_y, - int dst_ystride); - -extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_uvstride); - -extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, - unsigned char *dst_y, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_ystride, - int dst_uvstride); - -#if CONFIG_SUPERBLOCKS -extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x, - unsigned char *dst_y, - unsigned char *dst_u, - unsigned char *dst_v, - int dst_ystride, - int dst_uvstride); -#endif - -extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd); - -extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf); - -extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf); - -extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, - int pitch); - -extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd, - BLOCKD *d, int pitch); - -extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd); - -extern void vp9_setup_interp_filters(MACROBLOCKD *xd, - INTERPOLATIONFILTERTYPE filter, - VP9_COMMON *cm); - -#endif // __INC_RECONINTER_H diff --git a/vp9/common/reconintra.c b/vp9/common/reconintra.c deleted file mode 100644 index 89533c7ae..000000000 --- a/vp9/common/reconintra.c +++ /dev/null @@ -1,819 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
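A note on build_4x4uvmvs() above, since its rounding is easy to misread: each chroma 4x4 takes the four corresponding luma 4x4 MVs, averages them, and halves the result (chroma is half resolution), rounding half away from zero. Isolated as a sketch, with average_uv_mv_component a hypothetical name:

    /* Sketch only: one component of the derived chroma MV.  The inputs
     * and result are in 1/8-pel units; the +/-4 before the divide by 8
     * rounds half away from zero, and the mask clears the sub-pel bits
     * when the frame is coded full-pel-only. */
    static int average_uv_mv_component(int mv0, int mv1, int mv2, int mv3,
                                       int fullpixel_mask) {
      int temp = mv0 + mv1 + mv2 + mv3;
      if (temp < 0)
        temp -= 4;
      else
        temp += 4;
      return (temp / 8) & fullpixel_mask;
    }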
- */ - -#include -#include "vpx_ports/config.h" -#include "vp9_rtcd.h" -#include "reconintra.h" -#include "vpx_mem/vpx_mem.h" - -/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) - * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd). - */ - -static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n, - uint8_t *yabove_row, uint8_t *yleft_col) { - int r, c, h, w, v; - int a, b; - r = 0; - for (c = 0; c < n - 2; c++) { - if (c & 1) - a = yleft_col[r + 1]; - else - a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1; - b = yabove_row[c + 2]; - ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3); - } - for (r = 1; r < n / 2 - 1; r++) { - for (c = 0; c < n - 2 - 2 * r; c++) { - if (c & 1) - a = yleft_col[r + 1]; - else - a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1; - b = ypred_ptr[(r - 1) * y_stride + c + 2]; - ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3); - } - } - for (; r < n - 1; ++r) { - for (c = 0; c < n; c++) { - v = (c & 1 ? yleft_col[r + 1] : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1); - h = r - c / 2; - ypred_ptr[h * y_stride + c] = v; - } - } - c = 0; - r = n - 1; - ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] + - yleft_col[r] + 1) >> 1; - for (r = n - 2; r >= n / 2; --r) { - w = c + (n - 1 - r) * 2; - ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] + - ypred_ptr[r * y_stride + w - 1] + 1) >> 1; - } - for (c = 1; c < n; c++) { - for (r = n - 1; r >= n / 2 + c / 2; --r) { - w = c + (n - 1 - r) * 2; - ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] + - ypred_ptr[r * y_stride + w - 1] + 1) >> 1; - } - } -} - -static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n, - uint8_t *yabove_row, uint8_t *yleft_col) { - int r, c, h, w, v; - int a, b; - c = 0; - for (r = 0; r < n - 2; r++) { - if (r & 1) - a = yabove_row[c + 1]; - else - a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1; - b = yleft_col[r + 2]; - ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3); - } - for (c = 1; c < n / 2 - 1; c++) { - for (r = 0; r < n - 2 - 2 * c; r++) { - if (r & 1) - a = yabove_row[c + 1]; - else - a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1; - b = ypred_ptr[(r + 2) * y_stride + c - 1]; - ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3); - } - } - for (; c < n - 1; ++c) { - for (r = 0; r < n; r++) { - v = (r & 1 ? 
yabove_row[c + 1] : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1); - w = c - r / 2; - ypred_ptr[r * y_stride + w] = v; - } - } - r = 0; - c = n - 1; - ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1; - for (c = n - 2; c >= n / 2; --c) { - h = r + (n - 1 - c) * 2; - ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] + - ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1; - } - for (r = 1; r < n; r++) { - for (c = n - 1; c >= n / 2 + r / 2; --c) { - h = r + (n - 1 - c) * 2; - ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] + - ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1; - } - } -} - -static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n, - uint8_t *yabove_row, uint8_t *yleft_col) { - int r, c; - for (r = 0; r < n - 1; ++r) { - for (c = 0; c <= r; ++c) { - ypred_ptr[(r - c) * y_stride + c] = - (yabove_row[r + 1] * (c + 1) + - yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2); - } - } - for (c = 0; c <= r; ++c) { - int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1]; - int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1]; - yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext)); - yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext)); - ypred_ptr[(r - c) * y_stride + c] = - (yabove_ext * (c + 1) + - yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2); - } - for (r = 1; r < n; ++r) { - for (c = n - r; c < n; ++c) - ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] + - ypred_ptr[r * y_stride + c - 1] + 1) >> 1; - } -} - -static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n, - uint8_t *yabove_row, uint8_t *yleft_col) { - int r, c; - for (c = 0; c < n; c++) - ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1; - ypred_ptr += y_stride; - for (c = 0; c < n; c++) - ypred_ptr[c] = yabove_row[c - 1]; - ypred_ptr += y_stride; - for (r = 2; r < n; ++r) { - ypred_ptr[0] = yleft_col[r - 2]; - for (c = 1; c < n; c++) - ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1]; - ypred_ptr += y_stride; - } -} - -static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n, - uint8_t *yabove_row, uint8_t *yleft_col) { - int r, c; - ypred_ptr[0] = yabove_row[-1]; - for (c = 1; c < n; c++) - ypred_ptr[c] = yabove_row[c - 1]; - for (r = 1; r < n; ++r) - ypred_ptr[r * y_stride] = yleft_col[r - 1]; - - ypred_ptr += y_stride; - for (r = 1; r < n; ++r) { - for (c = 1; c < n; c++) { - ypred_ptr[c] = ypred_ptr[-y_stride + c - 1]; - } - ypred_ptr += y_stride; - } -} - -static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n, - uint8_t *yabove_row, uint8_t *yleft_col) { - int r, c; - ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1; - for (r = 1; r < n; r++) - ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1; - ypred_ptr++; - ypred_ptr[0] = yabove_row[-1]; - for (r = 1; r < n; r++) - ypred_ptr[r * y_stride] = yleft_col[r - 1]; - ypred_ptr++; - - for (c = 0; c < n - 2; c++) - ypred_ptr[c] = yabove_row[c]; - ypred_ptr += y_stride; - for (r = 1; r < n; ++r) { - for (c = 0; c < n - 2; c++) - ypred_ptr[c] = ypred_ptr[-y_stride + c - 2]; - ypred_ptr += y_stride; - } -} - -static void corner_predictor(unsigned char *ypred_ptr, int y_stride, int n, - unsigned char *yabove_row, - unsigned char *yleft_col) { - int mh, mv, maxgradh, maxgradv, x, y, nx, ny; - int i, j; - int top_left = yabove_row[-1]; - mh = mv = 0; - maxgradh = yabove_row[1] - top_left; - maxgradv = yleft_col[1] - top_left; - for (i = 2; i < n; ++i) { - int gh = yabove_row[i] - 
yabove_row[i - 2]; - int gv = yleft_col[i] - yleft_col[i - 2]; - if (gh > maxgradh) { - maxgradh = gh; - mh = i - 1; - } - if (gv > maxgradv) { - maxgradv = gv; - mv = i - 1; - } - } - nx = mh + mv + 3; - ny = 2 * n + 1 - nx; - - x = top_left; - for (i = 0; i <= mh; ++i) x += yabove_row[i]; - for (i = 0; i <= mv; ++i) x += yleft_col[i]; - x += (nx >> 1); - x /= nx; - y = 0; - for (i = mh + 1; i < n; ++i) y += yabove_row[i]; - for (i = mv + 1; i < n; ++i) y += yleft_col[i]; - y += (ny >> 1); - y /= ny; - - for (i = 0; i < n; ++i) { - for (j = 0; j < n; ++j) - ypred_ptr[j] = (i <= mh && j <= mv ? x : y); - ypred_ptr += y_stride; - } -} - -void vp9_recon_intra_mbuv(MACROBLOCKD *xd) { - int i; - for (i = 16; i < 24; i += 2) { - BLOCKD *b = &xd->block[i]; - vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); - } -} - -void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride, - unsigned char *ypred_ptr, - int y_stride, int mode, int bsize, - int up_available, int left_available) { - - unsigned char *yabove_row = src - src_stride; - unsigned char yleft_col[32]; - unsigned char ytop_left = yabove_row[-1]; - int r, c, i; - - for (i = 0; i < bsize; i++) { - yleft_col[i] = src[i * src_stride - 1]; - } - - /* for Y */ - switch (mode) { - case DC_PRED: { - int expected_dc; - int i; - int shift; - int average = 0; - int log2_bsize_minus_1; - - assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32); - if (bsize == 4) { - log2_bsize_minus_1 = 1; - } else if (bsize == 8) { - log2_bsize_minus_1 = 2; - } else if (bsize == 16) { - log2_bsize_minus_1 = 3; - } else /* bsize == 32 */ { - log2_bsize_minus_1 = 4; - } - - if (up_available || left_available) { - if (up_available) { - for (i = 0; i < bsize; i++) { - average += yabove_row[i]; - } - } - - if (left_available) { - for (i = 0; i < bsize; i++) { - average += yleft_col[i]; - } - } - shift = log2_bsize_minus_1 + up_available + left_available; - expected_dc = (average + (1 << (shift - 1))) >> shift; - } else { - expected_dc = 128; - } - - for (r = 0; r < bsize; r++) { - vpx_memset(ypred_ptr, expected_dc, bsize); - ypred_ptr += y_stride; - } - } - break; - case V_PRED: { - for (r = 0; r < bsize; r++) { - memcpy(ypred_ptr, yabove_row, bsize); - ypred_ptr += y_stride; - } - } - break; - case H_PRED: { - for (r = 0; r < bsize; r++) { - vpx_memset(ypred_ptr, yleft_col[r], bsize); - ypred_ptr += y_stride; - } - } - break; - case TM_PRED: { - for (r = 0; r < bsize; r++) { - for (c = 0; c < bsize; c++) { - int pred = yleft_col[r] + yabove_row[ c] - ytop_left; - - if (pred < 0) - pred = 0; - - if (pred > 255) - pred = 255; - - ypred_ptr[c] = pred; - } - - ypred_ptr += y_stride; - } - } - break; - case D45_PRED: { - d45_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D135_PRED: { - d135_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D117_PRED: { - d117_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D153_PRED: { - d153_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D27_PRED: { - d27_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case D63_PRED: { - d63_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); - } - break; - case I8X8_PRED: - case B_PRED: - case NEARESTMV: - case NEARMV: - case ZEROMV: - case NEWMV: - case SPLITMV: - case MB_MODE_COUNT: - break; - } -} - -#if CONFIG_COMP_INTERINTRA_PRED -static void 
combine_interintra(MB_PREDICTION_MODE mode, - unsigned char *interpred, - int interstride, - unsigned char *intrapred, - int intrastride, - int size) { - // TODO(debargha): Explore different ways of combining predictors - // or designing the tables below - static const int scale_bits = 8; - static const int scale_max = 1 << scale_bits; - static const int scale_round = (1 << scale_bits) - 1; - // This table is a function A + B*exp(-kx), where x is hor. index - static const int weights1d[32] = { - 128, 122, 116, 111, 107, 103, 99, 96, - 93, 90, 88, 85, 83, 81, 80, 78, - 77, 76, 75, 74, 73, 72, 71, 70, - 70, 69, 69, 68, 68, 68, 67, 67, - }; - // This table is a function A + B*exp(-k.sqrt(xy)), where x, y are - // hor. and vert. indices - static const int weights2d[1024] = { - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 122, 120, 118, 116, 115, 114, 113, - 112, 111, 111, 110, 109, 109, 108, 107, - 107, 106, 106, 105, 105, 104, 104, 104, - 103, 103, 102, 102, 102, 101, 101, 101, - 128, 120, 116, 114, 112, 111, 109, 108, - 107, 106, 105, 104, 103, 102, 102, 101, - 100, 100, 99, 99, 98, 97, 97, 96, - 96, 96, 95, 95, 94, 94, 93, 93, - 128, 118, 114, 111, 109, 107, 106, 104, - 103, 102, 101, 100, 99, 98, 97, 97, - 96, 95, 95, 94, 93, 93, 92, 92, - 91, 91, 90, 90, 90, 89, 89, 88, - 128, 116, 112, 109, 107, 105, 103, 102, - 100, 99, 98, 97, 96, 95, 94, 93, - 93, 92, 91, 91, 90, 90, 89, 89, - 88, 88, 87, 87, 86, 86, 85, 85, - 128, 115, 111, 107, 105, 103, 101, 99, - 98, 97, 96, 94, 93, 93, 92, 91, - 90, 89, 89, 88, 88, 87, 86, 86, - 85, 85, 84, 84, 84, 83, 83, 82, - 128, 114, 109, 106, 103, 101, 99, 97, - 96, 95, 93, 92, 91, 90, 90, 89, - 88, 87, 87, 86, 85, 85, 84, 84, - 83, 83, 82, 82, 82, 81, 81, 80, - 128, 113, 108, 104, 102, 99, 97, 96, - 94, 93, 92, 91, 90, 89, 88, 87, - 86, 85, 85, 84, 84, 83, 83, 82, - 82, 81, 81, 80, 80, 79, 79, 79, - 128, 112, 107, 103, 100, 98, 96, 94, - 93, 91, 90, 89, 88, 87, 86, 85, - 85, 84, 83, 83, 82, 82, 81, 80, - 80, 80, 79, 79, 78, 78, 78, 77, - 128, 111, 106, 102, 99, 97, 95, 93, - 91, 90, 89, 88, 87, 86, 85, 84, - 83, 83, 82, 81, 81, 80, 80, 79, - 79, 78, 78, 77, 77, 77, 76, 76, - 128, 111, 105, 101, 98, 96, 93, 92, - 90, 89, 88, 86, 85, 84, 84, 83, - 82, 81, 81, 80, 80, 79, 79, 78, - 78, 77, 77, 76, 76, 76, 75, 75, - 128, 110, 104, 100, 97, 94, 92, 91, - 89, 88, 86, 85, 84, 83, 83, 82, - 81, 80, 80, 79, 79, 78, 78, 77, - 77, 76, 76, 75, 75, 75, 74, 74, - 128, 109, 103, 99, 96, 93, 91, 90, - 88, 87, 85, 84, 83, 82, 82, 81, - 80, 79, 79, 78, 78, 77, 77, 76, - 76, 75, 75, 75, 74, 74, 74, 73, - 128, 109, 102, 98, 95, 93, 90, 89, - 87, 86, 84, 83, 82, 81, 81, 80, - 79, 78, 78, 77, 77, 76, 76, 75, - 75, 75, 74, 74, 73, 73, 73, 73, - 128, 108, 102, 97, 94, 92, 90, 88, - 86, 85, 84, 83, 82, 81, 80, 79, - 78, 78, 77, 77, 76, 76, 75, 75, - 74, 74, 73, 73, 73, 73, 72, 72, - 128, 107, 101, 97, 93, 91, 89, 87, - 85, 84, 83, 82, 81, 80, 79, 78, - 78, 77, 76, 76, 75, 75, 74, 74, - 74, 73, 73, 73, 72, 72, 72, 71, - 128, 107, 100, 96, 93, 90, 88, 86, - 85, 83, 82, 81, 80, 79, 78, 78, - 77, 76, 76, 75, 75, 74, 74, 73, - 73, 73, 72, 72, 72, 71, 71, 71, - 128, 106, 100, 95, 92, 89, 87, 85, - 84, 83, 81, 80, 79, 78, 78, 77, - 76, 76, 75, 75, 74, 74, 73, 73, - 72, 72, 72, 72, 71, 71, 71, 70, - 128, 106, 99, 95, 91, 89, 87, 85, - 83, 82, 81, 80, 79, 78, 77, 76, - 76, 75, 75, 74, 74, 73, 73, 72, - 72, 72, 71, 71, 71, 71, 70, 70, - 128, 105, 99, 94, 91, 88, 
86, 84, - 83, 81, 80, 79, 78, 77, 77, 76, - 75, 75, 74, 74, 73, 73, 72, 72, - 72, 71, 71, 71, 70, 70, 70, 70, - 128, 105, 98, 93, 90, 88, 85, 84, - 82, 81, 80, 79, 78, 77, 76, 75, - 75, 74, 74, 73, 73, 72, 72, 71, - 71, 71, 71, 70, 70, 70, 70, 69, - 128, 104, 97, 93, 90, 87, 85, 83, - 82, 80, 79, 78, 77, 76, 76, 75, - 74, 74, 73, 73, 72, 72, 71, 71, - 71, 70, 70, 70, 70, 69, 69, 69, - 128, 104, 97, 92, 89, 86, 84, 83, - 81, 80, 79, 78, 77, 76, 75, 74, - 74, 73, 73, 72, 72, 71, 71, 71, - 70, 70, 70, 70, 69, 69, 69, 69, - 128, 104, 96, 92, 89, 86, 84, 82, - 80, 79, 78, 77, 76, 75, 75, 74, - 73, 73, 72, 72, 71, 71, 71, 70, - 70, 70, 70, 69, 69, 69, 69, 68, - 128, 103, 96, 91, 88, 85, 83, 82, - 80, 79, 78, 77, 76, 75, 74, 74, - 73, 72, 72, 72, 71, 71, 70, 70, - 70, 70, 69, 69, 69, 69, 68, 68, - 128, 103, 96, 91, 88, 85, 83, 81, - 80, 78, 77, 76, 75, 75, 74, 73, - 73, 72, 72, 71, 71, 70, 70, 70, - 70, 69, 69, 69, 69, 68, 68, 68, - 128, 102, 95, 90, 87, 84, 82, 81, - 79, 78, 77, 76, 75, 74, 73, 73, - 72, 72, 71, 71, 71, 70, 70, 70, - 69, 69, 69, 69, 68, 68, 68, 68, - 128, 102, 95, 90, 87, 84, 82, 80, - 79, 77, 76, 75, 75, 74, 73, 73, - 72, 72, 71, 71, 70, 70, 70, 69, - 69, 69, 69, 68, 68, 68, 68, 68, - 128, 102, 94, 90, 86, 84, 82, 80, - 78, 77, 76, 75, 74, 73, 73, 72, - 72, 71, 71, 70, 70, 70, 69, 69, - 69, 69, 68, 68, 68, 68, 68, 67, - 128, 101, 94, 89, 86, 83, 81, 79, - 78, 77, 76, 75, 74, 73, 73, 72, - 71, 71, 71, 70, 70, 69, 69, 69, - 69, 68, 68, 68, 68, 68, 67, 67, - 128, 101, 93, 89, 85, 83, 81, 79, - 78, 76, 75, 74, 74, 73, 72, 72, - 71, 71, 70, 70, 70, 69, 69, 69, - 68, 68, 68, 68, 68, 67, 67, 67, - 128, 101, 93, 88, 85, 82, 80, 79, - 77, 76, 75, 74, 73, 73, 72, 71, - 71, 70, 70, 70, 69, 69, 69, 68, - 68, 68, 68, 68, 67, 67, 67, 67, - }; - int size_scale = (size == 32 ? 1 : - size == 16 ? 2 : - size == 8 ? 
4 : 8); - int i, j; - switch (mode) { - case V_PRED: - for (i = 0; i < size; ++i) { - for (j = 0; j < size; ++j) { - int k = i * interstride + j; - int scale = weights1d[i * size_scale]; - interpred[k] = - ((scale_max - scale) * interpred[k] + - scale * intrapred[i * intrastride + j] + scale_round) - >> scale_bits; - } - } - break; - - case H_PRED: - for (i = 0; i < size; ++i) { - for (j = 0; j < size; ++j) { - int k = i * interstride + j; - int scale = weights1d[j * size_scale]; - interpred[k] = - ((scale_max - scale) * interpred[k] + - scale * intrapred[i * intrastride + j] + scale_round) - >> scale_bits; - } - } - break; - - case D63_PRED: - case D117_PRED: - for (i = 0; i < size; ++i) { - for (j = 0; j < size; ++j) { - int k = i * interstride + j; - int scale = (weights2d[i * size_scale * 32 + j * size_scale] + - weights1d[i * size_scale]) >> 1; - interpred[k] = - ((scale_max - scale) * interpred[k] + - scale * intrapred[i * intrastride + j] + scale_round) - >> scale_bits; - } - } - break; - - case D27_PRED: - case D153_PRED: - for (i = 0; i < size; ++i) { - for (j = 0; j < size; ++j) { - int k = i * interstride + j; - int scale = (weights2d[i * size_scale * 32 + j * size_scale] + - weights1d[j * size_scale]) >> 1; - interpred[k] = - ((scale_max - scale) * interpred[k] + - scale * intrapred[i * intrastride + j] + scale_round) - >> scale_bits; - } - } - break; - - case D135_PRED: - for (i = 0; i < size; ++i) { - for (j = 0; j < size; ++j) { - int k = i * interstride + j; - int scale = weights2d[i * size_scale * 32 + j * size_scale]; - interpred[k] = - ((scale_max - scale) * interpred[k] + - scale * intrapred[i * intrastride + j] + scale_round) - >> scale_bits; - } - } - break; - - case D45_PRED: - case DC_PRED: - case TM_PRED: - default: - // simple average - for (i = 0; i < size; ++i) { - for (j = 0; j < size; ++j) { - int k = i * interstride + j; - interpred[k] = (interpred[k] + intrapred[i * intrastride + j]) >> 1; - } - } - break; - } -} - -void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd, - unsigned char *ypred, - unsigned char *upred, - unsigned char *vpred, - int ystride, int uvstride) { - vp9_build_interintra_16x16_predictors_mby(xd, ypred, ystride); - vp9_build_interintra_16x16_predictors_mbuv(xd, upred, vpred, uvstride); -} - -void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd, - unsigned char *ypred, - int ystride) { - static const int scale_bits = 6; - unsigned char intrapredictor[256]; - int i, j; - vp9_build_intra_predictors_internal( - xd->dst.y_buffer, xd->dst.y_stride, - intrapredictor, 16, - xd->mode_info_context->mbmi.interintra_mode, 16, - xd->up_available, xd->left_available); - combine_interintra(xd->mode_info_context->mbmi.interintra_mode, - ypred, ystride, intrapredictor, 16, 16); -} - -void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd, - unsigned char *upred, - unsigned char *vpred, - int uvstride) { - int i, j; - unsigned char uintrapredictor[64]; - unsigned char vintrapredictor[64]; - vp9_build_intra_predictors_internal( - xd->dst.u_buffer, xd->dst.uv_stride, - uintrapredictor, 8, - xd->mode_info_context->mbmi.interintra_uv_mode, 8, - xd->up_available, xd->left_available); - vp9_build_intra_predictors_internal( - xd->dst.v_buffer, xd->dst.uv_stride, - vintrapredictor, 8, - xd->mode_info_context->mbmi.interintra_uv_mode, 8, - xd->up_available, xd->left_available); - combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, - upred, uvstride, uintrapredictor, 8, 8); - 
combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, - vpred, uvstride, vintrapredictor, 8, 8); -} -#endif - -void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) { - vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, - xd->predictor, 16, - xd->mode_info_context->mbmi.mode, 16, - xd->up_available, xd->left_available); -} - -void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) { - vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, - xd->dst.y_buffer, xd->dst.y_stride, - xd->mode_info_context->mbmi.mode, 16, - xd->up_available, xd->left_available); -} - -#if CONFIG_SUPERBLOCKS -void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) { - vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, - xd->dst.y_buffer, xd->dst.y_stride, - xd->mode_info_context->mbmi.mode, 32, - xd->up_available, xd->left_available); -} -#endif - -#if CONFIG_COMP_INTRA_PRED -void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) { - unsigned char predictor[2][256]; - int i; - - vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, - predictor[0], 16, - xd->mode_info_context->mbmi.mode, - 16, xd->up_available, - xd->left_available); - vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, - predictor[1], 16, - xd->mode_info_context->mbmi.second_mode, - 16, xd->up_available, - xd->left_available); - - for (i = 0; i < 256; i++) { - xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1; - } -} -#endif - -void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd, - unsigned char *upred_ptr, - unsigned char *vpred_ptr, - int uv_stride, - int mode, int bsize) { - vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride, - upred_ptr, uv_stride, mode, bsize, - xd->up_available, xd->left_available); - vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride, - vpred_ptr, uv_stride, mode, bsize, - xd->up_available, xd->left_available); -} - -void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) { - vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256], - &xd->predictor[320], 8, - xd->mode_info_context->mbmi.uv_mode, - 8); -} - -void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) { - vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer, - xd->dst.v_buffer, - xd->dst.uv_stride, - xd->mode_info_context->mbmi.uv_mode, - 8); -} - -#if CONFIG_SUPERBLOCKS -void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) { - vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer, - xd->dst.v_buffer, xd->dst.uv_stride, - xd->mode_info_context->mbmi.uv_mode, - 16); -} -#endif - -#if CONFIG_COMP_INTRA_PRED -void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) { - unsigned char predictor[2][2][64]; - int i; - - vp9_build_intra_predictors_mbuv_internal( - xd, predictor[0][0], predictor[1][0], 8, - xd->mode_info_context->mbmi.uv_mode, 8); - vp9_build_intra_predictors_mbuv_internal( - xd, predictor[0][1], predictor[1][1], 8, - xd->mode_info_context->mbmi.second_uv_mode, 8); - for (i = 0; i < 64; i++) { - xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1; - xd->predictor[256 + 64 + i] = (predictor[1][0][i] + - predictor[1][1][i] + 1) >> 1; - } -} -#endif - -void vp9_intra8x8_predict(BLOCKD *xd, - int mode, - unsigned char *predictor) { - vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst, - xd->dst_stride, predictor, 16, - mode, 8, 1, 1); -} - -#if CONFIG_COMP_INTRA_PRED -void vp9_comp_intra8x8_predict(BLOCKD *xd, - int mode, 
int second_mode,
-                               unsigned char *out_predictor) {
-  unsigned char predictor[2][8 * 16];
-  int i, j;
-
-  vp9_intra8x8_predict(xd, mode, predictor[0]);
-  vp9_intra8x8_predict(xd, second_mode, predictor[1]);
-
-  for (i = 0; i < 8 * 16; i += 16) {
-    for (j = i; j < i + 8; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
-void vp9_intra_uv4x4_predict(BLOCKD *xd,
-                             int mode,
-                             unsigned char *predictor) {
-  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
-                                      xd->dst_stride, predictor, 8,
-                                      mode, 4, 1, 1);
-}
-
-#if CONFIG_COMP_INTRA_PRED
-void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
-                                  int mode, int mode2,
-                                  unsigned char *out_predictor) {
-  unsigned char predictor[2][8 * 4];
-  int i, j;
-
-  vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
-  vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
-
-  for (i = 0; i < 4 * 8; i += 8) {
-    for (j = i; j < i + 4; j++) {
-      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
-    }
-  }
-}
-#endif
-
-/* TODO: try different ways of using Y-UV mode correlation.
-   Current code assumes that a UV 4x4 block uses the same mode
-   as the corresponding Y 8x8 area.
- */
diff --git a/vp9/common/reconintra.h b/vp9/common/reconintra.h
deleted file mode 100644
index c72e8e912..000000000
--- a/vp9/common/reconintra.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_RECONINTRA_H
-#define __INC_RECONINTRA_H
-
-#include "blockd.h"
-
-extern void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
-extern B_PREDICTION_MODE vp9_find_dominant_direction(unsigned char *ptr,
-                                                     int stride, int n);
-extern B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
-#if CONFIG_COMP_INTERINTRA_PRED
-extern void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
-                                                     unsigned char *ypred,
-                                                     unsigned char *upred,
-                                                     unsigned char *vpred,
-                                                     int ystride,
-                                                     int uvstride);
-extern void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
-                                                      unsigned char *ypred,
-                                                      int ystride);
-extern void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
-                                                       unsigned char *upred,
-                                                       unsigned char *vpred,
-                                                       int uvstride);
-#endif
-
-#endif // __INC_RECONINTRA_H
diff --git a/vp9/common/reconintra4x4.c b/vp9/common/reconintra4x4.c
deleted file mode 100644
index 1cf960cb7..000000000
--- a/vp9/common/reconintra4x4.c
+++ /dev/null
@@ -1,472 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
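Stepping back to vp9_build_intra_predictors_internal() in reconintra.c above: the DC_PRED arithmetic works because the divisor (the number of summed border pixels) is always a power of two, so the divide is a rounded shift. A sketch under that assumption, with dc_value a hypothetical name:

    /* Sketch only: DC prediction value from the available borders.
     * n_above/n_left are bsize or 0, so their sum is a power of two;
     * adding half the divisor before shifting rounds to nearest. */
    static int dc_value(int sum_above, int n_above, int sum_left, int n_left) {
      int count = n_above + n_left;
      int shift = 0;
      if (count == 0)
        return 128;  /* no neighbors available: mid-gray */
      while ((1 << shift) < count)
        shift++;     /* shift = log2(count) */
      return (sum_above + sum_left + (1 << (shift - 1))) >> shift;
    }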
- */ - - -#include "vpx_ports/config.h" -#include "vpx_mem/vpx_mem.h" -#include "reconintra.h" -#include "vp9_rtcd.h" - -#if CONFIG_NEWBINTRAMODES -static int find_grad_measure(unsigned char *x, int stride, int n, int t, - int dx, int dy) { - int i, j; - int count = 0, gsum = 0, gdiv; - /* TODO: Make this code more efficient by breaking up into two loops */ - for (i = -t; i < n; ++i) - for (j = -t; j < n; ++j) { - int g; - if (i >= 0 && j >= 0) continue; - if (i + dy >= 0 && j + dx >= 0) continue; - if (i + dy < -t || i + dy >= n || j + dx < -t || j + dx >= n) continue; - g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]); - gsum += g * g; - count++; - } - gdiv = (dx * dx + dy * dy) * count; - return ((gsum << 8) + (gdiv >> 1)) / gdiv; -} - -#if CONTEXT_PRED_REPLACEMENTS == 6 -B_PREDICTION_MODE vp9_find_dominant_direction( - unsigned char *ptr, int stride, int n) { - int g[8], i, imin, imax; - g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); - g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1); - g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); - g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); - g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1); - g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); - imin = 1; - for (i = 2; i < 8; i += 1 + (i == 3)) - imin = (g[i] < g[imin] ? i : imin); - imax = 1; - for (i = 2; i < 8; i += 1 + (i == 3)) - imax = (g[i] > g[imax] ? i : imax); - /* - printf("%d %d %d %d %d %d = %d %d\n", - g[1], g[2], g[3], g[5], g[6], g[7], imin, imax); - */ - switch (imin) { - case 1: - return B_HD_PRED; - case 2: - return B_RD_PRED; - case 3: - return B_VR_PRED; - case 5: - return B_VL_PRED; - case 6: - return B_LD_PRED; - case 7: - return B_HU_PRED; - default: - assert(0); - } -} -#elif CONTEXT_PRED_REPLACEMENTS == 4 -B_PREDICTION_MODE vp9_find_dominant_direction( - unsigned char *ptr, int stride, int n) { - int g[8], i, imin, imax; - g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); - g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); - g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); - g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); - imin = 1; - for (i = 3; i < 8; i+=2) - imin = (g[i] < g[imin] ? i : imin); - imax = 1; - for (i = 3; i < 8; i+=2) - imax = (g[i] > g[imax] ? i : imax); - /* - printf("%d %d %d %d = %d %d\n", - g[1], g[3], g[5], g[7], imin, imax); - */ - switch (imin) { - case 1: - return B_HD_PRED; - case 3: - return B_VR_PRED; - case 5: - return B_VL_PRED; - case 7: - return B_HU_PRED; - default: - assert(0); - } -} -#elif CONTEXT_PRED_REPLACEMENTS == 0 -B_PREDICTION_MODE vp9_find_dominant_direction( - unsigned char *ptr, int stride, int n) { - int g[8], i, imin, imin2, imax; - g[0] = find_grad_measure(ptr, stride, n, 4, 1, 0); - g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); - g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1); - g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); - g[4] = find_grad_measure(ptr, stride, n, 4, 0, 1); - g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); - g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1); - g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); - imax = 0; - for (i = 1; i < 8; i++) - imax = (g[i] > g[imax] ? i : imax); - imin = 0; - for (i = 1; i < 8; i++) - imin = (g[i] < g[imin] ? 
i : imin); - - switch (imin) { - case 0: - return B_HE_PRED; - case 1: - return B_HD_PRED; - case 2: - return B_RD_PRED; - case 3: - return B_VR_PRED; - case 4: - return B_VE_PRED; - case 5: - return B_VL_PRED; - case 6: - return B_LD_PRED; - case 7: - return B_HU_PRED; - default: - assert(0); - } -} -#endif - -B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) { - unsigned char *ptr = *(x->base_dst) + x->dst; - int stride = x->dst_stride; - return vp9_find_dominant_direction(ptr, stride, 4); -} -#endif - -void vp9_intra4x4_predict(BLOCKD *x, - int b_mode, - unsigned char *predictor) { - int i, r, c; - - unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride; - unsigned char Left[4]; - unsigned char top_left = Above[-1]; - - Left[0] = (*(x->base_dst))[x->dst - 1]; - Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride]; - Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride]; - Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride]; - -#if CONFIG_NEWBINTRAMODES - if (b_mode == B_CONTEXT_PRED) - b_mode = x->bmi.as_mode.context; -#endif - - switch (b_mode) { - case B_DC_PRED: { - int expected_dc = 0; - - for (i = 0; i < 4; i++) { - expected_dc += Above[i]; - expected_dc += Left[i]; - } - - expected_dc = (expected_dc + 4) >> 3; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - predictor[c] = expected_dc; - } - - predictor += 16; - } - } - break; - case B_TM_PRED: { - /* prediction similar to true_motion prediction */ - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int pred = Above[c] - top_left + Left[r]; - - if (pred < 0) - pred = 0; - - if (pred > 255) - pred = 255; - - predictor[c] = pred; - } - - predictor += 16; - } - } - break; - - case B_VE_PRED: { - - unsigned int ap[4]; - ap[0] = Above[0]; - ap[1] = Above[1]; - ap[2] = Above[2]; - ap[3] = Above[3]; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - - predictor[c] = ap[c]; - } - - predictor += 16; - } - - } - break; - - - case B_HE_PRED: { - - unsigned int lp[4]; - lp[0] = Left[0]; - lp[1] = Left[1]; - lp[2] = Left[2]; - lp[3] = Left[3]; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - predictor[c] = lp[r]; - } - - predictor += 16; - } - } - break; - case B_LD_PRED: { - unsigned char *ptr = Above; - predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; - predictor[0 * 16 + 1] = - predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; - predictor[0 * 16 + 2] = - predictor[1 * 16 + 1] = - predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; - predictor[0 * 16 + 3] = - predictor[1 * 16 + 2] = - predictor[2 * 16 + 1] = - predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; - predictor[1 * 16 + 3] = - predictor[2 * 16 + 2] = - predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; - predictor[2 * 16 + 3] = - predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; - predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; - - } - break; - case B_RD_PRED: { - - unsigned char pp[9]; - - pp[0] = Left[3]; - pp[1] = Left[2]; - pp[2] = Left[1]; - pp[3] = Left[0]; - pp[4] = top_left; - pp[5] = Above[0]; - pp[6] = Above[1]; - pp[7] = Above[2]; - pp[8] = Above[3]; - - predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[3 * 16 + 1] = - predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[3 * 16 + 2] = - predictor[2 * 16 + 1] = - predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[3 * 16 + 3] = - predictor[2 * 16 + 2] = - predictor[1 * 16 + 1] = - predictor[0 
* 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * 16 + 3] = - predictor[1 * 16 + 2] = - predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[1 * 16 + 3] = - predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; - - } - break; - case B_VR_PRED: { - - unsigned char pp[9]; - - pp[0] = Left[3]; - pp[1] = Left[2]; - pp[2] = Left[1]; - pp[3] = Left[0]; - pp[4] = top_left; - pp[5] = Above[0]; - pp[6] = Above[1]; - pp[7] = Above[2]; - pp[8] = Above[3]; - - - predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[3 * 16 + 1] = - predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * 16 + 1] = - predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1; - predictor[3 * 16 + 2] = - predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[2 * 16 + 2] = - predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1; - predictor[3 * 16 + 3] = - predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - predictor[2 * 16 + 3] = - predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1; - predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; - predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1; - - } - break; - case B_VL_PRED: { - - unsigned char *pp = Above; - - predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[2 * 16 + 0] = - predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1; - predictor[1 * 16 + 1] = - predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 1] = - predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1; - predictor[3 * 16 + 1] = - predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[0 * 16 + 3] = - predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1; - predictor[1 * 16 + 3] = - predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - } - break; - - case B_HD_PRED: { - unsigned char pp[9]; - pp[0] = Left[3]; - pp[1] = Left[2]; - pp[2] = Left[1]; - pp[3] = Left[0]; - pp[4] = top_left; - pp[5] = Above[0]; - pp[6] = Above[1]; - pp[7] = Above[2]; - pp[8] = Above[3]; - - - predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[2 * 16 + 0] = - predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1; - predictor[2 * 16 + 1] = - predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 2] = - predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; - predictor[2 * 16 + 3] = - predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; - predictor[1 * 16 + 2] = - predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1; - predictor[1 * 16 + 3] = - predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; - predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; - predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; - } - break; - - - case B_HU_PRED: { - unsigned char *pp = Left; - predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; - predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; - predictor[0 * 16 + 2] = - predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1; - predictor[0 * 16 + 3] = - predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; - predictor[1 * 16 + 2] = - predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; - predictor[1 * 
16 + 3] = - predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; - predictor[2 * 16 + 2] = - predictor[2 * 16 + 3] = - predictor[3 * 16 + 0] = - predictor[3 * 16 + 1] = - predictor[3 * 16 + 2] = - predictor[3 * 16 + 3] = pp[3]; - } - break; - -#if CONFIG_NEWBINTRAMODES - case B_CONTEXT_PRED: - break; - /* - case B_CORNER_PRED: - corner_predictor(predictor, 16, 4, Above, Left); - break; - */ -#endif - } -} - -#if CONFIG_COMP_INTRA_PRED -void vp9_comp_intra4x4_predict_c(BLOCKD *x, - int b_mode, int b_mode2, - unsigned char *out_predictor) { - unsigned char predictor[2][4 * 16]; - int i, j; - - vp9_intra4x4_predict(x, b_mode, predictor[0]); - vp9_intra4x4_predict(x, b_mode2, predictor[1]); - - for (i = 0; i < 16 * 4; i += 16) { - for (j = i; j < i + 4; j++) { - out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1; - } - } -} -#endif - -/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and - * to the right prediction have filled in pixels to use. - */ -void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) { - int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2); - unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst - - xd->block[0].dst_stride + 16; - unsigned int *src_ptr = (unsigned int *) - (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0)); - - unsigned int *dst_ptr0 = (unsigned int *)above_right; - unsigned int *dst_ptr1 = - (unsigned int *)(above_right + 4 * xd->block[0].dst_stride); - unsigned int *dst_ptr2 = - (unsigned int *)(above_right + 8 * xd->block[0].dst_stride); - unsigned int *dst_ptr3 = - (unsigned int *)(above_right + 12 * xd->block[0].dst_stride); - - if (extend_edge) { - *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U; - } - - *dst_ptr0 = *src_ptr; - *dst_ptr1 = *src_ptr; - *dst_ptr2 = *src_ptr; - *dst_ptr3 = *src_ptr; -} diff --git a/vp9/common/reconintra4x4.h b/vp9/common/reconintra4x4.h deleted file mode 100644 index 79a048076..000000000 --- a/vp9/common/reconintra4x4.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef __INC_RECONINTRA4x4_H -#define __INC_RECONINTRA4x4_H - -extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd); - -#endif diff --git a/vp9/common/rtcd.c b/vp9/common/rtcd.c deleted file mode 100644 index 277d5b217..000000000 --- a/vp9/common/rtcd.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
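All of the diagonal 4x4 modes above (B_LD_PRED through B_HU_PRED) are assembled from just two primitives, which the long runs of predictor[...] assignments obscure. As a sketch, with avg2/avg3 hypothetical names for the two recurring expressions:

    /* Sketch only: the rounded 2-tap mean and the 1-2-1 smoothing
     * filter behind every diagonal 4x4 mode above. */
    static unsigned char avg2(unsigned char a, unsigned char b) {
      return (unsigned char)((a + b + 1) >> 1);
    }
    static unsigned char avg3(unsigned char a, unsigned char b,
                              unsigned char c) {
      return (unsigned char)((a + 2 * b + c + 2) >> 2);
    }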
- */ -#include "vpx_config.h" -#define RTCD_C -#include "vp9_rtcd.h" -#include "vpx_ports/vpx_once.h" - -extern void vpx_scale_rtcd(void); - -void vp9_rtcd() -{ - vpx_scale_rtcd(); - once(setup_rtcd_internal); -} diff --git a/vp9/common/rtcd_defs.sh b/vp9/common/rtcd_defs.sh deleted file mode 100644 index ea134a854..000000000 --- a/vp9/common/rtcd_defs.sh +++ /dev/null @@ -1,689 +0,0 @@ -vp9_common_forward_decls() { -cat <segmentation_enabled && - (xd->segment_feature_mask[segment_id] & - (0x01 << feature_id))); -} - -void vp9_clearall_segfeatures(MACROBLOCKD *xd) { - vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); - vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask)); -} - -void vp9_enable_segfeature(MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id) { - xd->segment_feature_mask[segment_id] |= (0x01 << feature_id); -} - -void vp9_disable_segfeature(MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id) { - xd->segment_feature_mask[segment_id] &= ~(1 << feature_id); -} - -int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id) { - return seg_feature_data_bits[feature_id]; -} - -int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { - return (segfeaturedata_signed[feature_id]); -} - -void vp9_clear_segdata(MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id) { - xd->segment_feature_data[segment_id][feature_id] = 0; -} - -void vp9_set_segdata(MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id, - int seg_data) { - xd->segment_feature_data[segment_id][feature_id] = seg_data; -} - -int vp9_get_segdata(const MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id) { - return xd->segment_feature_data[segment_id][feature_id]; -} - -void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) { - xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0; -} - -void vp9_set_segref(MACROBLOCKD *xd, - int segment_id, - MV_REFERENCE_FRAME ref_frame) { - xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |= - (1 << ref_frame); -} - -int vp9_check_segref(const MACROBLOCKD *xd, - int segment_id, - MV_REFERENCE_FRAME ref_frame) { - return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] & - (1 << ref_frame)) ? 1 : 0; -} - -int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) { - return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] & - ~(1 << INTRA_FRAME)) ? 1 : 0; -} - -int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) { - if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM)) - return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM); - else - return TX_4X4; -} -// TBD? Functions to read and write segment data with range / validity checking diff --git a/vp9/common/seg_common.h b/vp9/common/seg_common.h deleted file mode 100644 index 58edf1612..000000000 --- a/vp9/common/seg_common.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
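The seg_common.c helpers above are thin wrappers around a per-segment feature bitmask plus a data table. A usage sketch follows; the identifiers are the ones declared above, but the segment id and the flow are made up for illustration:

    /* Sketch only: enable a per-segment transform-size override, store
     * its data, then query it the way vp9_get_seg_tx_type() does. */
    static void example_segfeature_use(MACROBLOCKD *xd) {
      const int segment_id = 1;
      vp9_enable_segfeature(xd, segment_id, SEG_LVL_TRANSFORM);
      vp9_set_segdata(xd, segment_id, SEG_LVL_TRANSFORM, TX_4X4);
      if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM)) {
        int tx_size = vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM);
        (void)tx_size;  /* TX_4X4 here */
      }
    }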
- */ - -#include "type_aliases.h" -#include "onyxc_int.h" -#include "vp9/common/blockd.h" - -#ifndef __INC_SEG_COMMON_H__ -#define __INC_SEG_COMMON_H__ 1 - -int vp9_segfeature_active(const MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id); - -void vp9_clearall_segfeatures(MACROBLOCKD *xd); - -void vp9_enable_segfeature(MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id); - -void vp9_disable_segfeature(MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id); - -int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id); - -int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id); - -void vp9_clear_segdata(MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id); - -void vp9_set_segdata(MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id, - int seg_data); - -int vp9_get_segdata(const MACROBLOCKD *xd, - int segment_id, - SEG_LVL_FEATURES feature_id); - -void vp9_clear_segref(MACROBLOCKD *xd, int segment_id); - -void vp9_set_segref(MACROBLOCKD *xd, - int segment_id, - MV_REFERENCE_FRAME ref_frame); - -int vp9_check_segref(const MACROBLOCKD *xd, - int segment_id, - MV_REFERENCE_FRAME ref_frame); - -int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id); - -int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id); - -#endif /* __INC_SEG_COMMON_H__ */ - diff --git a/vp9/common/setupintrarecon.c b/vp9/common/setupintrarecon.c deleted file mode 100644 index 9cb037228..000000000 --- a/vp9/common/setupintrarecon.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "setupintrarecon.h" -#include "vpx_mem/vpx_mem.h" - -void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) { - int i; - - /* set up frame new frame for intra coded blocks */ - vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5); - for (i = 0; i < ybf->y_height; i++) - ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129; - - vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); - for (i = 0; i < ybf->uv_height; i++) - ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129; - - vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); - for (i = 0; i < ybf->uv_height; i++) - ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129; - -} diff --git a/vp9/common/setupintrarecon.h b/vp9/common/setupintrarecon.h deleted file mode 100644 index 1a55d0ad6..000000000 --- a/vp9/common/setupintrarecon.h +++ /dev/null @@ -1,13 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
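vp9_setup_intra_recon() above encodes the usual edge convention: off-frame "above" samples read 127 and off-frame "left" samples read 129, so intra prediction at frame edges needs no special-casing. Per plane it reduces to the following sketch (init_plane_border is a hypothetical name):

    #include <string.h>

    /* Sketch only: per-plane border init.  The memset covers the
     * top-left corner, the row above, and 4 above-right samples
     * (hence width + 5); the loop fills the column to the left. */
    static void init_plane_border(unsigned char *buf, int stride,
                                  int width, int height) {
      int i;
      memset(buf - 1 - stride, 127, width + 5);
      for (i = 0; i < height; i++)
        buf[i * stride - 1] = 129;
    }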
- */ - - -#include "vpx_scale/yv12config.h" -extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf); diff --git a/vp9/common/subpelvar.h b/vp9/common/subpelvar.h deleted file mode 100644 index 83cc2a7ce..000000000 --- a/vp9/common/subpelvar.h +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vp9/common/filter.h" - - - -static void variance(const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum) { - int i, j; - int diff; - - *sum = 0; - *sse = 0; - - for (i = 0; i < h; i++) { - for (j = 0; j < w; j++) { - diff = src_ptr[j] - ref_ptr[j]; - *sum += diff; - *sse += diff * diff; - } - - src_ptr += source_stride; - ref_ptr += recon_stride; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : UINT8 *src_ptr : Pointer to source block. - * UINT32 src_pixels_per_line : Stride of input block. - * UINT32 pixel_step : Offset between filter input samples (see notes). - * UINT32 output_height : Input block height. - * UINT32 output_width : Input block width. - * INT32 *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : INT32 *output_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement first-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Produces INT32 output to retain precision for next pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ -static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply bilinear filter - output_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[pixel_step] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; - src_ptr++; - } - - // Next row... - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : INT32 *src_ptr : Pointer to source block. - * UINT32 src_pixels_per_line : Stride of input block. - * UINT32 pixel_step : Offset between filter input samples (see notes). - * UINT32 output_height : Input block height. - * UINT32 output_width : Input block width. - * INT32 *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : UINT16 *output_ptr : Pointer to filtered block. 
- * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block in - * either horizontal or vertical direction to produce the - * filtered output block. Used to implement second-pass - * of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * pixel_step defines whether the filter is applied - * horizontally (pixel_step=1) or vertically (pixel_step=stride). - * It defines the offset required to move from one input - * to the next. - * - ****************************************************************************/ -static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr, - unsigned char *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter) { - unsigned int i, j; - int Temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply filter - Temp = ((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[pixel_step] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2); - output_ptr[j] = (unsigned int)(Temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - // Next row... - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - diff --git a/vp9/common/subpixel.h b/vp9/common/subpixel.h deleted file mode 100644 index 2b8429198..000000000 --- a/vp9/common/subpixel.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef SUBPIXEL_H -#define SUBPIXEL_H - -#define prototype_subpixel_predict(sym) \ - void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \ - unsigned char *dst, int dst_pitch) - -typedef prototype_subpixel_predict((*vp9_subpix_fn_t)); - -#endif diff --git a/vp9/common/swapyv12buffer.c b/vp9/common/swapyv12buffer.c deleted file mode 100644 index 13fc8d4bc..000000000 --- a/vp9/common/swapyv12buffer.c +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
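The two passes above are meant to be chained: the first filters horizontally into a 16-bit intermediate buffer with one extra row, and the second filters that buffer vertically back down to 8 bits, after which variance() compares the result with a reference block. A sketch of the composition for a 16x16 block, assuming a two-tap table bilinear_taps[offset][2] (a hypothetical name) whose rows sum to VP9_FILTER_WEIGHT:

/* Sketch only: 16x16 sub-pixel variance built from the helpers above. */
static unsigned int subpel_variance16x16(const unsigned char *src,
                                         int src_stride,
                                         int xoffset, int yoffset,
                                         const unsigned char *ref,
                                         int ref_stride,
                                         unsigned int *sse) {
  unsigned short fdata[17 * 16];  /* 17 rows: one extra for the second pass */
  unsigned char temp[16 * 16];
  int sum;

  /* Horizontal pass: pixel_step 1, 17 rows out. */
  var_filter_block2d_bil_first_pass(src, fdata, src_stride, 1, 17, 16,
                                    bilinear_taps[xoffset]);
  /* Vertical pass: pixel_step 16 steps one row at a time. */
  var_filter_block2d_bil_second_pass(fdata, temp, 16, 16, 16, 16,
                                     bilinear_taps[yoffset]);

  variance(temp, 16, ref, ref_stride, 16, 16, sse, &sum);
  return *sse - (unsigned int)(((long long)sum * sum) >> 8);
}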
- */ - -#include "swapyv12buffer.h" - -void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, - YV12_BUFFER_CONFIG *last_frame) { - unsigned char *temp; - - temp = last_frame->buffer_alloc; - last_frame->buffer_alloc = new_frame->buffer_alloc; - new_frame->buffer_alloc = temp; - - temp = last_frame->y_buffer; - last_frame->y_buffer = new_frame->y_buffer; - new_frame->y_buffer = temp; - - temp = last_frame->u_buffer; - last_frame->u_buffer = new_frame->u_buffer; - new_frame->u_buffer = temp; - - temp = last_frame->v_buffer; - last_frame->v_buffer = new_frame->v_buffer; - new_frame->v_buffer = temp; -} diff --git a/vp9/common/swapyv12buffer.h b/vp9/common/swapyv12buffer.h deleted file mode 100644 index 44ed5e84d..000000000 --- a/vp9/common/swapyv12buffer.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef __SWAPYV12_BUFFER_H -#define __SWAPYV12_BUFFER_H - -#include "vpx_scale/yv12config.h" - -void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, - YV12_BUFFER_CONFIG *last_frame); - -#endif // __SWAPYV12_BUFFER_H diff --git a/vp9/common/systemdependent.h b/vp9/common/systemdependent.h deleted file mode 100644 index 5d778bcd0..000000000 --- a/vp9/common/systemdependent.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#if ARCH_X86 || ARCH_X86_64 -void vpx_reset_mmx_state(void); -#define vp9_clear_system_state() vpx_reset_mmx_state() -#else -#define vp9_clear_system_state() -#endif - -struct VP9Common; -void vp9_machine_specific_config(struct VP9Common *); diff --git a/vp9/common/tapify.py b/vp9/common/tapify.py deleted file mode 100644 index 99529cff0..000000000 --- a/vp9/common/tapify.py +++ /dev/null @@ -1,106 +0,0 @@ -""" - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
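The swap is the point of the file: promoting the just-decoded frame to "last frame" exchanges six pointers instead of copying pixels, so the cost is constant at any resolution. A sketch of a typical call site, paired with the system-state macro declared above:

/* Sketch: end-of-frame housekeeping in a decode loop. */
void example_end_of_frame(YV12_BUFFER_CONFIG *new_frame,
                          YV12_BUFFER_CONFIG *last_frame) {
  vp9_swap_yv12_buffer(new_frame, last_frame);

  /* On x86 targets this resets the MMX/x87 state before any further
     floating-point math; elsewhere the macro expands to nothing. */
  vp9_clear_system_state();
}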
-""" -#!/usr/bin/env python -import sys,string,os,re,math,numpy -scale = 2**16 -def dist(p1,p2): - x1,y1 = p1 - x2,y2 = p2 - if x1==x2 and y1==y2 : - return 1.0 - return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2)) - -def gettaps(p): - def l(b): - return int(math.floor(b)) - def h(b): - return int(math.ceil(b)) - def t(b,p,s): - return int((scale*dist(b,p)+s/2)/s) - r,c = p - ul=[l(r),l(c)] - ur=[l(r),h(c)] - ll=[h(r),l(c)] - lr=[h(r),h(c)] - sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p) - t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum); - return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)], - [ll,t(ll,p,sum)],[lr,t4]] - -def print_mb_taps(angle,blocksize): - theta = angle / 57.2957795; - affine = [[math.cos(theta),-math.sin(theta)], - [math.sin(theta),math.cos(theta)]] - radius = (float(blocksize)-1)/2 - print " // angle of",angle,"degrees" - for y in range(blocksize) : - for x in range(blocksize) : - r,c = numpy.dot(affine,[y-radius, x-radius]) - tps = gettaps([r+radius,c+radius]) - for t in tps : - p,t = t - tr,tc = p - print " %2d, %2d, %5d, " % (tr,tc,t,), - print " // %2d,%2d " % (y,x) - -i=float(sys.argv[1]) -while i <= float(sys.argv[2]) : - print_mb_taps(i,float(sys.argv[4])) - i=i+float(sys.argv[3]) -""" - -taps = [] -pt=dict() -ptr=dict() -for y in range(16) : - for x in range(16) : - r,c = numpy.dot(affine,[y-7.5, x-7.5]) - tps = gettaps([r+7.5,c+7.5]) - j=0 - for tp in tps : - p,i = tp - r,c = p - pt[y,x,j]= [p,i] - try: - ptr[r,j,c].append([y,x]) - except: - ptr[r,j,c]=[[y,x]] - j = j+1 - -for key in sorted(pt.keys()) : - print key,pt[key] - -lr = -99 -lj = -99 -lc = 0 - -shuf="" -mask="" -for r,j,c in sorted(ptr.keys()) : - for y,x in ptr[r,j,c] : - if lr != r or lj != j : - print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc - shuf="" - lc = 0 - for i in range(lc,c-1) : - shuf = shuf +"0" - shuf = shuf + hex(x)[2] - lc =c - break - lr = r - lj = j -# print r,j,c,ptr[r,j,c] -# print - -for r,j,c in sorted(ptr.keys()) : - for y,x in ptr[r,j,c] : - print r,j,c,y,x - break -""" diff --git a/vp9/common/textblit.c b/vp9/common/textblit.c deleted file mode 100644 index 601d298dc..000000000 --- a/vp9/common/textblit.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */
-
-#include <stdlib.h>
-
-#include "vp9/common/textblit.h"
-
-void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) {
-  int letter_bitmap;
-  unsigned char *output_pos = address;
-  int colpos;
-  const int font[] = {
-    0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000,
-    0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110,
-    0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA,
-    0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20,
-    0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF,
-    0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F,
-    0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2,
-    0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731,
-    0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820
-  };
-  colpos = 0;
-
-  while (msg[colpos] != 0) {
-    char letter = msg[colpos];
-    int fontcol, fontrow;
-
-    if (letter <= 'Z' && letter >= ' ')
-      letter_bitmap = font[letter - ' '];
-    else if (letter <= 'z' && letter >= 'a')
-      letter_bitmap = font[letter - 'a' + 'A' - ' '];
-    else
-      letter_bitmap = font[0];
-
-    for (fontcol = 6; fontcol >= 0; fontcol--)
-      for (fontrow = 0; fontrow < 5; fontrow++)
-        output_pos[fontrow * pitch + fontcol] =
-          ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0);
-
-    output_pos += 7;
-    colpos++;
-  }
-}
-
-static void plot(const int x, const int y, unsigned char *image, const int pitch) {
-  image [x + y * pitch] ^= 255;
-}
-
-/* Bresenham line algorithm */
-void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) {
-  int steep = abs(y1 - y0) > abs(x1 - x0);
-  int deltax, deltay;
-  int error, ystep, y, x;
-
-  if (steep) {
-    int t;
-    t = x0;
-    x0 = y0;
-    y0 = t;
-
-    t = x1;
-    x1 = y1;
-    y1 = t;
-  }
-
-  if (x0 > x1) {
-    int t;
-    t = x0;
-    x0 = x1;
-    x1 = t;
-
-    t = y0;
-    y0 = y1;
-    y1 = t;
-  }
-
-  deltax = x1 - x0;
-  deltay = abs(y1 - y0);
-  error = deltax / 2;
-
-  y = y0;
-
-  if (y0 < y1)
-    ystep = 1;
-  else
-    ystep = -1;
-
-  if (steep) {
-    for (x = x0; x <= x1; x++) {
-      plot(y, x, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  } else {
-    for (x = x0; x <= x1; x++) {
-      plot(x, y, image, pitch);
-
-      error = error - deltay;
-      if (error < 0) {
-        y = y + ystep;
-        error = error + deltax;
-      }
-    }
-  }
-}
diff --git a/vp9/common/textblit.h b/vp9/common/textblit.h
deleted file mode 100644
index 39edbb09d..000000000
--- a/vp9/common/textblit.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __INC_TEXTBLIT_H
-#define __INC_TEXTBLIT_H
-
-extern void vp9_blit_text(const char *msg, unsigned char *address,
-                          const int pitch);
-extern void vp9_blit_line(int x0, int x1, int y0, int y1,
-                          unsigned char *image, const int pitch);
-
-#endif  // __INC_TEXTBLIT_H
diff --git a/vp9/common/treecoder.c b/vp9/common/treecoder.c
deleted file mode 100644
index 39629406e..000000000
--- a/vp9/common/treecoder.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
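Both entry points write straight into a frame plane, so the usual application is stamping debug annotations onto a reconstructed frame before it is dumped. A sketch (the function and its coordinates are illustrative, not part of this change):

#include <stdio.h>  /* for snprintf in this sketch */

/* Sketch: stamp a label and a diagonal through macroblock (mb_row, mb_col)
   of a reconstructed frame. */
void example_annotate(YV12_BUFFER_CONFIG *frame, int mb_row, int mb_col) {
  char msg[32];
  snprintf(msg, sizeof(msg), "MB %d %d", mb_row, mb_col);

  /* Glyphs are 7 pixels wide, 5 high, drawn at the buffer origin. */
  vp9_blit_text(msg, frame->y_buffer, frame->y_stride);

  /* Arguments are (x0, x1, y0, y1): XOR a line across the macroblock. */
  vp9_blit_line(mb_col * 16, mb_col * 16 + 15,
                mb_row * 16, mb_row * 16 + 15,
                frame->y_buffer, frame->y_stride);
}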
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-
-#if defined(CONFIG_DEBUG) && CONFIG_DEBUG
-#include <assert.h>
-#endif
-#include <stdio.h>
-
-#include "treecoder.h"
-
-static void tree2tok(
-  struct vp9_token_struct *const p,
-  vp9_tree t,
-  int i,
-  int v,
-  int L
-) {
-  v += v;
-  ++L;
-
-  do {
-    const vp9_tree_index j = t[i++];
-
-    if (j <= 0) {
-      p[-j].value = v;
-      p[-j].Len = L;
-    } else
-      tree2tok(p, t, j, v, L);
-  } while (++v & 1);
-}
-
-void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) {
-  tree2tok(p, t, 0, 0, 0);
-}
-
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t,
-                                 int offset) {
-  tree2tok(p - offset, t, 0, 0, 0);
-}
-
-static void branch_counts(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok [ /* n */ ],
-  vp9_tree tree,
-  unsigned int branch_ct [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ]
-) {
-  const int tree_len = n - 1;
-  int t = 0;
-
-#if CONFIG_DEBUG
-  assert(tree_len);
-#endif
-
-  do {
-    branch_ct[t][0] = branch_ct[t][1] = 0;
-  } while (++t < tree_len);
-
-  t = 0;
-
-  do {
-    int L = tok[t].Len;
-    const int enc = tok[t].value;
-    const unsigned int ct = num_events[t];
-
-    vp9_tree_index i = 0;
-
-    do {
-      const int b = (enc >> --L) & 1;
-      const int j = i >> 1;
-#if CONFIG_DEBUG
-      assert(j < tree_len && 0 <= L);
-#endif
-
-      branch_ct [j] [b] += ct;
-      i = tree[ i + b];
-    } while (i > 0);
-
-#if CONFIG_DEBUG
-    assert(!L);
-#endif
-  } while (++t < n);
-
-}
-
-
-void vp9_tree_probs_from_distribution(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok [ /* n */ ],
-  vp9_tree tree,
-  vp9_prob probs [ /* n-1 */ ],
-  unsigned int branch_ct [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ],
-  unsigned int Pfac,
-  int rd
-) {
-  const int tree_len = n - 1;
-  int t = 0;
-
-  branch_counts(n, tok, tree, branch_ct, num_events);
-
-  do {
-    const unsigned int *const c = branch_ct[t];
-    const unsigned int tot = c[0] + c[1];
-
-#if CONFIG_DEBUG
-    assert(tot < (1 << 24));        /* no overflow below */
-#endif
-
-    if (tot) {
-      const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot;
-      probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */
-    } else
-      probs[t] = vp9_prob_half;
-  } while (++t < tree_len);
}
-
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) {
-  int tot_count = counts[0] + counts[1];
-  vp9_prob prob;
-  if (tot_count) {
-    prob = (counts[0] * 255 + (tot_count >> 1)) / tot_count;
-    prob += !prob;
-  } else {
-    prob = 128;
-  }
-  return prob;
-}
diff --git a/vp9/common/treecoder.h b/vp9/common/treecoder.h
deleted file mode 100644
index 92b92ef55..000000000
--- a/vp9/common/treecoder.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
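The per-node arithmetic in vp9_tree_probs_from_distribution is easiest to verify with numbers: for a node whose branches were taken c[0] = 30 and c[1] = 10 times, Pfac = 256 and rd = 1 give p = (30 * 256 + 20) / 40 = 192, i.e. a probability of 192/256 = 0.75 for branch 0, matching the empirical 30/40. The clipping then keeps the result in [1, 255] so neither branch becomes impossible. The same rounding in isolation:

/* Worked example of the per-node probability computation above. */
static vp9_prob node_prob(unsigned int c0, unsigned int c1,
                          unsigned int Pfac, int rd) {
  const unsigned int tot = c0 + c1;
  unsigned int p;

  if (!tot)
    return vp9_prob_half;                /* no data: stay at 128 */

  p = (c0 * Pfac + (rd ? tot >> 1 : 0)) / tot;
  return p < 256 ? (p ? p : 1) : 255;    /* clip into [1, 255] */
}
/* node_prob(30, 10, 256, 1) == 192, i.e. P(branch 0) = 0.75. */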
- */
-
-
-#ifndef __INC_TREECODER_H
-#define __INC_TREECODER_H
-
-typedef unsigned char vp9_prob;
-
-#define vp9_prob_half ( (vp9_prob) 128)
-
-typedef signed char vp9_tree_index;
-struct bool_coder_spec;
-
-typedef struct bool_coder_spec bool_coder_spec;
-typedef struct bool_writer bool_writer;
-typedef struct bool_reader bool_reader;
-
-typedef const bool_coder_spec c_bool_coder_spec;
-typedef const bool_writer c_bool_writer;
-typedef const bool_reader c_bool_reader;
-
-
-
-# define vp9_complement( x) (255 - x)
-
-
-/* We build coding trees compactly in arrays.
-   Each node of the tree is a pair of vp9_tree_indices.
-   Array index often references a corresponding probability table.
-   Index <= 0 means done encoding/decoding and value = -Index,
-   Index > 0 means need another bit, specification at index.
-   Nonnegative indices are always even;  processing begins at node 0. */
-
-typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
-
-
-typedef const struct vp9_token_struct {
-  int value;
-  int Len;
-} vp9_token;
-
-/* Construct encoding array from tree. */
-
-void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
-void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
-                                 int offset);
-
-
-/* Convert array of token occurrence counts into a table of probabilities
-   for the associated binary encoding tree.  Also writes count of branches
-   taken for each node on the tree; this facilitates decisions as to
-   probability updates. */
-
-void vp9_tree_probs_from_distribution(
-  int n,                      /* n = size of alphabet */
-  vp9_token tok [ /* n */ ],
-  vp9_tree tree,
-  vp9_prob probs [ /* n-1 */ ],
-  unsigned int branch_ct [ /* n-1 */ ] [2],
-  const unsigned int num_events[ /* n */ ],
-  unsigned int Pfactor,
-  int Round
-);
-
-static __inline int clip_prob(int p) {
-  if (p > 255)
-    return 255;
-  else if (p < 1)
-    return 1;
-  return p;
-}
-
-vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
-
-#endif
diff --git a/vp9/common/type_aliases.h b/vp9/common/type_aliases.h
deleted file mode 100644
index eda6a2ec2..000000000
--- a/vp9/common/type_aliases.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
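The layout comment above fully determines how a tree is written down. For instance, a three-symbol alphabet coded as {0 -> "0", 1 -> "10", 2 -> "11"} needs two node pairs; a non-positive entry is a leaf holding the negated symbol value (so symbol 0's leaf is just 0), and a positive entry is the index of the next pair:

/* 3-symbol tree: bit 0 at the root ends at symbol 0; bit 1 moves to
   the node pair at index 2, where the next bit picks symbol 1 or 2. */
static const vp9_tree_index example_tree[4] = {
  0, 2,     /* node 0: leaf(0), go to index 2 */
  -1, -2,   /* node 2: leaf(1), leaf(2)       */
};

/* vp9_tokens_from_tree(tok, example_tree) then fills
   tok[0] = {0, 1}, tok[1] = {2, 2}, tok[2] = {3, 2}:
   the codewords 0b0, 0b10, 0b11 with lengths 1, 2, 2. */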
- */ - - -/**************************************************************************** -* -* Module Title : type_aliases.h -* -* Description : Standard type aliases -* -****************************************************************************/ -#ifndef __INC_TYPE_ALIASES_H -#define __INC_TYPE_ALIASES_H - -/**************************************************************************** -* Macros -****************************************************************************/ -#define EXPORT -#define IMPORT extern /* Used to declare imported data & routines */ -#define PRIVATE static /* Used to declare & define module-local data */ -#define LOCAL static /* Used to define all persistent routine-local data */ -#define STD_IN_PATH 0 /* Standard input path */ -#define STD_OUT_PATH 1 /* Standard output path */ -#define STD_ERR_PATH 2 /* Standard error path */ -#define STD_IN_FILE stdin /* Standard input file pointer */ -#define STD_OUT_FILE stdout /* Standard output file pointer */ -#define STD_ERR_FILE stderr /* Standard error file pointer */ -#define max_int 0x7FFFFFFF - -#define __export -#define _export - -#define CCONV - -#ifndef NULL -#ifdef __cplusplus -#define NULL 0 -#else -#define NULL ((void *)0) -#endif -#endif - -#ifndef FALSE -#define FALSE 0 -#endif - -#ifndef TRUE -#define TRUE 1 -#endif - -/**************************************************************************** -* Typedefs -****************************************************************************/ -#ifndef TYPE_INT8 -#define TYPE_INT8 -typedef signed char INT8; -#endif - -#ifndef TYPE_INT16 -/*#define TYPE_INT16*/ -typedef signed short INT16; -#endif - -#ifndef TYPE_INT32 -/*#define TYPE_INT32*/ -typedef signed int INT32; -#endif - -#ifndef TYPE_UINT8 -/*#define TYPE_UINT8*/ -typedef unsigned char UINT8; -#endif - -#ifndef TYPE_UINT32 -/*#define TYPE_UINT32*/ -typedef unsigned int UINT32; -#endif - -#ifndef TYPE_UINT16 -/*#define TYPE_UINT16*/ -typedef unsigned short UINT16; -#endif - -#ifndef TYPE_BOOL -/*#define TYPE_BOOL*/ -typedef int BOOL; -#endif - -typedef unsigned char BOOLEAN; - -#ifdef _MSC_VER -typedef __int64 INT64; -#ifndef INT64_MAX -#define INT64_MAX LLONG_MAX -#endif -#else - -#ifndef TYPE_INT64 -#ifdef _TMS320C6X -/* for now we only have 40bits */ -typedef long INT64; -#else -typedef long long INT64; -#endif -#endif - -#endif - -/* Floating point */ -typedef double FLOAT64; -typedef float FLOAT32; - -#endif diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c new file mode 100644 index 000000000..04a8b368a --- /dev/null +++ b/vp9/common/vp9_alloccommon.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vpx_ports/config.h" +#include "vp9_blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9_onyxc_int.h" +#include "vp9_findnearmv.h" +#include "vp9_entropymode.h" +#include "vp9_entropymv.h" +#include "vp9_systemdependent.h" + + +void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base) { + int stride = cpi->mode_info_stride; + int i; + + // Clear down top border row + vpx_memset(mi_base, 0, sizeof(MODE_INFO) * cpi->mode_info_stride); + + // Clear left border column + for (i = 1; i < cpi->mb_rows + 1; i++) { + vpx_memset(&mi_base[i * stride], 0, sizeof(MODE_INFO)); + } +} + +void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi) { + int i, j; + + // For each in image mode_info element set the in image flag to 1 + for (i = 0; i < cpi->mb_rows; i++) { + for (j = 0; j < cpi->mb_cols; j++) { + mi->mbmi.mb_in_image = 1; + mi++; // Next element in the row + } + + mi++; // Step over border element at start of next row + } +} + +void vp9_de_alloc_frame_buffers(VP9_COMMON *oci) { + int i; + + for (i = 0; i < NUM_YV12_BUFFERS; i++) + vp8_yv12_de_alloc_frame_buffer(&oci->yv12_fb[i]); + + vp8_yv12_de_alloc_frame_buffer(&oci->temp_scale_frame); + vp8_yv12_de_alloc_frame_buffer(&oci->post_proc_buffer); + + vpx_free(oci->above_context); + vpx_free(oci->mip); + vpx_free(oci->prev_mip); + + oci->above_context = 0; + oci->mip = 0; + oci->prev_mip = 0; + +} + +int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { + int i; + + vp9_de_alloc_frame_buffers(oci); + + /* our internal buffers are always multiples of 16 */ + if ((width & 0xf) != 0) + width += 16 - (width & 0xf); + + if ((height & 0xf) != 0) + height += 16 - (height & 0xf); + + + for (i = 0; i < NUM_YV12_BUFFERS; i++) { + oci->fb_idx_ref_cnt[i] = 0; + oci->yv12_fb[i].flags = 0; + if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, + VP9BORDERINPIXELS) < 0) { + vp9_de_alloc_frame_buffers(oci); + return 1; + } + } + + oci->new_fb_idx = 0; + oci->lst_fb_idx = 1; + oci->gld_fb_idx = 2; + oci->alt_fb_idx = 3; + + oci->fb_idx_ref_cnt[0] = 1; + oci->fb_idx_ref_cnt[1] = 1; + oci->fb_idx_ref_cnt[2] = 1; + oci->fb_idx_ref_cnt[3] = 1; + + if (vp8_yv12_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, + VP9BORDERINPIXELS) < 0) { + vp9_de_alloc_frame_buffers(oci); + return 1; + } + + if (vp8_yv12_alloc_frame_buffer(&oci->post_proc_buffer, width, height, + VP9BORDERINPIXELS) < 0) { + vp9_de_alloc_frame_buffers(oci); + return 1; + } + + oci->mb_rows = height >> 4; + oci->mb_cols = width >> 4; + oci->MBs = oci->mb_rows * oci->mb_cols; + oci->mode_info_stride = oci->mb_cols + 1; + oci->mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); + + if (!oci->mip) { + vp9_de_alloc_frame_buffers(oci); + return 1; + } + + oci->mi = oci->mip + oci->mode_info_stride + 1; + + /* allocate memory for last frame MODE_INFO array */ + + oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO)); + + if (!oci->prev_mip) { + vp9_de_alloc_frame_buffers(oci); + return 1; + } + + oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1; + + oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1); + + if (!oci->above_context) { + vp9_de_alloc_frame_buffers(oci); + return 1; + } + + vp9_update_mode_info_border(oci, oci->mip); + vp9_update_mode_info_in_image(oci, oci->mi); + + return 0; +} +void vp9_setup_version(VP9_COMMON *cm) { + if (cm->version & 0x4) { + if (!CONFIG_EXPERIMENTAL) + vpx_internal_error(&cm->error, 
VPX_CODEC_UNSUP_BITSTREAM, + "Bitstream was created by an experimental " + "encoder"); + cm->experimental = 1; + } + + switch (cm->version & 0x3) { + case 0: + cm->no_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 0; + cm->full_pixel = 0; + break; + case 1: + cm->no_lpf = 0; + cm->filter_type = SIMPLE_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + case 2: + case 3: + cm->no_lpf = 1; + cm->filter_type = NORMAL_LOOPFILTER; + cm->use_bilinear_mc_filter = 1; + cm->full_pixel = 0; + break; + // Full pel only code deprecated in experimental code base + // case 3: + // cm->no_lpf = 1; + // cm->filter_type = SIMPLE_LOOPFILTER; + // cm->use_bilinear_mc_filter = 1; + // cm->full_pixel = 1; + // break; + } +} +void vp9_create_common(VP9_COMMON *oci) { + vp9_machine_specific_config(oci); + + vp9_init_mbmode_probs(oci); + + vp9_default_bmode_probs(oci->fc.bmode_prob); + + oci->txfm_mode = ONLY_4X4; + oci->mb_no_coeff_skip = 1; + oci->comp_pred_mode = HYBRID_PREDICTION; + oci->no_lpf = 0; + oci->filter_type = NORMAL_LOOPFILTER; + oci->use_bilinear_mc_filter = 0; + oci->full_pixel = 0; + oci->clr_type = REG_YUV; + oci->clamp_type = RECON_CLAMP_REQUIRED; + + /* Initialise reference frame sign bias structure to defaults */ + vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); + + /* Default disable buffer to buffer copying */ + oci->copy_buffer_to_gf = 0; + oci->copy_buffer_to_arf = 0; + oci->kf_ymode_probs_update = 0; +} + +void vp9_remove_common(VP9_COMMON *oci) { + vp9_de_alloc_frame_buffers(oci); +} + +void vp9_initialize_common() { + vp9_coef_tree_initialize(); + + vp9_entropy_mode_init(); + + vp9_entropy_mv_init(); +} diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h new file mode 100644 index 000000000..df94f421e --- /dev/null +++ b/vp9/common/vp9_alloccommon.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_ALLOCCOMMON_H +#define __INC_ALLOCCOMMON_H + +#include "vp9_onyxc_int.h" + +void vp9_create_common(VP9_COMMON *oci); +void vp9_remove_common(VP9_COMMON *oci); +void vp9_de_alloc_frame_buffers(VP9_COMMON *oci); +int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height); +void vp9_setup_version(VP9_COMMON *oci); + +void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi_base); +void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi); + +#endif diff --git a/vp9/common/vp9_asm_com_offsets.c b/vp9/common/vp9_asm_com_offsets.c new file mode 100644 index 000000000..07d3e333a --- /dev/null +++ b/vp9/common/vp9_asm_com_offsets.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
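The sizing logic in vp9_alloc_frame_buffers is worth a concrete trace. For a 176x144 input both dimensions are already multiples of 16, so mb_rows = 9, mb_cols = 11 and mode_info_stride = 12: each MODE_INFO row carries one extra border element, the array has one extra border row on top, and the visible origin is therefore mip + mode_info_stride + 1. The same arithmetic as a standalone sketch:

/* Sketch: the frame-geometry arithmetic used by vp9_alloc_frame_buffers. */
struct mi_geometry {
  int mb_rows, mb_cols, stride, alloc_elems, origin_offset;
};

static struct mi_geometry mi_geom(int width, int height) {
  struct mi_geometry g;

  /* round up to multiples of 16, as above */
  if (width & 0xf)  width  += 16 - (width & 0xf);
  if (height & 0xf) height += 16 - (height & 0xf);

  g.mb_rows = height >> 4;
  g.mb_cols = width >> 4;
  g.stride = g.mb_cols + 1;                          /* +1 border column */
  g.alloc_elems = (g.mb_cols + 1) * (g.mb_rows + 1); /* +1 border row */
  g.origin_offset = g.stride + 1;                    /* mi = mip + this */
  return g;
}
/* mi_geom(176, 144) -> mb_rows 9, mb_cols 11, stride 12,
   alloc_elems 120, origin_offset 13. */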
+ */ + + +#include "vpx_config.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/asm_offsets.h" +#include "vpx_scale/yv12config.h" + +BEGIN + +/* vpx_scale */ +DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width)); +DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height)); +DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride)); +DEFINE(yv12_buffer_config_uv_width, offsetof(YV12_BUFFER_CONFIG, uv_width)); +DEFINE(yv12_buffer_config_uv_height, offsetof(YV12_BUFFER_CONFIG, uv_height)); +DEFINE(yv12_buffer_config_uv_stride, offsetof(YV12_BUFFER_CONFIG, uv_stride)); +DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_buffer)); +DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer)); +DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer)); +DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border)); +DEFINE(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS); + +END + +/* add asserts for any offset that is not supported by assembly code */ +/* add asserts for any size that is not supported by assembly code */ + +#if HAVE_ARMV7 +/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */ +ct_assert(VP9BORDERINPIXELS_VAL, VP9BORDERINPIXELS == 32) +#endif diff --git a/vp9/common/vp9_blockd.c b/vp9/common/vp9_blockd.c new file mode 100644 index 000000000..d8bb394d4 --- /dev/null +++ b/vp9/common/vp9_blockd.c @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9_blockd.h" +#include "vpx_mem/vpx_mem.h" + + +const unsigned char vp9_block2left[25] = { + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +}; +const unsigned char vp9_block2above[25] = { + 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8 +}; + +const unsigned char vp9_block2left_8x8[25] = { + 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8 +}; +const unsigned char vp9_block2above_8x8[25] = { + 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8 +}; + diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h new file mode 100644 index 000000000..a91e8d4d0 --- /dev/null +++ b/vp9/common/vp9_blockd.h @@ -0,0 +1,568 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
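The DEFINE entries exist so assembly sources can share structure layouts with C without duplicating magic numbers: this file is compiled but never linked, and a build step extracts each named constant from the object file into an include file for the .asm sources. The idea in miniature (the macro and symbol names below are illustrative; the real DEFINE/BEGIN/END come from vpx_ports/asm_offsets.h):

#include <stddef.h>

/* Illustration only: each field offset becomes a named constant that a
   build step can read back out of the compiled object. */
#define MY_DEFINE(sym, val) const int sym = (val);

struct example { int a; char pad[12]; int b; };

MY_DEFINE(example_a_offset, offsetof(struct example, a))   /* 0  */
MY_DEFINE(example_b_offset, offsetof(struct example, b))   /* 16 */

/* Assembly side (conceptually), once the constants are extracted:
     ldr r1, [r0, #example_b_offset]  ; load p->b without hardcoding 16 */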
+ */ + + +#ifndef __INC_BLOCKD_H +#define __INC_BLOCKD_H + +void vpx_log(const char *format, ...); + +#include "vpx_ports/config.h" +#include "vpx_scale/yv12config.h" +#include "vp9_mv.h" +#include "vp9_treecoder.h" +#include "vp9_subpixel.h" +#include "vpx_ports/mem.h" +#include "vp9_common.h" + +#define TRUE 1 +#define FALSE 0 + +// #define MODE_STATS + +/*#define DCPRED 1*/ +#define DCPREDSIMTHRESH 0 +#define DCPREDCNTTHRESH 3 + +#define MB_FEATURE_TREE_PROBS 3 +#define PREDICTION_PROBS 3 + +#define MBSKIP_CONTEXTS 3 + +#define MAX_MB_SEGMENTS 4 + +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 4 + +/* Segment Feature Masks */ +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 +#define MAX_MV_REFS 9 + +typedef struct { + int r, c; +} POS; + +typedef enum PlaneType { + PLANE_TYPE_Y_NO_DC = 0, + PLANE_TYPE_Y2, + PLANE_TYPE_UV, + PLANE_TYPE_Y_WITH_DC, +} PLANE_TYPE; + +typedef char ENTROPY_CONTEXT; +typedef struct { + ENTROPY_CONTEXT y1[4]; + ENTROPY_CONTEXT u[2]; + ENTROPY_CONTEXT v[2]; + ENTROPY_CONTEXT y2; +} ENTROPY_CONTEXT_PLANES; + +extern const unsigned char vp9_block2left[25]; +extern const unsigned char vp9_block2above[25]; +extern const unsigned char vp9_block2left_8x8[25]; +extern const unsigned char vp9_block2above_8x8[25]; + +#define VP9_COMBINEENTROPYCONTEXTS( Dest, A, B) \ + Dest = ((A)!=0) + ((B)!=0); + +typedef enum { + KEY_FRAME = 0, + INTER_FRAME = 1 +} FRAME_TYPE; + +typedef enum +{ + SIXTAP = 0, + BILINEAR = 1, + EIGHTTAP = 2, + EIGHTTAP_SHARP = 3, + SWITCHABLE /* should be the last one */ +} INTERPOLATIONFILTERTYPE; + +typedef enum +{ + DC_PRED, /* average of above and left pixels */ + V_PRED, /* vertical prediction */ + H_PRED, /* horizontal prediction */ + D45_PRED, /* Directional 45 deg prediction [anti-clockwise from 0 deg hor] */ + D135_PRED, /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */ + D117_PRED, /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */ + D153_PRED, /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */ + D27_PRED, /* Directional 22 deg prediction [anti-clockwise from 0 deg hor] */ + D63_PRED, /* Directional 67 deg prediction [anti-clockwise from 0 deg hor] */ + TM_PRED, /* Truemotion prediction */ + I8X8_PRED, /* 8x8 based prediction, each 8x8 has its own prediction mode */ + B_PRED, /* block based prediction, each block has its own prediction mode */ + + NEARESTMV, + NEARMV, + ZEROMV, + NEWMV, + SPLITMV, + + MB_MODE_COUNT +} MB_PREDICTION_MODE; + +// Segment level features. +typedef enum { + SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... + SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... + SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame + SEG_LVL_MODE = 3, // Optional Segment mode + SEG_LVL_EOB = 4, // EOB end stop marker. + SEG_LVL_TRANSFORM = 5, // Block transform size. + SEG_LVL_MAX = 6 // Number of MB level features supported + +} SEG_LVL_FEATURES; + +// Segment level features. 
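The vp9_block2left / vp9_block2above tables defined in vp9_blockd.c map a block index in coding order onto the entropy-context slots declared above: Y blocks 0-15 fold onto four left and four above contexts, blocks 16-23 cover U and V, and block 24 is the Y2 block. Combined with VP9_COMBINEENTROPYCONTEXTS this yields the 0/1/2 context used when coding coefficients. A sketch of the lookup:

/* Sketch: derive the coefficient coding context for block ib (0..24),
   treating each ENTROPY_CONTEXT_PLANES as a flat array of 9 contexts. */
static int example_coeff_context(const ENTROPY_CONTEXT_PLANES *above,
                                 const ENTROPY_CONTEXT_PLANES *left,
                                 int ib) {
  const ENTROPY_CONTEXT *a = (const ENTROPY_CONTEXT *)above;
  const ENTROPY_CONTEXT *l = (const ENTROPY_CONTEXT *)left;
  int ctx;

  /* e.g. ib == 5 (second Y row) reads above slot 1 and left slot 1 */
  VP9_COMBINEENTROPYCONTEXTS(ctx, a[vp9_block2above[ib]],
                             l[vp9_block2left[ib]]);
  return ctx;  /* 0, 1 or 2 */
}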
+typedef enum { + TX_4X4, // 4x4 dct transform + TX_8X8, // 8x8 dct transform + TX_16X16, // 16x16 dct transform + TX_SIZE_MAX // Number of different transforms available +} TX_SIZE; + +typedef enum { + DCT_DCT = 0, // DCT in both horizontal and vertical + ADST_DCT = 1, // ADST in vertical, DCT in horizontal + DCT_ADST = 2, // DCT in vertical, ADST in horizontal + ADST_ADST = 3 // ADST in both directions +} TX_TYPE; + +#define VP9_YMODES (B_PRED + 1) +#define VP9_UV_MODES (TM_PRED + 1) +#define VP9_I8X8_MODES (TM_PRED + 1) +#define VP9_I32X32_MODES (TM_PRED + 1) + +#define VP9_MVREFS (1 + SPLITMV - NEARESTMV) + +#if CONFIG_LOSSLESS +#define WHT_UPSCALE_FACTOR 3 +#define Y2_WHT_UPSCALE_FACTOR 2 +#endif + +typedef enum { + B_DC_PRED, /* average of above and left pixels */ + B_TM_PRED, + + B_VE_PRED, /* vertical prediction */ + B_HE_PRED, /* horizontal prediction */ + + B_LD_PRED, + B_RD_PRED, + + B_VR_PRED, + B_VL_PRED, + B_HD_PRED, + B_HU_PRED, +#if CONFIG_NEWBINTRAMODES + B_CONTEXT_PRED, +#endif + + LEFT4X4, + ABOVE4X4, + ZERO4X4, + NEW4X4, + + B_MODE_COUNT +} B_PREDICTION_MODE; + +#define VP9_BINTRAMODES (LEFT4X4) +#define VP9_SUBMVREFS (1 + NEW4X4 - LEFT4X4) + +#if CONFIG_NEWBINTRAMODES +/* The number of B_PRED intra modes that are replaced by B_CONTEXT_PRED */ +#define CONTEXT_PRED_REPLACEMENTS 0 +#define VP9_KF_BINTRAMODES (VP9_BINTRAMODES - 1) +#define VP9_NKF_BINTRAMODES (VP9_BINTRAMODES - CONTEXT_PRED_REPLACEMENTS) +#else +#define VP9_KF_BINTRAMODES (VP9_BINTRAMODES) /* 10 */ +#define VP9_NKF_BINTRAMODES (VP9_BINTRAMODES) /* 10 */ +#endif + +typedef enum { + PARTITIONING_16X8 = 0, + PARTITIONING_8X16, + PARTITIONING_8X8, + PARTITIONING_4X4, + NB_PARTITIONINGS, +} SPLITMV_PARTITIONING_TYPE; + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. 
+ */
+
+union b_mode_info {
+  struct {
+    B_PREDICTION_MODE first;
+    TX_TYPE tx_type;
+#if CONFIG_COMP_INTRA_PRED
+    B_PREDICTION_MODE second;
+#endif
+#if CONFIG_NEWBINTRAMODES
+    B_PREDICTION_MODE context;
+#endif
+  } as_mode;
+  struct {
+    int_mv first;
+    int_mv second;
+  } as_mv;
+};
+
+typedef enum {
+  NONE = -1,
+  INTRA_FRAME = 0,
+  LAST_FRAME = 1,
+  GOLDEN_FRAME = 2,
+  ALTREF_FRAME = 3,
+  MAX_REF_FRAMES = 4
+} MV_REFERENCE_FRAME;
+
+typedef struct {
+  MB_PREDICTION_MODE mode, uv_mode;
+#if CONFIG_COMP_INTRA_PRED
+  MB_PREDICTION_MODE second_mode, second_uv_mode;
+#endif
+#if CONFIG_COMP_INTERINTRA_PRED
+  MB_PREDICTION_MODE interintra_mode, interintra_uv_mode;
+#endif
+  MV_REFERENCE_FRAME ref_frame, second_ref_frame;
+  TX_SIZE txfm_size;
+  int_mv mv[2]; // for each reference frame used
+  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+  int_mv best_mv, best_second_mv;
+#if CONFIG_NEW_MVREF
+  int best_index, best_second_index;
+#endif
+
+  int mb_mode_context[MAX_REF_FRAMES];
+
+  SPLITMV_PARTITIONING_TYPE partitioning;
+  unsigned char mb_skip_coeff; /* does this mb have coefficients at all, 1=no coefficients, 0=need decode tokens */
+  unsigned char need_to_clamp_mvs;
+  unsigned char need_to_clamp_secondmv;
+  unsigned char segment_id; /* Which set of segmentation parameters should be used for this MB */
+
+  // Flags used for prediction status of various bitstream signals
+  unsigned char seg_id_predicted;
+  unsigned char ref_predicted;
+
+  // Indicates if the mb is part of the image (1) vs border (0)
+  // This can be useful in determining whether the MB provides
+  // a valid predictor
+  unsigned char mb_in_image;
+
+#if CONFIG_PRED_FILTER
+  // Flag to turn prediction signal filter on (1) / off (0) at the MB level
+  unsigned int pred_filter_enabled;
+#endif
+  INTERPOLATIONFILTERTYPE interp_filter;
+
+#if CONFIG_SUPERBLOCKS
+  // FIXME need a SB array of 4 MB_MODE_INFOs that
+  // only needs one encoded_as_sb.
+  unsigned char encoded_as_sb;
+#endif
+} MB_MODE_INFO;
+
+typedef struct {
+  MB_MODE_INFO mbmi;
+  union b_mode_info bmi[16];
+} MODE_INFO;
+
+typedef struct blockd {
+  short *qcoeff;
+  short *dqcoeff;
+  unsigned char *predictor;
+  short *diff;
+  short *dequant;
+
+  /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
+  unsigned char **base_pre;
+  unsigned char **base_second_pre;
+  int pre;
+  int pre_stride;
+
+  unsigned char **base_dst;
+  int dst;
+  int dst_stride;
+
+  int eob;
+
+  union b_mode_info bmi;
+} BLOCKD;
+
+typedef struct macroblockd {
+  DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
+  DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+  DECLARE_ALIGNED(16, short, qcoeff[400]);
+  DECLARE_ALIGNED(16, short, dqcoeff[400]);
+  DECLARE_ALIGNED(16, unsigned short, eobs[25]);
+
+  /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
+  BLOCKD block[25];
+  int fullpixel_mask;
+
+  YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
+  struct {
+    uint8_t *y_buffer, *u_buffer, *v_buffer;
+  } second_pre;
+  YV12_BUFFER_CONFIG dst;
+
+  MODE_INFO *prev_mode_info_context;
+  MODE_INFO *mode_info_context;
+  int mode_info_stride;
+
+  FRAME_TYPE frame_type;
+
+  int up_available;
+  int left_available;
+
+  /* Y,U,V,Y2 */
+  ENTROPY_CONTEXT_PLANES *above_context;
+  ENTROPY_CONTEXT_PLANES *left_context;
+
+  /* 0 indicates segmentation at MB level is not enabled. Otherwise the individual bits indicate which features are active. */
+  unsigned char segmentation_enabled;
+
+  /* 0 (do not update) 1 (update) the macroblock segmentation map. */
+  unsigned char update_mb_segmentation_map;
+
+  /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */
+  unsigned char update_mb_segmentation_data;
+
+  /* 0 (SEGMENT_DELTADATA) 1 (SEGMENT_ABSDATA): how segment feature data is interpreted. */
+  unsigned char mb_segment_abs_delta;
+
+  /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */
+  /* are enabled and when enabled the probabilities used to decode the per MB flags in MB_MODE_INFO */
+
+  // Probability Tree used to code Segment number
+  vp9_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS];
+
+#if CONFIG_NEW_MVREF
+  vp9_prob mb_mv_ref_id_probs[MAX_REF_FRAMES][3];
+#endif
+
+  // Segment features
+  signed char segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX];
+  unsigned int segment_feature_mask[MAX_MB_SEGMENTS];
+
+  /* mode_based Loop filter adjustment */
+  unsigned char mode_ref_lf_delta_enabled;
+  unsigned char mode_ref_lf_delta_update;
+
+  /* Delta values have the range +/- MAX_LOOP_FILTER */
+  signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+  signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; /* 0 = Intra, Last, GF, ARF */
+  signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+  signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; /* 0 = BPRED, ZERO_MV, MV, SPLIT */
+
+  /* Distance of MB away from frame edges */
+  int mb_to_left_edge;
+  int mb_to_right_edge;
+  int mb_to_top_edge;
+  int mb_to_bottom_edge;
+
+  unsigned int frames_since_golden;
+  unsigned int frames_till_alt_ref_frame;
+
+  /* Inverse transform function pointers. */
+  void (*inv_xform4x4_1_x8)(short *input, short *output, int pitch);
+  void (*inv_xform4x4_x8)(short *input, short *output, int pitch);
+  void (*inv_walsh4x4_1)(short *in, short *out);
+  void (*inv_walsh4x4_lossless)(short *in, short *out);
+
+
+  vp9_subpix_fn_t subpixel_predict;
+  vp9_subpix_fn_t subpixel_predict8x4;
+  vp9_subpix_fn_t subpixel_predict8x8;
+  vp9_subpix_fn_t subpixel_predict16x16;
+  vp9_subpix_fn_t subpixel_predict_avg;
+  vp9_subpix_fn_t subpixel_predict_avg8x4;
+  vp9_subpix_fn_t subpixel_predict_avg8x8;
+  vp9_subpix_fn_t subpixel_predict_avg16x16;
+  int allow_high_precision_mv;
+
+  int corrupted;
+
+#if !CONFIG_SUPERBLOCKS && (ARCH_X86 || ARCH_X86_64)
+  /* This is an intermediate buffer currently used in sub-pixel motion search
+   * to keep a copy of the reference area. This buffer can be used for other
+   * purposes.
+ */ + DECLARE_ALIGNED(32, unsigned char, y_buf[22 * 32]); +#endif + +#if CONFIG_RUNTIME_CPU_DETECT + struct VP9_COMMON_RTCD *rtcd; +#endif + + int mb_index; // Index of the MB in the SB (0..3) + int q_index; + +} MACROBLOCKD; + +#define ACTIVE_HT 110 // quantization stepsize threshold + +#define ACTIVE_HT8 300 + +#define ACTIVE_HT16 300 + +// convert MB_PREDICTION_MODE to B_PREDICTION_MODE +static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { + B_PREDICTION_MODE b_mode; + switch (mode) { + case DC_PRED: + b_mode = B_DC_PRED; + break; + case V_PRED: + b_mode = B_VE_PRED; + break; + case H_PRED: + b_mode = B_HE_PRED; + break; + case TM_PRED: + b_mode = B_TM_PRED; + break; + case D45_PRED: + b_mode = B_LD_PRED; + break; + case D135_PRED: + b_mode = B_RD_PRED; + break; + case D117_PRED: + b_mode = B_VR_PRED; + break; + case D153_PRED: + b_mode = B_HD_PRED; + break; + case D27_PRED: + b_mode = B_HU_PRED; + break; + case D63_PRED: + b_mode = B_VL_PRED; + break; + default : + // for debug purpose, to be removed after full testing + assert(0); + break; + } + return b_mode; +} + +// transform mapping +static TX_TYPE txfm_map(B_PREDICTION_MODE bmode) { + // map transform type + TX_TYPE tx_type; + switch (bmode) { + case B_TM_PRED : + case B_RD_PRED : + tx_type = ADST_ADST; + break; + + case B_VE_PRED : + case B_VR_PRED : + tx_type = ADST_DCT; + break; + + case B_HE_PRED : + case B_HD_PRED : + case B_HU_PRED : + tx_type = DCT_ADST; + break; + +#if CONFIG_NEWBINTRAMODES + case B_CONTEXT_PRED: + assert(0); + break; +#endif + + default : + tx_type = DCT_DCT; + break; + } + return tx_type; +} + +static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { + TX_TYPE tx_type = DCT_DCT; + if (xd->mode_info_context->mbmi.mode == B_PRED && + xd->q_index < ACTIVE_HT) { + tx_type = txfm_map( +#if CONFIG_NEWBINTRAMODES + b->bmi.as_mode.first == B_CONTEXT_PRED ? b->bmi.as_mode.context : +#endif + b->bmi.as_mode.first); + } + return tx_type; +} + +static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) { + TX_TYPE tx_type = DCT_DCT; + if (xd->mode_info_context->mbmi.mode == I8X8_PRED && + xd->q_index < ACTIVE_HT8) { + // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged + // or the relationship otherwise modified to address this type conversion. 
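+    // The mapping mirrors txfm_map() above: the ADST is applied along the
+    // prediction direction, where the residual tends to grow away from the
+    // predicted edge, and the DCT across it. E.g. a vertically predicted
+    // block (B_VE_PRED) maps to ADST_DCT (ADST down the columns, DCT along
+    // the rows), while B_TM_PRED, predicted from both sides, maps to
+    // ADST_ADST.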
+    tx_type = txfm_map(pred_mode_conv(
+        (MB_PREDICTION_MODE)b->bmi.as_mode.first));
+  }
+  return tx_type;
+}
+
+static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  if (xd->mode_info_context->mbmi.mode < I8X8_PRED &&
+#if CONFIG_SUPERBLOCKS
+      !xd->mode_info_context->mbmi.encoded_as_sb &&
+#endif
+      xd->q_index < ACTIVE_HT16) {
+    tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode));
+  }
+  return tx_type;
+}
+
+static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) {
+  TX_TYPE tx_type = DCT_DCT;
+  int ib = (int)(b - xd->block);
+  if (ib >= 16)
+    return tx_type;
+  if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) {
+    tx_type = get_tx_type_16x16(xd, b);
+  }
+  if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
+    ib = (ib & 8) + ((ib & 4) >> 1);
+    tx_type = get_tx_type_8x8(xd, &xd->block[ib]);
+  }
+  if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) {
+    tx_type = get_tx_type_4x4(xd, b);
+  }
+  return tx_type;
+}
+
+extern void vp9_build_block_doffsets(MACROBLOCKD *xd);
+extern void vp9_setup_block_dptrs(MACROBLOCKD *xd);
+
+static void update_blockd_bmi(MACROBLOCKD *xd) {
+  int i;
+  int is_4x4;
+  is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
+           (xd->mode_info_context->mbmi.mode == I8X8_PRED) ||
+           (xd->mode_info_context->mbmi.mode == B_PRED);
+
+  if (is_4x4) {
+    for (i = 0; i < 16; i++) {
+      xd->block[i].bmi = xd->mode_info_context->bmi[i];
+    }
+  }
+}
+#endif  /* __INC_BLOCKD_H */
diff --git a/vp9/common/vp9_coefupdateprobs.h b/vp9/common/vp9_coefupdateprobs.h
new file mode 100644
index 000000000..cd7eabfa5
--- /dev/null
+++ b/vp9/common/vp9_coefupdateprobs.h
@@ -0,0 +1,16 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* Update probabilities for the nodes in the token entropy tree.
+   Generated file included by vp9_entropy.c */
+#define COEF_UPDATE_PROB 252
+#define COEF_UPDATE_PROB_8X8 252
+#define COEF_UPDATE_PROB_16X16 252
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
new file mode 100644
index 000000000..61c0e25d8
--- /dev/null
+++ b/vp9/common/vp9_common.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef common_h
+#define common_h 1
+
+#include <assert.h>
+#include "vpx_config.h"
+/* Interface header for common constant data structures and lookup tables */
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9_common_types.h"
+
+/* Only need this for fixed-size arrays, for structs just assign. */
+
+#define vp9_copy( Dest, Src) { \
+    assert( sizeof( Dest) == sizeof( Src)); \
+    vpx_memcpy( Dest, Src, sizeof( Src)); \
+  }
+
+/* Use this for variably-sized arrays.
*/ + +#define vp9_copy_array( Dest, Src, N) { \ + assert( sizeof( *Dest) == sizeof( *Src)); \ + vpx_memcpy( Dest, Src, N * sizeof( *Src)); \ + } + +#define vp9_zero( Dest) vpx_memset( &Dest, 0, sizeof( Dest)); + +#define vp9_zero_array( Dest, N) vpx_memset( Dest, 0, N * sizeof( *Dest)); + +#endif /* common_h */ diff --git a/vp9/common/vp9_common_types.h b/vp9/common/vp9_common_types.h new file mode 100644 index 000000000..4e6248697 --- /dev/null +++ b/vp9/common/vp9_common_types.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_COMMON_TYPES +#define __INC_COMMON_TYPES + +#define TRUE 1 +#define FALSE 0 + +#endif diff --git a/vp9/common/vp9_context.c b/vp9/common/vp9_context.c new file mode 100644 index 000000000..90a1f796f --- /dev/null +++ b/vp9/common/vp9_context.c @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9_entropy.h" + +/* *** GENERATED FILE: DO NOT EDIT *** */ + +#if 0 +int Contexts[vp8_coef_counter_dimen]; + +const int default_contexts[vp8_coef_counter_dimen] = { + { + // Block Type ( 0 ) + { + // Coeff Band ( 0 ) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + }, + { + // Coeff Band ( 1 ) + {30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593}, + {26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987}, + {10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104}, + }, + { + // Coeff Band ( 2 ) + {25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0}, + {9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294}, + {1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879}, + }, + { + // Coeff Band ( 3 ) + {26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0}, + {8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302}, + { 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611}, + }, + { + // Coeff Band ( 4 ) + {10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0}, + {2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073}, + { 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50}, + }, + { + // Coeff Band ( 5 ) + {10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0}, + {2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362}, + { 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190}, + }, + { + // Coeff Band ( 6 ) + {40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0}, + {6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164}, + { 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345}, + }, + { + // Coeff Band ( 7 ) + { 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8}, + }, + }, + { + // Block Type ( 1 ) + { + // Coeff Band ( 0 ) + {3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289}, + {8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914}, + {9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620}, + }, + { + // Coeff Band ( 1 ) + {12419, 8420, 452, 
62, 9, 1, 0, 0, 0, 0, 0, 0}, + {11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988}, + {7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136}, + }, + { + // Coeff Band ( 2 ) + {15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0}, + {7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980}, + {1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429}, + }, + { + // Coeff Band ( 3 ) + {19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0}, + {9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820}, + {1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679}, + }, + { + // Coeff Band ( 4 ) + {12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0}, + {4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127}, + { 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101}, + }, + { + // Coeff Band ( 5 ) + {12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0}, + {4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157}, + { 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198}, + }, + { + // Coeff Band ( 6 ) + {61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0}, + {15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195}, + { 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507}, + }, + { + // Coeff Band ( 7 ) + { 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641}, + { 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30}, + }, + }, + { + // Block Type ( 2 ) + { + // Coeff Band ( 0 ) + { 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798}, + {1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837}, + {1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122}, + }, + { + // Coeff Band ( 1 ) + {1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0}, + {1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063}, + {1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047}, + }, + { + // Coeff Band ( 2 ) + { 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0}, + { 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404}, + { 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236}, + }, + { + // Coeff Band ( 3 ) + { 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157}, + { 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300}, + }, + { + // Coeff Band ( 4 ) + { 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427}, + { 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7}, + }, + { + // Coeff Band ( 5 ) + { 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652}, + { 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30}, + }, + { + // Coeff Band ( 6 ) + { 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517}, + { 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3}, + }, + { + // Coeff Band ( 7 ) + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + }, + }, + { + // Block Type ( 3 ) + { + // Coeff Band ( 0 ) + {2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694}, + {8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572}, + {11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284}, + }, + { + // Coeff Band ( 1 ) + {9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0}, + {12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280}, + {10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460}, + }, + { + // Coeff Band ( 2 ) + {6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0}, + {6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539}, + {3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138}, + }, + { + // Coeff Band ( 3 ) + {11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0}, + {9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181}, + {4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 
1267}, + }, + { + // Coeff Band ( 4 ) + {4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0}, + {3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401}, + {1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268}, + }, + { + // Coeff Band ( 5 ) + {8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0}, + {3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811}, + {1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527}, + }, + { + // Coeff Band ( 6 ) + {27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0}, + {5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954}, + {1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979}, + }, + { + // Coeff Band ( 7 ) + { 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459}, + { 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13}, + }, + }, +}; + +// Update probabilities for the nodes in the token entropy tree. +const vp9_prob tree_update_probs[vp9_coef_tree_dimen] = { + { + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255, }, + {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255, }, + {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 
255, 255, 255, 255, }, + }, + { + {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255, }, + {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255, }, + {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255, }, + }, + { + {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, + { + { + {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255, }, + {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, }, + {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + { + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, }, + }, + }, +}; +#endif diff --git 
a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
new file mode 100644
index 000000000..058abb57e
--- /dev/null
+++ b/vp9/common/vp9_debugmodes.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include "vp9_blockd.h"
+
+void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
+                                        int frame) {
+  int mb_row;
+  int mb_col;
+  int mb_index = 0;
+  FILE *mvs = fopen("mvs.stt", "a");
+
+  /* print out the macroblock Y modes */
+  mb_index = 0;
+  fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.mode);
+
+      mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+    mb_index++;
+  }
+
+  fprintf(mvs, "\n");
+
+  mb_index = 0;
+  fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.ref_frame);
+
+      mb_index++;
+    }
+
+    fprintf(mvs, "\n");
+    mb_index++;
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the macroblock UV modes */
+  mb_index = 0;
+  fprintf(mvs, "UV Modes for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+
+      fprintf(mvs, "%2d ", mi[mb_index].mbmi.uv_mode);
+
+      mb_index++;
+    }
+
+    mb_index++;
+    fprintf(mvs, "\n");
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the block modes */
+  mb_index = 0;
+  fprintf(mvs, "Mbs for Frame %d\n", frame);
+  {
+    int b_row;
+
+    for (b_row = 0; b_row < 4 * rows; b_row++) {
+      int b_col;
+      int bindex;
+
+      for (b_col = 0; b_col < 4 * cols; b_col++) {
+        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+        bindex = (b_row & 3) * 4 + (b_col & 3);
+
+        if (mi[mb_index].mbmi.mode == B_PRED) {
+          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.first);
+#if CONFIG_COMP_INTRA_PRED
+          fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode.second);
+#endif
+        } else
+          fprintf(mvs, "xx ");
+
+      }
+
+      fprintf(mvs, "\n");
+    }
+  }
+  fprintf(mvs, "\n");
+
+  /* print out the macroblock mvs */
+  mb_index = 0;
+  fprintf(mvs, "MVs for Frame %d\n", frame);
+
+  for (mb_row = 0; mb_row < rows; mb_row++) {
+    for (mb_col = 0; mb_col < cols; mb_col++) {
+      fprintf(mvs, "%5d:%-5d", mi[mb_index].mbmi.mv[0].as_mv.row / 2,
+              mi[mb_index].mbmi.mv[0].as_mv.col / 2);
+
+      mb_index++;
+    }
+
+    mb_index++;
+    fprintf(mvs, "\n");
+  }
+
+  fprintf(mvs, "\n");
+
+  /* print out the block motion vectors */
+  mb_index = 0;
+  fprintf(mvs, "MVs for Frame %d\n", frame);
+  {
+    int b_row;
+
+    for (b_row = 0; b_row < 4 * rows; b_row++) {
+      int b_col;
+      int bindex;
+
+      for (b_col = 0; b_col < 4 * cols; b_col++) {
+        mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2);
+        bindex = (b_row & 3) * 4 + (b_col & 3);
+        fprintf(mvs, "%3d:%-3d ",
+                mi[mb_index].bmi[bindex].as_mv.first.as_mv.row,
+                mi[mb_index].bmi[bindex].as_mv.first.as_mv.col);
+
+      }
+
+      fprintf(mvs, "\n");
+    }
+  }
+  fprintf(mvs, "\n");
+
+  fclose(mvs);
+}
diff --git a/vp9/common/vp9_default_coef_probs.h b/vp9/common/vp9_default_coef_probs.h
new file mode 100644
index 000000000..52fb02f36
--- /dev/null
+++ b/vp9/common/vp9_default_coef_probs.h
@@
-0,0 +1,1377 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. +*/ + + +/*Generated file, included by vp9_entropy.c*/ + + +static const vp9_prob default_coef_probs [BLOCK_TYPES] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = { + { + /* Block Type ( 0 ) */ + { + /* Coeff Band ( 0 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 1 )*/ + { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, + { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, + { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }, + { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 2 )*/ + { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, + { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, + { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }, + { 64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 }, + }, + { + /* Coeff Band ( 3 )*/ + { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, + { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, + { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }, + { 64, 100, 216, 255, 236, 230, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 4 )*/ + { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, + { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, + { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }, + { 28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 5 )*/ + { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, + { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 }, + { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }, + { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 6 )*/ + { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, + { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, + { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }, + { 64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + } + }, + { + /* Block Type ( 1 ) */ + { + /* Coeff Band ( 0 )*/ + { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, + { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, + { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }, + { 48, 32, 146, 208, 149, 167, 221, 162, 255, 223, 128 }, + }, + { + /* Coeff Band ( 1 )*/ + { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, + { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, + { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }, + { 66, 90, 181, 242, 176, 190, 249, 202, 255, 255, 128 }, + }, + { + /* Coeff Band ( 2 )*/ + { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, + { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, + { 23, 91, 163, 
242, 170, 187, 247, 210, 255, 255, 128 }, + { 18, 80, 163, 242, 170, 187, 247, 210, 255, 255, 128 }, + }, + { + /* Coeff Band ( 3 )*/ + { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, + { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, + { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }, + { 36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 4 )*/ + { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, + { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, + { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }, + { 18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 }, + }, + { + /* Coeff Band ( 5 )*/ + { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, + { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, + { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }, + { 28, 70, 181, 251, 193, 211, 255, 205, 128, 128, 128 }, + }, + { + /* Coeff Band ( 6 )*/ + { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 }, + { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, + { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }, + { 40, 90, 188, 251, 195, 217, 255, 224, 128, 128, 128 }, + }, + { + /* Coeff Band ( 7 )*/ + { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }, + } + }, + { + /* Block Type ( 2 ) */ + { + /* Coeff Band ( 0 )*/ + { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 }, + { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, + { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }, + { 64, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }, + }, + { + /* Coeff Band ( 1 )*/ + { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, + { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, + { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }, + { 140, 70, 195, 248, 188, 195, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 2 )*/ + { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, + { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, + { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }, + { 60, 40, 190, 239, 201, 218, 255, 228, 128, 128, 128 }, + }, + { + /* Coeff Band ( 3 )*/ + { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 4 )*/ + { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 5 )*/ + { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 6 )*/ + { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 48, 85, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 7 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 
128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + } + }, + { + /* Block Type ( 3 ) */ + { + /* Coeff Band ( 0 )*/ + { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, + { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, + { 63, 48, 138, 219, 151, 178, 240, 170, 255, 216, 128 }, + { 54, 40, 138, 219, 151, 178, 240, 170, 255, 216, 128 }, + }, + { + /* Coeff Band ( 1 )*/ + { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, + { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, + { 44, 84, 162, 232, 172, 180, 245, 178, 255, 255, 128 }, + { 32, 70, 162, 232, 172, 180, 245, 178, 255, 255, 128 }, + }, + { + /* Coeff Band ( 2 )*/ + { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, + { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, + { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }, + { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }, + }, + { + /* Coeff Band ( 3 )*/ + { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 }, + { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, + { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }, + { 26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 }, + }, + { + /* Coeff Band ( 4 )*/ + { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, + { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }, + }, + { + /* Coeff Band ( 5 )*/ + { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, + { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, + { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }, + { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 6 )*/ + { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, + { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }, + }, + { + /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + } + } +}; + +static const vp9_prob default_hybrid_coef_probs [BLOCK_TYPES] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = { + { + /* Block Type ( 0 ) */ + { + /* Coeff Band ( 0 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 1 )*/ + { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 }, + { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 }, + { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }, + { 90, 116, 227, 252, 214, 209, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 2 )*/ + { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 }, + { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 }, + { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }, + { 64, 128, 202, 247, 198, 180, 255, 219, 128, 128, 128 }, + }, + { + /* Coeff Band ( 3 )*/ + { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 }, + { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 }, + { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }, + { 64, 100, 216, 255, 236, 230, 128, 128, 128, 
128, 128 }, + }, + { + /* Coeff Band ( 4 )*/ + { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 }, + { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 }, + { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }, + { 28, 110, 196, 243, 228, 255, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 5 )*/ + { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 }, + { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 }, + { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }, + { 90, 90, 231, 255, 211, 171, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 6 )*/ + { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 }, + { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 }, + { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }, + { 64, 120, 211, 255, 194, 224, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + } + }, + { + /* Block Type ( 1 ) */ + { + /* Coeff Band ( 0 )*/ + { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 }, + { 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 }, + { 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }, + { 48, 32, 146, 208, 149, 167, 221, 162, 255, 223, 128 }, + }, + { + /* Coeff Band ( 1 )*/ + { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 }, + { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 }, + { 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }, + { 66, 90, 181, 242, 176, 190, 249, 202, 255, 255, 128 }, + }, + { + /* Coeff Band ( 2 )*/ + { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 }, + { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 }, + { 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }, + { 18, 80, 163, 242, 170, 187, 247, 210, 255, 255, 128 }, + }, + { + /* Coeff Band ( 3 )*/ + { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 }, + { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 }, + { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }, + { 36, 120, 201, 253, 205, 192, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 4 )*/ + { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 }, + { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 }, + { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }, + { 18, 90, 174, 245, 186, 161, 255, 199, 128, 128, 128 }, + }, + { + /* Coeff Band ( 5 )*/ + { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 }, + { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 }, + { 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }, + { 28, 70, 181, 251, 193, 211, 255, 205, 128, 128, 128 }, + }, + { + /* Coeff Band ( 6 )*/ + { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 }, + { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 }, + { 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }, + { 40, 90, 188, 251, 195, 217, 255, 224, 128, 128, 128 }, + }, + { + /* Coeff Band ( 7 )*/ + { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }, + { 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }, + } + }, + { + /* Block Type ( 2 ) */ + { + /* Coeff Band ( 0 )*/ + { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 }, + { 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 }, + { 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }, + { 64, 17, 
171, 221, 161, 179, 236, 167, 255, 234, 128 }, + }, + { + /* Coeff Band ( 1 )*/ + { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 }, + { 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 }, + { 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }, + { 140, 70, 195, 248, 188, 195, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 2 )*/ + { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 }, + { 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 }, + { 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }, + { 60, 40, 190, 239, 201, 218, 255, 228, 128, 128, 128 }, + }, + { + /* Coeff Band ( 3 )*/ + { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 }, + { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 132, 118, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 4 )*/ + { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 5 )*/ + { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 6 )*/ + { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 }, + { 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 48, 85, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + }, + { + /* Coeff Band ( 7 )*/ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }, + } + }, + { + /* Block Type ( 3 ) */ + { + /* Coeff Band ( 0 )*/ + { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 }, + { 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 }, + { 63, 48, 138, 219, 151, 178, 240, 170, 255, 216, 128 }, + { 54, 40, 138, 219, 151, 178, 240, 170, 255, 216, 128 }, + }, + { + /* Coeff Band ( 1 )*/ + { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 }, + { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 }, + { 44, 84, 162, 232, 172, 180, 245, 178, 255, 255, 128 }, + { 32, 70, 162, 232, 172, 180, 245, 178, 255, 255, 128 }, + }, + { + /* Coeff Band ( 2 )*/ + { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 }, + { 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 }, + { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }, + { 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }, + }, + { + /* Coeff Band ( 3 )*/ + { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 }, + { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 }, + { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }, + { 26, 104, 170, 242, 183, 194, 254, 223, 255, 255, 128 }, + }, + { + /* Coeff Band ( 4 )*/ + { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 }, + { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }, + { 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }, + }, + { + /* Coeff Band ( 5 )*/ + { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 }, + { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 }, + { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }, + 
{ 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }, + }, + { + /* Coeff Band ( 6 )*/ + { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 }, + { 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }, + { 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }, + }, + { + /* Coeff Band ( 7 )*/ + { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + { 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }, + } + } +}; + +static const vp9_prob +default_coef_probs_8x8[BLOCK_TYPES_8X8] +[COEF_BANDS] +[PREV_COEF_CONTEXTS] +[ENTROPY_NODES] = { + { + /* block Type 0 */ + { + /* Coeff Band 0 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 1 */ + { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128}, + { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128}, + { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}, + { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128} + }, + { + /* Coeff Band 2 */ + { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128}, + { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128}, + { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}, + { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128} + }, + { + /* Coeff Band 3 */ + { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128}, + { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128}, + { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}, + { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128} + }, + { + /* Coeff Band 4 */ + { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128}, + { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128}, + { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}, + { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128} + }, + { + /* Coeff Band 5 */ + { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128}, + { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} + }, + { + /* Coeff Band 6 */ + { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128}, + { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} + }, + { + /* Coeff Band 7 */ + { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128}, + { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128}, + { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}, + { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128} + } + }, + { + /* block Type 1 */ + { + /* Coeff Band 0 */ + { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128}, + { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128}, + { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128} + }, + { + /* Coeff Band 1 */ + { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128}, + { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128}, + { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}, + { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128} + }, + { + /* Coeff Band 2 */ + { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128}, + { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128}, + { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}, + { 
37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 3 */ + { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128}, + { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128}, + { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}, + { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 4 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 5 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 6 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 7 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + } + }, + { + /* block Type 2 */ + { + /* Coeff Band 0 */ + { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128}, + { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128}, + { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}, + { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128} + }, + { + /* Coeff Band 1 */ + { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128}, + { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128}, + { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}, + { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128} + }, + { + /* Coeff Band 2 */ + { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128}, + { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128}, + { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128}, + { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128} + }, + { + /* Coeff Band 3 */ + { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128}, + { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128}, + { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}, + { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128} + }, + { + /* Coeff Band 4 */ + { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128}, + { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128}, + { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}, + { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128} + }, + { + /* Coeff Band 5 */ + { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128}, + { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128}, + { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}, + { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128} + }, + { + /* Coeff Band 6 */ + { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128}, + { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128}, + { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}, + { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128} + }, + { + /* Coeff Band 7 */ + { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128}, + { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128}, + { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}, + { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128} + } + }, + { /* block Type 3 */ + { /* Coeff Band 0 */ + { 192, 18, 
155, 172, 145, 164, 192, 135, 246, 223, 255}, + { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255}, + { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128}, + { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128}, + { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128}, + { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128} + }, + { /* Coeff Band 2 */ + { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128}, + { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128}, + { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128}, + { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128} + }, + { /* Coeff Band 3 */ + { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128}, + { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128}, + { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128}, + { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128} + }, + { /* Coeff Band 4 */ + { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128}, + { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128}, + { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128}, + { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128} + }, + { /* Coeff Band 5 */ + { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128}, + { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128}, + { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128}, + { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128} + }, + { /* Coeff Band 6 */ + { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128}, + { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128}, + { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128}, + { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 7 */ + { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128}, + { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128}, + { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + } + } +}; + +static const vp9_prob +default_hybrid_coef_probs_8x8[BLOCK_TYPES_8X8] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = { + { + /* block Type 0 */ + { + /* Coeff Band 0 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 1 */ + { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128}, + { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128}, + { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}, + { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128} + }, + { + /* Coeff Band 2 */ + { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128}, + { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128}, + { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}, + { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128} + }, + { + /* Coeff Band 3 */ + { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128}, + { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128}, + { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}, + { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128} + }, + { + /* Coeff Band 4 */ + { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128}, + { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128}, + { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}, + { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128} + }, + { + /* Coeff Band 5 */ + { 1, 226, 218, 
255, 216, 241, 255, 255, 128, 128, 128}, + { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} + }, + { + /* Coeff Band 6 */ + { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128}, + { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} + }, + { + /* Coeff Band 7 */ + { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128}, + { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128}, + { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}, + { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128} + } + }, + { + /* block Type 1 */ + { + /* Coeff Band 0 */ + { 134, 152, 233, 224, 234, 52, 255, 166, 128, 128, 128}, + { 97, 132, 185, 234, 186, 189, 197, 171, 255, 212, 128}, + { 84, 110, 185, 237, 182, 182, 145, 145, 255, 255, 128} + }, + { + /* Coeff Band 1 */ + { 1, 124, 213, 247, 192, 212, 255, 255, 128, 128, 128}, + { 88, 111, 178, 254, 189, 211, 255, 255, 128, 128, 128}, + { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128}, + { 12, 59, 129, 236, 150, 179, 239, 195, 255, 255, 128} + }, + { + /* Coeff Band 2 */ + { 1, 102, 225, 255, 210, 240, 128, 128, 128, 128, 128}, + { 110, 78, 195, 254, 200, 191, 255, 255, 128, 128, 128}, + { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128}, + { 37, 63, 177, 255, 194, 195, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 3 */ + { 1, 1, 229, 255, 202, 224, 128, 128, 128, 128, 128}, + { 150, 1, 192, 255, 206, 226, 128, 128, 128, 128, 128}, + { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128}, + { 75, 1, 138, 255, 172, 228, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 4 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 5 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 6 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { + /* Coeff Band 7 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + } + }, + { + /* block Type 2 */ + { + /* Coeff Band 0 */ + { 11, 181, 226, 199, 183, 255, 255, 255, 128, 128, 128}, + { 2, 147, 185, 248, 163, 180, 255, 236, 128, 128, 128}, + { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128}, + { 1, 123, 157, 238, 154, 176, 255, 226, 255, 255, 128} + }, + { + /* Coeff Band 1 */ + { 1, 150, 191, 246, 174, 188, 255, 235, 128, 128, 128}, + { 1, 125, 166, 245, 165, 185, 255, 234, 128, 128, 128}, + { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128}, + { 1, 79, 125, 240, 148, 179, 255, 234, 255, 255, 128} + }, + { + /* Coeff Band 2 */ + { 1, 146, 184, 242, 167, 183, 255, 230, 255, 255, 128}, + { 1, 119, 160, 239, 156, 178, 255, 231, 255, 255, 128}, + { 1, 75, 115, 234, 142, 
173, 255, 225, 255, 255, 128}, + { 1, 75, 115, 234, 142, 173, 255, 225, 255, 255, 128} + }, + { + /* Coeff Band 3 */ + { 1, 150, 188, 244, 169, 183, 255, 233, 255, 255, 128}, + { 1, 123, 162, 243, 161, 180, 255, 233, 128, 128, 128}, + { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128}, + { 1, 76, 120, 238, 148, 178, 255, 230, 255, 255, 128} + }, + { + /* Coeff Band 4 */ + { 1, 163, 202, 252, 188, 204, 255, 248, 128, 128, 128}, + { 1, 136, 180, 251, 181, 201, 255, 246, 128, 128, 128}, + { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128}, + { 1, 92, 146, 249, 170, 197, 255, 245, 128, 128, 128} + }, + { + /* Coeff Band 5 */ + { 1, 156, 195, 249, 179, 193, 255, 241, 255, 255, 128}, + { 1, 128, 169, 248, 171, 192, 255, 242, 255, 255, 128}, + { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128}, + { 1, 84, 132, 245, 158, 187, 255, 240, 255, 255, 128} + }, + { + /* Coeff Band 6 */ + { 1, 36, 71, 251, 192, 201, 255, 243, 255, 255, 128}, + { 1, 49, 185, 250, 184, 199, 255, 242, 128, 128, 128}, + { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128}, + { 1, 95, 147, 247, 168, 190, 255, 239, 255, 255, 128} + }, + { + /* Coeff Band 7 */ + { 1, 19, 98, 255, 218, 222, 255, 255, 128, 128, 128}, + { 36, 50, 210, 255, 212, 221, 255, 255, 128, 128, 128}, + { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128}, + { 6, 117, 180, 254, 199, 216, 255, 251, 128, 128, 128} + } + }, + { /* block Type 3 */ + { /* Coeff Band 0 */ + { 192, 18, 155, 172, 145, 164, 192, 135, 246, 223, 255}, + { 94, 29, 97, 131, 131, 153, 171, 121, 250, 190, 255}, + { 25, 29, 63, 128, 119, 147, 168, 124, 251, 183, 255}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 1, 108, 192, 220, 186, 173, 255, 194, 255, 255, 128}, + { 123, 104, 188, 221, 165, 171, 247, 180, 255, 255, 128}, + { 23, 76, 152, 216, 154, 166, 226, 182, 255, 209, 128}, + { 1, 26, 52, 162, 109, 152, 208, 144, 255, 231, 128} + }, + { /* Coeff Band 2 */ + { 1, 57, 179, 220, 156, 175, 210, 158, 255, 223, 128}, + { 48, 57, 134, 212, 151, 170, 219, 185, 255, 248, 128}, + { 4, 35, 63, 189, 120, 156, 221, 159, 255, 241, 128}, + { 1, 17, 23, 110, 97, 143, 187, 120, 255, 234, 128} + }, + { /* Coeff Band 3 */ + { 1, 115, 205, 243, 182, 187, 254, 218, 255, 255, 128}, + { 80, 101, 186, 241, 183, 186, 249, 182, 255, 255, 128}, + { 10, 81, 144, 229, 164, 175, 241, 185, 255, 255, 128}, + { 1, 44, 81, 192, 130, 148, 240, 180, 255, 255, 128} + }, + { /* Coeff Band 4 */ + { 1, 161, 207, 249, 187, 176, 255, 180, 128, 128, 128}, + { 79, 148, 196, 240, 186, 182, 253, 171, 255, 255, 128}, + { 14, 111, 171, 233, 170, 178, 235, 204, 255, 255, 128}, + { 1, 63, 103, 202, 143, 162, 240, 178, 255, 255, 128} + }, + { /* Coeff Band 5 */ + { 1, 101, 202, 239, 185, 184, 252, 186, 255, 255, 128}, + { 43, 67, 166, 237, 178, 190, 246, 194, 255, 255, 128}, + { 4, 49, 85, 220, 140, 168, 253, 182, 255, 255, 128}, + { 1, 24, 35, 144, 93, 135, 239, 159, 255, 253, 128} + }, + { /* Coeff Band 6 */ + { 1, 212, 243, 255, 240, 234, 255, 255, 128, 128, 128}, + { 98, 168, 234, 255, 229, 234, 255, 255, 128, 128, 128}, + { 19, 127, 199, 255, 212, 198, 255, 255, 128, 128, 128}, + { 1, 103, 162, 253, 186, 151, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 7 */ + { 1, 188, 253, 255, 255, 128, 128, 128, 128, 128, 128}, + { 191, 68, 242, 255, 255, 128, 128, 128, 128, 128, 128}, + { 8, 132, 255, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + } + } +}; + +static const vp9_prob + 
default_coef_probs_16x16[BLOCK_TYPES_16X16] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = { + { /* block Type 0 */ + { /* Coeff Band 0 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128}, + { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128}, + { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}, + { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128} + }, + { /* Coeff Band 2 */ + { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128}, + { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128}, + { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}, + { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 3 */ + { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128}, + { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128}, + { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}, + { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 4 */ + { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128}, + { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128}, + { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}, + { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128} + }, + { /* Coeff Band 5 */ + { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128}, + { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} + }, + { /* Coeff Band 6 */ + { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128}, + { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} + }, + { /* Coeff Band 7 */ + { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128}, + { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128}, + { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128}, + { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128} + } + }, + { /* block Type 1 */ + { /* Coeff Band 0 */ + { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128}, + { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255}, + { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128}, + { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128}, + { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128}, + { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128} + }, + { /* Coeff Band 2 */ + { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128}, + { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128}, + { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128}, + { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128} + }, + { /* Coeff Band 3 */ + { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128}, + { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128}, + { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128}, + { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128} + }, + { /* Coeff Band 4 */ + { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128}, + { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128}, + { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128}, + { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 5 */ + { 
1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128}, + { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128}, + { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128}, + { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128} + }, + { /* Coeff Band 6 */ + { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128}, + { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128}, + { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128}, + { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128} + }, + { /* Coeff Band 7 */ + { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128}, + { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128}, + { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128}, + { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128} + } + }, + { /* block Type 2 */ + { /* Coeff Band 0 */ + { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128}, + { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255}, + { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128}, + { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128}, + { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128}, + { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128} + }, + { /* Coeff Band 2 */ + { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128}, + { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128}, + { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128}, + { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128} + }, + { /* Coeff Band 3 */ + { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128}, + { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128}, + { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128}, + { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128} + }, + { /* Coeff Band 4 */ + { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128}, + { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128}, + { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128}, + { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 5 */ + { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128}, + { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128}, + { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128}, + { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128} + }, + { /* Coeff Band 6 */ + { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128}, + { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128}, + { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128}, + { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128} + }, + { /* Coeff Band 7 */ + { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128}, + { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128}, + { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128}, + { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128} + } + }, + { /* block Type 3 */ + { /* Coeff Band 0 */ + { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184}, + { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200}, + { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128}, + { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128}, + { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255}, + { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255} + }, + { /* Coeff Band 2 */ + { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128}, + { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255}, + { 8, 65, 111, 
210, 143, 166, 230, 167, 255, 224, 255}, + { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255} + }, + { /* Coeff Band 3 */ + { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128}, + { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128}, + { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128}, + { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255} + }, + { /* Coeff Band 4 */ + { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128}, + { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128}, + { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128}, + { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128} + }, + { /* Coeff Band 5 */ + { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128}, + { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128}, + { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128}, + { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128} + }, + { /* Coeff Band 6 */ + { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128}, + { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128}, + { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128}, + { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255} + }, + { /* Coeff Band 7 */ + { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128}, + { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128}, + { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128}, + { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128} + } + } +}; + +static const vp9_prob + default_hybrid_coef_probs_16x16[BLOCK_TYPES_16X16] + [COEF_BANDS] + [PREV_COEF_CONTEXTS] + [ENTROPY_NODES] = { + { /* block Type 0 */ + { /* Coeff Band 0 */ + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 60, 140, 195, 255, 212, 214, 128, 128, 128, 128, 128}, + { 75, 221, 231, 255, 203, 255, 128, 128, 128, 128, 128}, + { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128}, + { 9, 212, 196, 251, 197, 207, 255, 185, 128, 128, 128} + }, + { /* Coeff Band 2 */ + { 1, 227, 226, 255, 215, 215, 128, 128, 128, 128, 128}, + { 5, 163, 209, 255, 212, 212, 255, 255, 128, 128, 128}, + { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128}, + { 1, 133, 203, 255, 210, 220, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 3 */ + { 1, 226, 225, 255, 228, 236, 128, 128, 128, 128, 128}, + { 6, 163, 208, 255, 224, 234, 255, 255, 128, 128, 128}, + { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128}, + { 1, 122, 196, 253, 212, 248, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 4 */ + { 1, 222, 197, 254, 193, 216, 255, 236, 128, 128, 128}, + { 7, 140, 163, 251, 195, 211, 255, 238, 128, 128, 128}, + { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128}, + { 1, 91, 152, 249, 181, 197, 255, 239, 128, 128, 128} + }, + { /* Coeff Band 5 */ + { 1, 226, 218, 255, 216, 241, 255, 255, 128, 128, 128}, + { 6, 154, 191, 255, 218, 240, 255, 255, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} + }, + { /* Coeff Band 6 */ + { 1, 221, 217, 255, 208, 217, 255, 232, 128, 128, 128}, + { 11, 155, 189, 254, 203, 211, 255, 249, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128}, + { 1, 110, 171, 252, 191, 204, 255, 236, 128, 128, 128} + }, + { /* Coeff Band 7 */ + { 1, 207, 235, 255, 232, 240, 128, 128, 128, 128, 128}, + { 58, 161, 216, 255, 229, 235, 255, 255, 128, 128, 128}, + { 8, 133, 204, 255, 219, 231, 
255, 255, 128, 128, 128}, + { 8, 133, 204, 255, 219, 231, 255, 255, 128, 128, 128} + } + }, + { /* block Type 1 */ + { /* Coeff Band 0 */ + { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128}, + { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255}, + { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128}, + { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128}, + { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128}, + { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128} + }, + { /* Coeff Band 2 */ + { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128}, + { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128}, + { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128}, + { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128} + }, + { /* Coeff Band 3 */ + { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128}, + { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128}, + { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128}, + { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128} + }, + { /* Coeff Band 4 */ + { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128}, + { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128}, + { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128}, + { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 5 */ + { 1, 194, 241, 254, 228, 214, 248, 237, 255, 255, 128}, + { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128}, + { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128}, + { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128} + }, + { /* Coeff Band 6 */ + { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128}, + { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128}, + { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128}, + { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128} + }, + { /* Coeff Band 7 */ + { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128}, + { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128}, + { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128}, + { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128} + } + }, + { /* block Type 2 */ + { /* Coeff Band 0 */ + { 1, 30, 103, 204, 142, 168, 235, 161, 255, 228, 128}, + { 1, 35, 90, 192, 130, 161, 227, 158, 255, 226, 255}, + { 1, 36, 78, 180, 122, 156, 221, 153, 255, 222, 255}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 1, 163, 228, 253, 212, 194, 255, 205, 128, 128, 128}, + { 67, 160, 226, 253, 210, 202, 245, 172, 255, 255, 128}, + { 51, 147, 219, 251, 207, 207, 255, 217, 128, 128, 128}, + { 25, 107, 175, 245, 183, 190, 254, 209, 255, 255, 128} + }, + { /* Coeff Band 2 */ + { 1, 66, 170, 240, 177, 186, 252, 203, 255, 245, 128}, + { 23, 64, 145, 230, 161, 177, 252, 198, 255, 255, 128}, + { 6, 51, 99, 208, 135, 163, 249, 178, 255, 248, 128}, + { 1, 33, 59, 161, 104, 151, 238, 164, 255, 237, 128} + }, + { /* Coeff Band 3 */ + { 1, 76, 216, 250, 198, 199, 255, 226, 255, 255, 128}, + { 86, 83, 200, 247, 189, 193, 255, 224, 255, 255, 128}, + { 30, 75, 164, 242, 172, 184, 254, 218, 255, 255, 128}, + { 3, 54, 103, 227, 140, 172, 253, 201, 255, 255, 128} + }, + { /* Coeff Band 4 */ + { 1, 241, 247, 255, 233, 223, 255, 255, 128, 128, 128}, + { 78, 212, 242, 255, 226, 230, 255, 255, 128, 128, 128}, + { 10, 167, 224, 255, 217, 225, 255, 128, 128, 128, 128}, + { 1, 104, 176, 250, 166, 219, 255, 255, 128, 128, 128} + }, + { /* Coeff Band 5 */ + { 1, 194, 241, 254, 228, 
214, 248, 237, 255, 255, 128}, + { 95, 133, 228, 254, 218, 215, 255, 229, 128, 128, 128}, + { 24, 119, 201, 252, 202, 205, 255, 229, 128, 128, 128}, + { 1, 88, 155, 246, 183, 193, 255, 205, 128, 128, 128} + }, + { /* Coeff Band 6 */ + { 1, 204, 236, 255, 222, 220, 255, 239, 128, 128, 128}, + { 126, 105, 225, 254, 214, 217, 255, 254, 128, 128, 128}, + { 44, 86, 190, 251, 197, 204, 255, 233, 128, 128, 128}, + { 6, 71, 130, 240, 164, 188, 255, 246, 128, 128, 128} + }, + { /* Coeff Band 7 */ + { 1, 195, 250, 255, 239, 197, 128, 128, 128, 128, 128}, + { 167, 102, 249, 255, 234, 255, 128, 128, 128, 128, 128}, + { 65, 91, 222, 255, 217, 255, 128, 128, 128, 128, 128}, + { 1, 59, 128, 255, 154, 255, 128, 128, 128, 128, 128} + } + }, + { /* block Type 3 */ + { /* Coeff Band 0 */ + { 17, 105, 227, 195, 164, 170, 168, 137, 221, 160, 184}, + { 6, 92, 166, 193, 158, 169, 179, 142, 236, 175, 200}, + { 2, 68, 118, 193, 147, 168, 187, 149, 241, 178, 247}, + { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128} + }, + { /* Coeff Band 1 */ + { 1, 193, 221, 246, 198, 194, 244, 176, 255, 192, 128}, + { 112, 160, 209, 244, 196, 194, 243, 175, 255, 209, 128}, + { 45, 123, 175, 240, 184, 195, 239, 178, 255, 218, 255}, + { 16, 53, 75, 169, 119, 152, 209, 146, 255, 219, 255} + }, + { /* Coeff Band 2 */ + { 1, 141, 183, 240, 176, 187, 246, 198, 255, 218, 128}, + { 36, 97, 150, 231, 161, 180, 243, 191, 255, 217, 255}, + { 8, 65, 111, 210, 143, 166, 230, 167, 255, 224, 255}, + { 2, 35, 61, 157, 113, 149, 208, 142, 255, 217, 255} + }, + { /* Coeff Band 3 */ + { 1, 173, 196, 245, 184, 191, 252, 211, 255, 240, 128}, + { 35, 119, 175, 242, 177, 187, 252, 209, 255, 235, 128}, + { 4, 88, 141, 234, 161, 180, 249, 200, 255, 228, 128}, + { 1, 57, 95, 203, 133, 161, 235, 167, 255, 231, 255} + }, + { /* Coeff Band 4 */ + { 1, 208, 227, 249, 209, 204, 248, 188, 255, 248, 128}, + { 28, 162, 211, 247, 203, 200, 252, 188, 255, 232, 128}, + { 5, 114, 174, 238, 182, 189, 245, 184, 255, 238, 128}, + { 1, 61, 100, 205, 136, 164, 235, 163, 255, 239, 128} + }, + { /* Coeff Band 5 */ + { 1, 195, 218, 252, 208, 207, 250, 205, 255, 245, 128}, + { 22, 141, 196, 249, 198, 201, 250, 202, 255, 244, 128}, + { 2, 105, 163, 240, 178, 189, 246, 191, 255, 246, 128}, + { 1, 70, 112, 206, 144, 167, 232, 162, 255, 239, 128} + }, + { /* Coeff Band 6 */ + { 1, 204, 215, 251, 204, 203, 255, 222, 255, 225, 128}, + { 15, 140, 194, 249, 194, 199, 254, 221, 255, 253, 128}, + { 1, 95, 153, 243, 172, 188, 254, 213, 255, 248, 128}, + { 1, 59, 99, 216, 135, 166, 247, 190, 255, 237, 255} + }, + { /* Coeff Band 7 */ + { 1, 7, 231, 255, 227, 223, 255, 240, 255, 255, 128}, + { 15, 157, 217, 255, 218, 219, 255, 239, 255, 255, 128}, + { 1, 114, 182, 252, 198, 207, 255, 235, 255, 255, 128}, + { 1, 71, 122, 238, 154, 181, 255, 216, 255, 255, 128} + } + } +}; diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c new file mode 100644 index 000000000..e6ee2729e --- /dev/null +++ b/vp9/common/vp9_entropy.c @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include <stdio.h>
+
+#include "vp9_entropy.h"
+#include "string.h"
+#include "vp9_blockd.h"
+#include "vp9_onyxc_int.h"
+#include "vp9_entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define uchar unsigned char /* typedefs can clash */
+#define uint unsigned int
+
+typedef const uchar cuchar;
+typedef const uint cuint;
+
+typedef vp9_prob Prob;
+
+#include "vp9_coefupdateprobs.h"
+
+const int vp9_i8x8_block[4] = {0, 2, 8, 10};
+
+DECLARE_ALIGNED(16, const unsigned char, vp9_norm[256]) = {
+  0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]) = {
+  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7
+};
+
+DECLARE_ALIGNED(16, cuchar, vp9_prev_token_class[MAX_ENTROPY_TOKENS]) = {
+  0, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0
+};
+
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]) = {
+  0, 1, 4, 8,
+  5, 2, 3, 6,
+  9, 12, 13, 10,
+  7, 11, 14, 15,
+};
+
+DECLARE_ALIGNED(16, const int, vp9_col_scan[16]) = {
+  0, 4, 8, 12,
+  1, 5, 9, 13,
+  2, 6, 10, 14,
+  3, 7, 11, 15
+};
+DECLARE_ALIGNED(16, const int, vp9_row_scan[16]) = {
+  0, 1, 2, 3,
+  4, 5, 6, 7,
+  8, 9, 10, 11,
+  12, 13, 14, 15
+};
+
+
+DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]) = {
+  0, 1, 2, 3, 5, 4, 4, 5,
+  5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7
+};
+DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]) = {
+  0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+  12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63,
+};
+
+// Table can be optimized.
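// Editorial aside on the 4x4 tables above (a minimal sketch; not part of
// this patch): for the c-th coefficient visited in scan order, the raster
// position comes from the scan table, while vp9_coef_bands[c] supplies the
// band indexing the middle dimension of the coefficient probability tables.
// `qcoeff` below is a hypothetical 4x4 block of quantized coefficients; the
// token and context machinery is elided.
//
//   int c;
//   for (c = 0; c < 16; ++c) {
//     const int pos  = vp9_default_zig_zag1d[c];   /* raster position */
//     const int band = vp9_coef_bands[c];          /* probability band */
//     /* the token for qcoeff[pos] is coded from coef_probs[type][band][ctx] */
//   }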
+DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]) = {
+  0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6,
+  6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+};
+DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = {
+  0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4, 5,
+  20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22, 37, 52,
+  67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8, 9, 24, 39,
+  54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100, 85, 70, 55, 40,
+  25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131, 146, 161, 176, 192, 177,
+  162, 147, 132, 117, 102, 87, 72, 57, 42, 27, 12, 13, 28, 43, 58, 73,
+  88, 103, 118, 133, 148, 163, 178, 193, 208, 224, 209, 194, 179, 164, 149, 134,
+  119, 104, 89, 74, 59, 44, 29, 14, 15, 30, 45, 60, 75, 90, 105, 120,
+  135, 150, 165, 180, 195, 210, 225, 240, 241, 226, 211, 196, 181, 166, 151, 136,
+  121, 106, 91, 76, 61, 46, 31, 47, 62, 77, 92, 107, 122, 137, 152, 167,
+  182, 197, 212, 227, 242, 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93,
+  78, 63, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230,
+  215, 200, 185, 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201,
+  216, 231, 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188,
+  203, 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+  250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254, 255,
+};
+
+
+/* Array indices are identical to previously-existing CONTEXT_NODE indices */
+
+const vp9_tree_index vp9_coef_tree[ 22] =   /* corresponding _CONTEXT_NODEs */
+{
+  -DCT_EOB_TOKEN, 2,                        /* 0 = EOB           */
+  -ZERO_TOKEN, 4,                           /* 1 = ZERO          */
+  -ONE_TOKEN, 6,                            /* 2 = ONE           */
+  8, 12,                                    /* 3 = LOW_VAL       */
+  -TWO_TOKEN, 10,                           /* 4 = TWO           */
+  -THREE_TOKEN, -FOUR_TOKEN,                /* 5 = THREE         */
+  14, 16,                                   /* 6 = HIGH_LOW      */
+  -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2,   /* 7 = CAT_ONE       */
+  18, 20,                                   /* 8 = CAT_THREEFOUR */
+  -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4,   /* 9 = CAT_THREE     */
+  -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6    /* 10 = CAT_FIVE     */
+};
+
+struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS];
+
+/* Trees for extra bits.
Probabilities are constant and + do not depend on previously encoded bits */ + +static const Prob Pcat1[] = { 159}; +static const Prob Pcat2[] = { 165, 145}; +static const Prob Pcat3[] = { 173, 148, 140}; +static const Prob Pcat4[] = { 176, 155, 140, 135}; +static const Prob Pcat5[] = { 180, 157, 141, 134, 130}; +static const Prob Pcat6[] = +{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129}; + +static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[26]; + +static void init_bit_tree(vp9_tree_index *p, int n) { + int i = 0; + + while (++i < n) { + p[0] = p[1] = i << 1; + p += 2; + } + + p[0] = p[1] = 0; +} + +static void init_bit_trees() { + init_bit_tree(cat1, 1); + init_bit_tree(cat2, 2); + init_bit_tree(cat3, 3); + init_bit_tree(cat4, 4); + init_bit_tree(cat5, 5); + init_bit_tree(cat6, 13); +} + +vp9_extra_bit_struct vp9_extra_bits[12] = { + { 0, 0, 0, 0}, + { 0, 0, 0, 1}, + { 0, 0, 0, 2}, + { 0, 0, 0, 3}, + { 0, 0, 0, 4}, + { cat1, Pcat1, 1, 5}, + { cat2, Pcat2, 2, 7}, + { cat3, Pcat3, 3, 11}, + { cat4, Pcat4, 4, 19}, + { cat5, Pcat5, 5, 35}, + { cat6, Pcat6, 13, 67}, + { 0, 0, 0, 0} +}; + +#include "vp9_default_coef_probs.h" + +void vp9_default_coef_probs(VP9_COMMON *pc) { + vpx_memcpy(pc->fc.coef_probs, default_coef_probs, + sizeof(pc->fc.coef_probs)); + vpx_memcpy(pc->fc.hybrid_coef_probs, default_hybrid_coef_probs, + sizeof(pc->fc.hybrid_coef_probs)); + + vpx_memcpy(pc->fc.coef_probs_8x8, default_coef_probs_8x8, + sizeof(pc->fc.coef_probs_8x8)); + vpx_memcpy(pc->fc.hybrid_coef_probs_8x8, default_hybrid_coef_probs_8x8, + sizeof(pc->fc.hybrid_coef_probs_8x8)); + + vpx_memcpy(pc->fc.coef_probs_16x16, default_coef_probs_16x16, + sizeof(pc->fc.coef_probs_16x16)); + vpx_memcpy(pc->fc.hybrid_coef_probs_16x16, + default_hybrid_coef_probs_16x16, + sizeof(pc->fc.hybrid_coef_probs_16x16)); +} + +void vp9_coef_tree_initialize() { + init_bit_trees(); + vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); +} + +// #define COEF_COUNT_TESTING + +#define COEF_COUNT_SAT 24 +#define COEF_MAX_UPDATE_FACTOR 112 +#define COEF_COUNT_SAT_KEY 24 +#define COEF_MAX_UPDATE_FACTOR_KEY 112 +#define COEF_COUNT_SAT_AFTER_KEY 24 +#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 + +void vp9_adapt_coef_probs(VP9_COMMON *cm) { + int t, i, j, k, count; + unsigned int branch_ct[ENTROPY_NODES][2]; + vp9_prob coef_probs[ENTROPY_NODES]; + int update_factor; /* denominator 256 */ + int factor; + int count_sat; + + // printf("Frame type: %d\n", cm->frame_type); + if (cm->frame_type == KEY_FRAME) { + update_factor = COEF_MAX_UPDATE_FACTOR_KEY; + count_sat = COEF_COUNT_SAT_KEY; + } else if (cm->last_frame_type == KEY_FRAME) { + update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */ + count_sat = COEF_COUNT_SAT_AFTER_KEY; + } else { + update_factor = COEF_MAX_UPDATE_FACTOR; + count_sat = COEF_COUNT_SAT; + } + +#ifdef COEF_COUNT_TESTING + { + printf("static const unsigned int\ncoef_counts" + "[BLOCK_TYPES] [COEF_BANDS]" + "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n"); + for (i = 0; i < BLOCK_TYPES; ++i) { + printf(" {\n"); + for (j = 0; j < COEF_BANDS; ++j) { + printf(" {\n"); + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + printf(" {"); + for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) + printf("%d, ", cm->fc.coef_counts[i][j][k][t]); + printf("},\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + printf("static const unsigned int\ncoef_counts_8x8" + "[BLOCK_TYPES_8X8] [COEF_BANDS]" + "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n"); + for (i = 0; i < 
BLOCK_TYPES_8X8; ++i) { + printf(" {\n"); + for (j = 0; j < COEF_BANDS; ++j) { + printf(" {\n"); + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + printf(" {"); + for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) + printf("%d, ", cm->fc.coef_counts_8x8[i][j][k][t]); + printf("},\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + printf("static const unsigned int\nhybrid_coef_counts" + "[BLOCK_TYPES] [COEF_BANDS]" + "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {\n"); + for (i = 0; i < BLOCK_TYPES; ++i) { + printf(" {\n"); + for (j = 0; j < COEF_BANDS; ++j) { + printf(" {\n"); + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + printf(" {"); + for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) + printf("%d, ", cm->fc.hybrid_coef_counts[i][j][k][t]); + printf("},\n"); + } + printf(" },\n"); + } + printf(" },\n"); + } + printf("};\n"); + } +#endif + + for (i = 0; i < BLOCK_TYPES; ++i) + for (j = 0; j < COEF_BANDS; ++j) + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + continue; + vp9_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, + coef_probs, branch_ct, cm->fc.coef_counts [i][j][k], + 256, 1); + for (t = 0; t < ENTROPY_NODES; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > count_sat ? count_sat : count; + factor = (update_factor * count / count_sat); + prob = ((int)cm->fc.pre_coef_probs[i][j][k][t] * (256 - factor) + + (int)coef_probs[t] * factor + 128) >> 8; + if (prob <= 0) cm->fc.coef_probs[i][j][k][t] = 1; + else if (prob > 255) cm->fc.coef_probs[i][j][k][t] = 255; + else cm->fc.coef_probs[i][j][k][t] = prob; + } + } + + for (i = 0; i < BLOCK_TYPES; ++i) + for (j = 0; j < COEF_BANDS; ++j) + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + continue; + vp9_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, + coef_probs, branch_ct, cm->fc.hybrid_coef_counts [i][j][k], + 256, 1); + for (t = 0; t < ENTROPY_NODES; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > count_sat ? count_sat : count; + factor = (update_factor * count / count_sat); + prob = ((int)cm->fc.pre_hybrid_coef_probs[i][j][k][t] * (256 - factor) + + (int)coef_probs[t] * factor + 128) >> 8; + if (prob <= 0) cm->fc.hybrid_coef_probs[i][j][k][t] = 1; + else if (prob > 255) cm->fc.hybrid_coef_probs[i][j][k][t] = 255; + else cm->fc.hybrid_coef_probs[i][j][k][t] = prob; + } + } + + for (i = 0; i < BLOCK_TYPES_8X8; ++i) + for (j = 0; j < COEF_BANDS; ++j) + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + continue; + vp9_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, + coef_probs, branch_ct, cm->fc.coef_counts_8x8 [i][j][k], + 256, 1); + for (t = 0; t < ENTROPY_NODES; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > count_sat ? 
count_sat : count; + factor = (update_factor * count / count_sat); + prob = ((int)cm->fc.pre_coef_probs_8x8[i][j][k][t] * (256 - factor) + + (int)coef_probs[t] * factor + 128) >> 8; + if (prob <= 0) cm->fc.coef_probs_8x8[i][j][k][t] = 1; + else if (prob > 255) cm->fc.coef_probs_8x8[i][j][k][t] = 255; + else cm->fc.coef_probs_8x8[i][j][k][t] = prob; + } + } + + for (i = 0; i < BLOCK_TYPES_8X8; ++i) + for (j = 0; j < COEF_BANDS; ++j) + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + continue; + vp9_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, + coef_probs, branch_ct, cm->fc.hybrid_coef_counts_8x8 [i][j][k], + 256, 1); + for (t = 0; t < ENTROPY_NODES; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > count_sat ? count_sat : count; + factor = (update_factor * count / count_sat); + prob = ((int)cm->fc.pre_hybrid_coef_probs_8x8[i][j][k][t] * + (256 - factor) + + (int)coef_probs[t] * factor + 128) >> 8; + if (prob <= 0) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 1; + else if (prob > 255) cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = 255; + else cm->fc.hybrid_coef_probs_8x8[i][j][k][t] = prob; + } + } + + for (i = 0; i < BLOCK_TYPES_16X16; ++i) + for (j = 0; j < COEF_BANDS; ++j) + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + continue; + vp9_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, + coef_probs, branch_ct, cm->fc.coef_counts_16x16[i][j][k], 256, 1); + for (t = 0; t < ENTROPY_NODES; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > count_sat ? count_sat : count; + factor = (update_factor * count / count_sat); + prob = ((int)cm->fc.pre_coef_probs_16x16[i][j][k][t] * + (256 - factor) + + (int)coef_probs[t] * factor + 128) >> 8; + if (prob <= 0) cm->fc.coef_probs_16x16[i][j][k][t] = 1; + else if (prob > 255) cm->fc.coef_probs_16x16[i][j][k][t] = 255; + else cm->fc.coef_probs_16x16[i][j][k][t] = prob; + } + } + + for (i = 0; i < BLOCK_TYPES_16X16; ++i) + for (j = 0; j < COEF_BANDS; ++j) + for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { + if (k >= 3 && ((i == 0 && j == 1) || (i > 0 && j == 0))) + continue; + vp9_tree_probs_from_distribution( + MAX_ENTROPY_TOKENS, vp9_coef_encodings, vp9_coef_tree, + coef_probs, branch_ct, cm->fc.hybrid_coef_counts_16x16[i][j][k], 256, 1); + for (t = 0; t < ENTROPY_NODES; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > count_sat ? count_sat : count; + factor = (update_factor * count / count_sat); + prob = ((int)cm->fc.pre_hybrid_coef_probs_16x16[i][j][k][t] * (256 - factor) + + (int)coef_probs[t] * factor + 128) >> 8; + if (prob <= 0) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 1; + else if (prob > 255) cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = 255; + else cm->fc.hybrid_coef_probs_16x16[i][j][k][t] = prob; + } + } +} diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h new file mode 100644 index 000000000..a2053609b --- /dev/null +++ b/vp9/common/vp9_entropy.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_ENTROPY_H +#define __INC_ENTROPY_H + +#include "vp9_treecoder.h" +#include "vp9_blockd.h" +#include "vp9_common.h" +#include "vp9_coefupdateprobs.h" + +extern const int vp9_i8x8_block[4]; + +/* Coefficient token alphabet */ + +#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ +#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ +#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ +#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ +#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ +#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ +#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ +#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ +#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ +#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ +#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 13+1 */ +#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ +#define MAX_ENTROPY_TOKENS 12 +#define ENTROPY_NODES 11 +#define EOSB_TOKEN 127 /* Not signalled, encoder only */ + +#define INTER_MODE_CONTEXTS 7 + +extern const vp9_tree_index vp9_coef_tree[]; + +extern struct vp9_token_struct vp9_coef_encodings[MAX_ENTROPY_TOKENS]; + +typedef struct { + vp9_tree_p tree; + const vp9_prob *prob; + int Len; + int base_val; +} vp9_extra_bit_struct; + +extern vp9_extra_bit_struct vp9_extra_bits[12]; /* indexed by token value */ + +#define PROB_UPDATE_BASELINE_COST 7 + +#define MAX_PROB 255 +#define DCT_MAX_VALUE 8192 + +/* Coefficients are predicted via a 3-dimensional probability table. */ + +/* Outside dimension. 0 = Y no DC, 1 = Y2, 2 = UV, 3 = Y with DC */ +#define BLOCK_TYPES 4 + +#define BLOCK_TYPES_8X8 4 + +#define BLOCK_TYPES_16X16 4 + +/* Middle dimension is a coarsening of the coefficient's + position within the 4x4 DCT. */ + +#define COEF_BANDS 8 +extern DECLARE_ALIGNED(16, const int, vp9_coef_bands[16]); +extern DECLARE_ALIGNED(64, const int, vp9_coef_bands_8x8[64]); +extern DECLARE_ALIGNED(16, const int, vp9_coef_bands_16x16[256]); + +/* Inside dimension is 3-valued measure of nearby complexity, that is, + the extent to which nearby coefficients are nonzero. For the first + coefficient (DC, unless block type is 0), we look at the (already encoded) + blocks above and to the left of the current block. The context index is + then the number (0,1,or 2) of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is roughly the size of the + most recently decoded coefficient (0 for 0, 1 for 1, 2 for >1). + Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). 
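   (An editorial illustration, not in the original comment: for the first
   token, if the block above has nonzero coefficients and the block to the
   left does not, the context index is 1; once a coefficient of magnitude 3
   has been decoded, the context for the next token in this block is 2.)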
*/ + +/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ +#define PREV_COEF_CONTEXTS 4 + +#define SUBEXP_PARAM 4 /* Subexponential code parameter */ +#define MODULUS_PARAM 13 /* Modulus parameter */ + +extern DECLARE_ALIGNED(16, const unsigned char, vp9_prev_token_class[MAX_ENTROPY_TOKENS]); + +struct VP9Common; +void vp9_default_coef_probs(struct VP9Common *); +extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d[16]); + +extern DECLARE_ALIGNED(16, const int, vp9_col_scan[16]); +extern DECLARE_ALIGNED(16, const int, vp9_row_scan[16]); + +extern DECLARE_ALIGNED(64, const int, vp9_default_zig_zag1d_8x8[64]); +void vp9_coef_tree_initialize(void); + +extern DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]); +void vp9_adapt_coef_probs(struct VP9Common *); + +#endif diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c new file mode 100644 index 000000000..69acbab02 --- /dev/null +++ b/vp9/common/vp9_entropymode.c @@ -0,0 +1,713 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9_onyxc_int.h" +#include "vp9_modecont.h" +#include "vpx_mem/vpx_mem.h" + + +static const unsigned int kf_y_mode_cts[8][VP9_YMODES] = { + /* DC V H D45 135 117 153 D27 D63 TM i8x8 BPRED */ + {12, 6, 5, 5, 5, 5, 5, 5, 5, 2, 22, 200}, + {25, 13, 13, 7, 7, 7, 7, 7, 7, 6, 27, 160}, + {31, 17, 18, 8, 8, 8, 8, 8, 8, 9, 26, 139}, + {40, 22, 23, 8, 8, 8, 8, 8, 8, 12, 27, 116}, + {53, 26, 28, 8, 8, 8, 8, 8, 8, 13, 26, 94}, + {68, 33, 35, 8, 8, 8, 8, 8, 8, 17, 20, 68}, + {78, 38, 38, 8, 8, 8, 8, 8, 8, 19, 16, 52}, + {89, 42, 42, 8, 8, 8, 8, 8, 8, 21, 12, 34}, +}; + +static const unsigned int y_mode_cts [VP9_YMODES] = { + /* DC V H D45 135 117 153 D27 D63 TM i8x8 BPRED */ + 98, 19, 15, 14, 14, 14, 14, 12, 12, 13, 16, 70 +}; + +static const unsigned int uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = { + /* DC V H D45 135 117 153 D27 D63 TM */ + { 200, 15, 15, 10, 10, 10, 10, 10, 10, 6}, /* DC */ + { 130, 75, 10, 10, 10, 10, 10, 10, 10, 6}, /* V */ + { 130, 10, 75, 10, 10, 10, 10, 10, 10, 6}, /* H */ + { 130, 15, 10, 75, 10, 10, 10, 10, 10, 6}, /* D45 */ + { 150, 15, 10, 10, 75, 10, 10, 10, 10, 6}, /* D135 */ + { 150, 15, 10, 10, 10, 75, 10, 10, 10, 6}, /* D117 */ + { 150, 15, 10, 10, 10, 10, 75, 10, 10, 6}, /* D153 */ + { 150, 15, 10, 10, 10, 10, 10, 75, 10, 6}, /* D27 */ + { 150, 15, 10, 10, 10, 10, 10, 10, 75, 6}, /* D63 */ + { 160, 30, 30, 10, 10, 10, 10, 10, 10, 16}, /* TM */ + { 132, 46, 40, 10, 10, 10, 10, 10, 10, 18}, /* i8x8 - never used */ + { 150, 35, 41, 10, 10, 10, 10, 10, 10, 10}, /* BPRED */ +}; + +static const unsigned int i8x8_mode_cts [VP9_I8X8_MODES] = { + /* DC V H D45 135 117 153 D27 D63 TM */ + 73, 49, 61, 30, 30, 30, 30, 30, 30, 13 +}; + +static const unsigned int kf_uv_mode_cts [VP9_YMODES] [VP9_UV_MODES] = { + // DC V H D45 135 117 153 D27 D63 TM + { 160, 24, 24, 20, 20, 20, 20, 20, 20, 8}, /* DC */ + { 102, 64, 30, 20, 20, 20, 20, 20, 20, 10}, /* V */ + { 102, 30, 64, 20, 20, 20, 20, 20, 20, 10}, /* H */ + { 102, 33, 20, 64, 20, 20, 20, 20, 20, 14}, /* D45 */ + { 102, 33, 20, 20, 64, 20, 20, 20, 20, 14}, /* D135 */ + { 122, 33, 20, 20, 20, 64, 20, 20, 20, 14}, /* D117 */ + { 102, 33, 20, 20, 20, 
20, 64, 20, 20, 14}, /* D153 */ + { 102, 33, 20, 20, 20, 20, 20, 64, 20, 14}, /* D27 */ + { 102, 33, 20, 20, 20, 20, 20, 20, 64, 14}, /* D63 */ + { 132, 36, 30, 20, 20, 20, 20, 20, 20, 18}, /* TM */ + { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* i8x8 - never used */ + { 122, 41, 35, 20, 20, 20, 20, 20, 20, 18}, /* BPRED */ +}; + +static const unsigned int bmode_cts[VP9_NKF_BINTRAMODES] = { +#if CONFIG_NEWBINTRAMODES +#if CONTEXT_PRED_REPLACEMENTS == 6 + /* DC TM VE HE CONTEXT */ + 43891, 17694, 10036, 3920, 20000 +#elif CONTEXT_PRED_REPLACEMENTS == 4 + /* DC TM VE HE LD RD CONTEXT */ + 43891, 17694, 10036, 3920, 3363, 2546, 14000 +#elif CONTEXT_PRED_REPLACEMENTS == 0 + /* DC TM VE HE LD RD VR VL HD HU CONTEXT */ + 43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723, 50000 +#endif +#else + /* DC TM VE HE LD RD VR VL HD HU */ + 43891, 17694, 10036, 3920, 3363, 2546, 5119, 3221, 2471, 1723 +#endif +}; + +typedef enum { + SUBMVREF_NORMAL, + SUBMVREF_LEFT_ZED, + SUBMVREF_ABOVE_ZED, + SUBMVREF_LEFT_ABOVE_SAME, + SUBMVREF_LEFT_ABOVE_ZED +} sumvfref_t; + +int vp9_mv_cont(const int_mv *l, const int_mv *a) { + int lez = (l->as_int == 0); + int aez = (a->as_int == 0); + int lea = (l->as_int == a->as_int); + + if (lea && lez) + return SUBMVREF_LEFT_ABOVE_ZED; + + if (lea) + return SUBMVREF_LEFT_ABOVE_SAME; + + if (aez) + return SUBMVREF_ABOVE_ZED; + + if (lez) + return SUBMVREF_LEFT_ZED; + + return SUBMVREF_NORMAL; +} + +const vp9_prob vp9_sub_mv_ref_prob [VP9_SUBMVREFS - 1] = { 180, 162, 25}; + +const vp9_prob vp9_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP9_SUBMVREFS - 1] = { + { 147, 136, 18 }, + { 106, 145, 1 }, + { 179, 121, 1 }, + { 223, 1, 34 }, + { 208, 1, 1 } +}; + +vp9_mbsplit vp9_mbsplits [VP9_NUMMBSPLITS] = { + { + 0, 0, 0, 0, + 0, 0, 0, 0, + 1, 1, 1, 1, + 1, 1, 1, 1, + }, { + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + 0, 0, 1, 1, + }, { + 0, 0, 1, 1, + 0, 0, 1, 1, + 2, 2, 3, 3, + 2, 2, 3, 3, + }, { + 0, 1, 2, 3, + 4, 5, 6, 7, + 8, 9, 10, 11, + 12, 13, 14, 15, + }, +}; + +const int vp9_mbsplit_count [VP9_NUMMBSPLITS] = { 2, 2, 4, 16}; + +const vp9_prob vp9_mbsplit_probs [VP9_NUMMBSPLITS - 1] = { 110, 111, 150}; + +/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. 
*/ + +const vp9_tree_index vp9_kf_bmode_tree[VP9_KF_BINTRAMODES * 2 - 2] = { + -B_DC_PRED, 2, /* 0 = DC_NODE */ + -B_TM_PRED, 4, /* 1 = TM_NODE */ + -B_VE_PRED, 6, /* 2 = VE_NODE */ + 8, 12, /* 3 = COM_NODE */ + -B_HE_PRED, 10, /* 4 = HE_NODE */ + -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ + -B_LD_PRED, 14, /* 6 = LD_NODE */ + -B_VL_PRED, 16, /* 7 = VL_NODE */ + -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */ +}; + +const vp9_tree_index vp9_bmode_tree[VP9_NKF_BINTRAMODES * 2 - 2] = { +#if CONFIG_NEWBINTRAMODES +#if CONTEXT_PRED_REPLACEMENTS == 6 + -B_DC_PRED, 2, + -B_TM_PRED, 4, + 6, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS), + -B_VE_PRED, -B_HE_PRED +#elif CONTEXT_PRED_REPLACEMENTS == 4 + -B_DC_PRED, 2, + -B_TM_PRED, 4, + 6, 8, + -B_VE_PRED, -B_HE_PRED, + 10, -(B_CONTEXT_PRED - CONTEXT_PRED_REPLACEMENTS), + -B_RD_PRED, -B_LD_PRED, +#elif CONTEXT_PRED_REPLACEMENTS == 0 + -B_DC_PRED, 2, /* 0 = DC_NODE */ + -B_TM_PRED, 4, /* 1 = TM_NODE */ + -B_VE_PRED, 6, /* 2 = VE_NODE */ + 8, 12, /* 3 = COM_NODE */ + -B_HE_PRED, 10, /* 4 = HE_NODE */ + -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ + -B_LD_PRED, 14, /* 6 = LD_NODE */ + -B_VL_PRED, 16, /* 7 = VL_NODE */ + -B_HD_PRED, 18, + -B_HU_PRED, -B_CONTEXT_PRED +#endif +#else + -B_DC_PRED, 2, /* 0 = DC_NODE */ + -B_TM_PRED, 4, /* 1 = TM_NODE */ + -B_VE_PRED, 6, /* 2 = VE_NODE */ + 8, 12, /* 3 = COM_NODE */ + -B_HE_PRED, 10, /* 4 = HE_NODE */ + -B_RD_PRED, -B_VR_PRED, /* 5 = RD_NODE */ + -B_LD_PRED, 14, /* 6 = LD_NODE */ + -B_VL_PRED, 16, /* 7 = VL_NODE */ + -B_HD_PRED, -B_HU_PRED /* 8 = HD_NODE */ +#endif +}; + +/* Again, these trees use the same probability indices as their + explicitly-programmed predecessors. */ +const vp9_tree_index vp9_ymode_tree[VP9_YMODES * 2 - 2] = { + 2, 14, + -DC_PRED, 4, + 6, 8, + -D45_PRED, -D135_PRED, + 10, 12, + -D117_PRED, -D153_PRED, + -D27_PRED, -D63_PRED, + 16, 18, + -V_PRED, -H_PRED, + -TM_PRED, 20, + -B_PRED, -I8X8_PRED +}; + +const vp9_tree_index vp9_kf_ymode_tree[VP9_YMODES * 2 - 2] = { + 2, 14, + -DC_PRED, 4, + 6, 8, + -D45_PRED, -D135_PRED, + 10, 12, + -D117_PRED, -D153_PRED, + -D27_PRED, -D63_PRED, + 16, 18, + -V_PRED, -H_PRED, + -TM_PRED, 20, + -B_PRED, -I8X8_PRED +}; + +const vp9_tree_index vp9_i8x8_mode_tree[VP9_I8X8_MODES * 2 - 2] = { + 2, 14, + -DC_PRED, 4, + 6, 8, + -D45_PRED, -D135_PRED, + 10, 12, + -D117_PRED, -D153_PRED, + -D27_PRED, -D63_PRED, + -V_PRED, 16, + -H_PRED, -TM_PRED +}; + +const vp9_tree_index vp9_uv_mode_tree[VP9_UV_MODES * 2 - 2] = { + 2, 14, + -DC_PRED, 4, + 6, 8, + -D45_PRED, -D135_PRED, + 10, 12, + -D117_PRED, -D153_PRED, + -D27_PRED, -D63_PRED, + -V_PRED, 16, + -H_PRED, -TM_PRED +}; + +const vp9_tree_index vp9_mbsplit_tree[6] = { + -PARTITIONING_4X4, 2, + -PARTITIONING_8X8, 4, + -PARTITIONING_16X8, -PARTITIONING_8X16, +}; + +const vp9_tree_index vp9_mv_ref_tree[8] = { + -ZEROMV, 2, + -NEARESTMV, 4, + -NEARMV, 6, + -NEWMV, -SPLITMV +}; + +#if CONFIG_SUPERBLOCKS +const vp9_tree_index vp9_sb_mv_ref_tree[6] = { + -ZEROMV, 2, + -NEARESTMV, 4, + -NEARMV, -NEWMV +}; +#endif + +const vp9_tree_index vp9_sub_mv_ref_tree[6] = { + -LEFT4X4, 2, + -ABOVE4X4, 4, + -ZERO4X4, -NEW4X4 +}; + +struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES]; +struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES]; +struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES]; +#if CONFIG_SUPERBLOCKS +struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES]; +struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES]; +#endif +struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES]; 
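// Editorial aside (a minimal sketch; not part of this patch): how a
// vp9_tree_index table such as those above is walked when decoding a
// symbol. Negative entries are leaves holding the negated symbol value;
// non-negative entries index the next node pair, and node n is coded with
// probability probs[n >> 1]. `read_bool` stands in for the arithmetic
// decoder's boolean read and is an assumed interface here.
//
//   static int read_tree_symbol(const vp9_tree_index *tree,
//                               const vp9_prob *probs,
//                               int (*read_bool)(vp9_prob)) {
//     vp9_tree_index i = 0;
//     while ((i = tree[i + read_bool(probs[i >> 1])]) > 0)
//       continue;
//     return -i;  /* leaf reached: recover the symbol */
//   }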
+struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES]; +struct vp9_token_struct vp9_i8x8_mode_encodings[VP9_I8X8_MODES]; +struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS]; + +struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS]; +#if CONFIG_SUPERBLOCKS +struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS]; +#endif +struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS]; + +void vp9_init_mbmode_probs(VP9_COMMON *x) { + unsigned int bct [VP9_YMODES] [2]; /* num Ymodes > num UV modes */ + + vp9_tree_probs_from_distribution(VP9_YMODES, vp9_ymode_encodings, + vp9_ymode_tree, x->fc.ymode_prob, + bct, y_mode_cts, 256, 1); +#if CONFIG_SUPERBLOCKS + vp9_tree_probs_from_distribution(VP9_I32X32_MODES, vp9_sb_ymode_encodings, + vp9_sb_ymode_tree, x->fc.sb_ymode_prob, + bct, y_mode_cts, 256, 1); +#endif + { + int i; + for (i = 0; i < 8; i++) { + vp9_tree_probs_from_distribution(VP9_YMODES, vp9_kf_ymode_encodings, + vp9_kf_ymode_tree, x->kf_ymode_prob[i], + bct, kf_y_mode_cts[i], 256, 1); +#if CONFIG_SUPERBLOCKS + vp9_tree_probs_from_distribution(VP9_I32X32_MODES, + vp9_sb_kf_ymode_encodings, + vp9_sb_kf_ymode_tree, + x->sb_kf_ymode_prob[i], bct, + kf_y_mode_cts[i], 256, 1); +#endif + } + } + { + int i; + for (i = 0; i < VP9_YMODES; i++) { + vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings, + vp9_uv_mode_tree, x->kf_uv_mode_prob[i], + bct, kf_uv_mode_cts[i], 256, 1); + vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings, + vp9_uv_mode_tree, x->fc.uv_mode_prob[i], + bct, uv_mode_cts[i], 256, 1); + } + } + + vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings, + vp9_i8x8_mode_tree, x->fc.i8x8_mode_prob, + bct, i8x8_mode_cts, 256, 1); + + vpx_memcpy(x->fc.sub_mv_ref_prob, vp9_sub_mv_ref_prob2, + sizeof(vp9_sub_mv_ref_prob2)); + vpx_memcpy(x->fc.mbsplit_prob, vp9_mbsplit_probs, sizeof(vp9_mbsplit_probs)); + vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob, + sizeof(vp9_switchable_interp_prob)); +#if CONFIG_COMP_INTERINTRA_PRED + x->fc.interintra_prob = VP9_DEF_INTERINTRA_PROB; +#endif +} + + +static void intra_bmode_probs_from_distribution( + vp9_prob p[VP9_NKF_BINTRAMODES - 1], + unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2], + const unsigned int events[VP9_NKF_BINTRAMODES]) { + vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings, + vp9_bmode_tree, p, branch_ct, events, 256, 1); +} + +void vp9_default_bmode_probs(vp9_prob p[VP9_NKF_BINTRAMODES - 1]) { + unsigned int branch_ct[VP9_NKF_BINTRAMODES - 1][2]; + intra_bmode_probs_from_distribution(p, branch_ct, bmode_cts); +} + +static void intra_kf_bmode_probs_from_distribution( + vp9_prob p[VP9_KF_BINTRAMODES - 1], + unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2], + const unsigned int events[VP9_KF_BINTRAMODES]) { + vp9_tree_probs_from_distribution(VP9_KF_BINTRAMODES, vp9_kf_bmode_encodings, + vp9_kf_bmode_tree, p, branch_ct, events, 256, 1); +} + +void vp9_kf_default_bmode_probs(vp9_prob p[VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES - 1]) { + unsigned int branch_ct[VP9_KF_BINTRAMODES - 1][2]; + int i, j; + + for (i = 0; i < VP9_KF_BINTRAMODES; ++i) { + for (j = 0; j < VP9_KF_BINTRAMODES; ++j) { + intra_kf_bmode_probs_from_distribution( + p[i][j], branch_ct, vp9_kf_default_bmode_counts[i][j]); + } + } +} + +#if VP9_SWITCHABLE_FILTERS == 3 +const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { + -0, 2, + -1, -2 +}; +struct vp9_token_struct 
vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; +const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = { + EIGHTTAP, SIXTAP, EIGHTTAP_SHARP}; +const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, -1, 0, 2, -1}; +const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1] + [VP9_SWITCHABLE_FILTERS-1] = { + {248, 192}, { 32, 248}, { 32, 32}, {192, 160} +}; +#elif VP9_SWITCHABLE_FILTERS == 2 +const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { + -0, -1, +}; +struct vp9_token_struct vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; +const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1] + [VP9_SWITCHABLE_FILTERS-1] = { + {248}, + { 64}, + {192}, +}; +const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = { + EIGHTTAP, EIGHTTAP_SHARP}; +const int vp9_switchable_interp_map[SWITCHABLE+1] = {-1, -1, 0, 1, -1}; //8, 8s +#endif + +void vp9_entropy_mode_init() { + vp9_tokens_from_tree(vp9_kf_bmode_encodings, vp9_kf_bmode_tree); + vp9_tokens_from_tree(vp9_bmode_encodings, vp9_bmode_tree); + vp9_tokens_from_tree(vp9_ymode_encodings, vp9_ymode_tree); + vp9_tokens_from_tree(vp9_kf_ymode_encodings, vp9_kf_ymode_tree); +#if CONFIG_SUPERBLOCKS + vp9_tokens_from_tree(vp9_sb_ymode_encodings, vp9_sb_ymode_tree); + vp9_tokens_from_tree(vp9_sb_kf_ymode_encodings, vp9_sb_kf_ymode_tree); +#endif + vp9_tokens_from_tree(vp9_uv_mode_encodings, vp9_uv_mode_tree); + vp9_tokens_from_tree(vp9_i8x8_mode_encodings, vp9_i8x8_mode_tree); + vp9_tokens_from_tree(vp9_mbsplit_encodings, vp9_mbsplit_tree); + vp9_tokens_from_tree(vp9_switchable_interp_encodings, + vp9_switchable_interp_tree); + + vp9_tokens_from_tree_offset(vp9_mv_ref_encoding_array, + vp9_mv_ref_tree, NEARESTMV); +#if CONFIG_SUPERBLOCKS + vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array, + vp9_sb_mv_ref_tree, NEARESTMV); +#endif + vp9_tokens_from_tree_offset(vp9_sub_mv_ref_encoding_array, + vp9_sub_mv_ref_tree, LEFT4X4); +} + +void vp9_init_mode_contexts(VP9_COMMON *pc) { + vpx_memset(pc->fc.mv_ref_ct, 0, sizeof(pc->fc.mv_ref_ct)); + + vpx_memcpy(pc->fc.mode_context, + vp9_default_mode_contexts, + sizeof(pc->fc.mode_context)); + vpx_memcpy(pc->fc.mode_context_a, + vp9_default_mode_contexts_a, + sizeof(pc->fc.mode_context_a)); + +} + +void vp9_accum_mv_refs(VP9_COMMON *pc, + MB_PREDICTION_MODE m, + const int context) { + int (*mv_ref_ct)[4][2]; + + mv_ref_ct = pc->fc.mv_ref_ct; + + if (m == ZEROMV) { + ++mv_ref_ct[context][0][0]; + } else { + ++mv_ref_ct[context][0][1]; + if (m == NEARESTMV) { + ++mv_ref_ct[context][1][0]; + } else { + ++mv_ref_ct[context][1][1]; + if (m == NEARMV) { + ++mv_ref_ct[context][2][0]; + } else { + ++mv_ref_ct[context][2][1]; + if (m == NEWMV) { + ++mv_ref_ct[context][3][0]; + } else { + ++mv_ref_ct[context][3][1]; + } + } + } + } +} + +#define MVREF_COUNT_SAT 20 +#define MVREF_MAX_UPDATE_FACTOR 128 +void vp9_update_mode_context(VP9_COMMON *pc) { + int i, j; + int (*mv_ref_ct)[4][2]; + int (*mode_context)[4]; + + if (pc->refresh_alt_ref_frame) { + mode_context = pc->fc.mode_context_a; + } else { + mode_context = pc->fc.mode_context; + } + mv_ref_ct = pc->fc.mv_ref_ct; + + for (j = 0; j < INTER_MODE_CONTEXTS; j++) { + for (i = 0; i < 4; i++) { + int this_prob; + int count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1]; + int factor; + { + this_prob = count > 0 ? 256 * mv_ref_ct[j][i][0] / count : 128; + count = count > MVREF_COUNT_SAT ? 
MVREF_COUNT_SAT : count; + factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT); + this_prob = (pc->fc.vp9_mode_contexts[j][i] * (256 - factor) + + this_prob * factor + 128) >> 8; + mode_context[j][i] = clip_prob(this_prob); + } + } + } +} + +#ifdef MODE_STATS +#include "vp9/common/vp9_modecont.h" +void print_mode_contexts(VP9_COMMON *pc) { + int j, i; + printf("\n====================\n"); + for (j = 0; j < INTER_MODE_CONTEXTS; j++) { + for (i = 0; i < 4; i++) { + printf("%4d ", pc->fc.mode_context[j][i]); + } + printf("\n"); + } + printf("====================\n"); + for (j = 0; j < INTER_MODE_CONTEXTS; j++) { + for (i = 0; i < 4; i++) { + printf("%4d ", pc->fc.mode_context_a[j][i]); + } + printf("\n"); + } +} +#endif + +// #define MODE_COUNT_TESTING +#define MODE_COUNT_SAT 20 +#define MODE_MAX_UPDATE_FACTOR 144 +void vp9_adapt_mode_probs(VP9_COMMON *cm) { + int i, t, count, factor; + unsigned int branch_ct[32][2]; + vp9_prob ymode_probs[VP9_YMODES - 1]; +#if CONFIG_SUPERBLOCKS + vp9_prob sb_ymode_probs[VP9_I32X32_MODES - 1]; +#endif + vp9_prob uvmode_probs[VP9_UV_MODES - 1]; + vp9_prob bmode_probs[VP9_NKF_BINTRAMODES - 1]; + vp9_prob i8x8_mode_probs[VP9_I8X8_MODES - 1]; + vp9_prob sub_mv_ref_probs[VP9_SUBMVREFS - 1]; + vp9_prob mbsplit_probs[VP9_NUMMBSPLITS - 1]; +#if CONFIG_COMP_INTERINTRA_PRED + vp9_prob interintra_prob; +#endif +#ifdef MODE_COUNT_TESTING + printf("static const unsigned int\nymode_counts" + "[VP9_YMODES] = {\n"); + for (t = 0; t < VP9_YMODES; ++t) printf("%d, ", cm->fc.ymode_counts[t]); + printf("};\n"); + printf("static const unsigned int\nuv_mode_counts" + "[VP9_YMODES] [VP9_UV_MODES] = {\n"); + for (i = 0; i < VP9_YMODES; ++i) { + printf(" {"); + for (t = 0; t < VP9_UV_MODES; ++t) printf("%d, ", cm->fc.uv_mode_counts[i][t]); + printf("},\n"); + } + printf("};\n"); + printf("static const unsigned int\nbmode_counts" + "[VP9_NKF_BINTRAMODES] = {\n"); + for (t = 0; t < VP9_NKF_BINTRAMODES; ++t) + printf("%d, ", cm->fc.bmode_counts[t]); + printf("};\n"); + printf("static const unsigned int\ni8x8_mode_counts" + "[VP9_I8X8_MODES] = {\n"); + for (t = 0; t < VP9_I8X8_MODES; ++t) printf("%d, ", cm->fc.i8x8_mode_counts[t]); + printf("};\n"); + printf("static const unsigned int\nsub_mv_ref_counts" + "[SUBMVREF_COUNT] [VP9_SUBMVREFS] = {\n"); + for (i = 0; i < SUBMVREF_COUNT; ++i) { + printf(" {"); + for (t = 0; t < VP9_SUBMVREFS; ++t) printf("%d, ", cm->fc.sub_mv_ref_counts[i][t]); + printf("},\n"); + } + printf("};\n"); + printf("static const unsigned int\nmbsplit_counts" + "[VP9_NUMMBSPLITS] = {\n"); + for (t = 0; t < VP9_NUMMBSPLITS; ++t) printf("%d, ", cm->fc.mbsplit_counts[t]); + printf("};\n"); +#if CONFIG_COMP_INTERINTRA_PRED + printf("static const unsigned int\ninterintra_counts" + "[2] = {\n"); + for (t = 0; t < 2; ++t) printf("%d, ", cm->fc.interintra_counts[t]); + printf("};\n"); +#endif +#endif + vp9_tree_probs_from_distribution( + VP9_YMODES, vp9_ymode_encodings, vp9_ymode_tree, + ymode_probs, branch_ct, cm->fc.ymode_counts, + 256, 1); + for (t = 0; t < VP9_YMODES - 1; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > MODE_COUNT_SAT ? 
MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + prob = ((int)cm->fc.pre_ymode_prob[t] * (256 - factor) + + (int)ymode_probs[t] * factor + 128) >> 8; + cm->fc.ymode_prob[t] = clip_prob(prob); + } +#if CONFIG_SUPERBLOCKS + vp9_tree_probs_from_distribution(VP9_I32X32_MODES, + vp9_sb_ymode_encodings, vp9_sb_ymode_tree, + sb_ymode_probs, branch_ct, + cm->fc.sb_ymode_counts, + 256, 1); + for (t = 0; t < VP9_I32X32_MODES - 1; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + prob = ((int)cm->fc.pre_sb_ymode_prob[t] * (256 - factor) + + (int)sb_ymode_probs[t] * factor + 128) >> 8; + cm->fc.sb_ymode_prob[t] = clip_prob(prob); + } +#endif + for (i = 0; i < VP9_YMODES; ++i) { + vp9_tree_probs_from_distribution(VP9_UV_MODES, vp9_uv_mode_encodings, + vp9_uv_mode_tree, uvmode_probs, branch_ct, + cm->fc.uv_mode_counts[i], 256, 1); + for (t = 0; t < VP9_UV_MODES - 1; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + prob = ((int)cm->fc.pre_uv_mode_prob[i][t] * (256 - factor) + + (int)uvmode_probs[t] * factor + 128) >> 8; + cm->fc.uv_mode_prob[i][t] = clip_prob(prob); + } + } + vp9_tree_probs_from_distribution(VP9_NKF_BINTRAMODES, vp9_bmode_encodings, + vp9_bmode_tree, bmode_probs, branch_ct, + cm->fc.bmode_counts, 256, 1); + for (t = 0; t < VP9_NKF_BINTRAMODES - 1; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + prob = ((int)cm->fc.pre_bmode_prob[t] * (256 - factor) + + (int)bmode_probs[t] * factor + 128) >> 8; + cm->fc.bmode_prob[t] = clip_prob(prob); + } + vp9_tree_probs_from_distribution(VP9_I8X8_MODES, vp9_i8x8_mode_encodings, + vp9_i8x8_mode_tree, i8x8_mode_probs, + branch_ct, cm->fc.i8x8_mode_counts, 256, 1); + for (t = 0; t < VP9_I8X8_MODES - 1; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + prob = ((int)cm->fc.pre_i8x8_mode_prob[t] * (256 - factor) + + (int)i8x8_mode_probs[t] * factor + 128) >> 8; + cm->fc.i8x8_mode_prob[t] = clip_prob(prob); + } + for (i = 0; i < SUBMVREF_COUNT; ++i) { + vp9_tree_probs_from_distribution(VP9_SUBMVREFS, + vp9_sub_mv_ref_encoding_array, + vp9_sub_mv_ref_tree, sub_mv_ref_probs, + branch_ct, cm->fc.sub_mv_ref_counts[i], + 256, 1); + for (t = 0; t < VP9_SUBMVREFS - 1; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + prob = ((int)cm->fc.pre_sub_mv_ref_prob[i][t] * (256 - factor) + + (int)sub_mv_ref_probs[t] * factor + 128) >> 8; + cm->fc.sub_mv_ref_prob[i][t] = clip_prob(prob); + } + } + vp9_tree_probs_from_distribution(VP9_NUMMBSPLITS, vp9_mbsplit_encodings, + vp9_mbsplit_tree, mbsplit_probs, branch_ct, + cm->fc.mbsplit_counts, 256, 1); + for (t = 0; t < VP9_NUMMBSPLITS - 1; ++t) { + int prob; + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > MODE_COUNT_SAT ? 
MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + prob = ((int)cm->fc.pre_mbsplit_prob[t] * (256 - factor) + + (int)mbsplit_probs[t] * factor + 128) >> 8; + cm->fc.mbsplit_prob[t] = clip_prob(prob); + } +#if CONFIG_COMP_INTERINTRA_PRED + if (cm->use_interintra) { + int prob; + interintra_prob = vp9_bin_prob_from_distribution(cm->fc.interintra_counts); + count = cm->fc.interintra_counts[0] + cm->fc.interintra_counts[1]; + count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + prob = ((int)cm->fc.pre_interintra_prob * (256 - factor) + + (int)interintra_prob * factor + 128) >> 8; + if (prob <= 0) + cm->fc.interintra_prob = 1; + else if (prob > 255) + cm->fc.interintra_prob = 255; + else + cm->fc.interintra_prob = prob; + } +#endif +} diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h new file mode 100644 index 000000000..aee6e6c76 --- /dev/null +++ b/vp9/common/vp9_entropymode.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_ENTROPYMODE_H +#define __INC_ENTROPYMODE_H + +#include "vp9_blockd.h" +#include "vp9_treecoder.h" + +#define SUBMVREF_COUNT 5 +#define VP9_NUMMBSPLITS 4 +#if CONFIG_COMP_INTRA_PRED +#define DEFAULT_COMP_INTRA_PROB 32 +#endif + +#if CONFIG_COMP_INTERINTRA_PRED +#define VP9_DEF_INTERINTRA_PROB 248 +#define VP9_UPD_INTERINTRA_PROB 192 +// whether to use a separate uv mode (1) or use the same as the y mode (0) +#define SEPARATE_INTERINTRA_UV 0 +#endif + +typedef const int vp9_mbsplit[16]; + +extern vp9_mbsplit vp9_mbsplits[VP9_NUMMBSPLITS]; + +extern const int vp9_mbsplit_count[VP9_NUMMBSPLITS]; /* # of subsets */ + +extern const vp9_prob vp9_mbsplit_probs[VP9_NUMMBSPLITS - 1]; + +extern int vp9_mv_cont(const int_mv *l, const int_mv *a); + +extern const vp9_prob vp9_sub_mv_ref_prob[VP9_SUBMVREFS - 1]; + +extern const vp9_prob vp9_sub_mv_ref_prob2[SUBMVREF_COUNT][VP9_SUBMVREFS - 1]; + +extern const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES]; + +extern const vp9_tree_index vp9_bmode_tree[]; +extern const vp9_tree_index vp9_kf_bmode_tree[]; + +extern const vp9_tree_index vp9_ymode_tree[]; +extern const vp9_tree_index vp9_kf_ymode_tree[]; +extern const vp9_tree_index vp9_uv_mode_tree[]; +#define vp9_sb_ymode_tree vp9_uv_mode_tree +#define vp9_sb_kf_ymode_tree vp9_uv_mode_tree +extern const vp9_tree_index vp9_i8x8_mode_tree[]; +extern const vp9_tree_index vp9_mbsplit_tree[]; +extern const vp9_tree_index vp9_mv_ref_tree[]; +extern const vp9_tree_index vp9_sb_mv_ref_tree[]; +extern const vp9_tree_index vp9_sub_mv_ref_tree[]; + +extern struct vp9_token_struct vp9_bmode_encodings[VP9_NKF_BINTRAMODES]; +extern struct vp9_token_struct vp9_kf_bmode_encodings[VP9_KF_BINTRAMODES]; +extern struct vp9_token_struct vp9_ymode_encodings[VP9_YMODES]; +extern struct vp9_token_struct vp9_sb_ymode_encodings[VP9_I32X32_MODES]; +extern struct vp9_token_struct vp9_sb_kf_ymode_encodings[VP9_I32X32_MODES]; +extern struct vp9_token_struct vp9_kf_ymode_encodings[VP9_YMODES]; +extern struct vp9_token_struct 
vp9_i8x8_mode_encodings[VP9_I8X8_MODES]; +extern struct vp9_token_struct vp9_uv_mode_encodings[VP9_UV_MODES]; +extern struct vp9_token_struct vp9_mbsplit_encodings[VP9_NUMMBSPLITS]; + +/* Inter mode values do not start at zero */ + +extern struct vp9_token_struct vp9_mv_ref_encoding_array[VP9_MVREFS]; +extern struct vp9_token_struct vp9_sb_mv_ref_encoding_array[VP9_MVREFS]; +extern struct vp9_token_struct vp9_sub_mv_ref_encoding_array[VP9_SUBMVREFS]; + +void vp9_entropy_mode_init(void); + +struct VP9Common; + +void vp9_init_mbmode_probs(struct VP9Common *x); + +extern void vp9_init_mode_contexts(struct VP9Common *pc); + +extern void vp9_update_mode_context(struct VP9Common *pc); + +extern void vp9_accum_mv_refs(struct VP9Common *pc, + MB_PREDICTION_MODE m, + const int context); + +void vp9_default_bmode_probs(vp9_prob dest[VP9_NKF_BINTRAMODES - 1]); + +void vp9_kf_default_bmode_probs(vp9_prob dest[VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES - 1]); + +void vp9_adapt_mode_probs(struct VP9Common *); + +#define VP9_SWITCHABLE_FILTERS 2 /* number of switchable filters */ + +extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp + [VP9_SWITCHABLE_FILTERS]; + +extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; + +extern const vp9_tree_index vp9_switchable_interp_tree + [2 * (VP9_SWITCHABLE_FILTERS - 1)]; + +extern struct vp9_token_struct vp9_switchable_interp_encodings + [VP9_SWITCHABLE_FILTERS]; + +extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] + [VP9_SWITCHABLE_FILTERS - 1]; + +#endif diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c new file mode 100644 index 000000000..8d2ebebc4 --- /dev/null +++ b/vp9/common/vp9_entropymv.c @@ -0,0 +1,469 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vp9_onyxc_int.h" +#include "vp9_entropymv.h" + +//#define MV_COUNT_TESTING + +#define MV_COUNT_SAT 16 +#define MV_MAX_UPDATE_FACTOR 160 + +#if CONFIG_NEW_MVREF +/* Integer pel reference mv threshold for use of high-precision 1/8 mv */ +#define COMPANDED_MVREF_THRESH 1000000 +#else +/* Integer pel reference mv threshold for use of high-precision 1/8 mv */ +#define COMPANDED_MVREF_THRESH 8 +#endif + +/* Smooth or bias the mv-counts before prob computation */ +/* #define SMOOTH_MV_COUNTS */ + +const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = { + -MV_JOINT_ZERO, 2, + -MV_JOINT_HNZVZ, 4, + -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ +}; +struct vp9_token_struct vp9_mv_joint_encodings[MV_JOINTS]; + +const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = { + -MV_CLASS_0, 2, + -MV_CLASS_1, 4, + 6, 8, + -MV_CLASS_2, -MV_CLASS_3, + 10, 12, + -MV_CLASS_4, -MV_CLASS_5, + -MV_CLASS_6, -MV_CLASS_7, +}; +struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES]; + +const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = { + -0, -1, +}; +struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE]; + +const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = { + -0, 2, + -1, 4, + -2, -3 +}; +struct vp9_token_struct vp9_mv_fp_encodings[4]; + +const nmv_context vp9_default_nmv_context = { + {32, 64, 96}, + { + { /* vert component */ + 128, /* sign */ + {224, 144, 192, 168, 192, 176, 192}, /* class */ + {216}, /* class0 */ + {136, 140, 148, 160, 176, 192, 224}, /* bits */ + {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ + {64, 96, 64}, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ + }, + { /* hor component */ + 128, /* sign */ + {216, 128, 176, 160, 176, 176, 192}, /* class */ + {208}, /* class0 */ + {136, 140, 148, 160, 176, 192, 224}, /* bits */ + {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ + {64, 96, 64}, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ + } + }, +}; + +MV_JOINT_TYPE vp9_get_mv_joint(MV mv) { + if (mv.row == 0 && mv.col == 0) return MV_JOINT_ZERO; + else if (mv.row == 0 && mv.col != 0) return MV_JOINT_HNZVZ; + else if (mv.row != 0 && mv.col == 0) return MV_JOINT_HZVNZ; + else return MV_JOINT_HNZVNZ; +} + +#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0) + +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { + MV_CLASS_TYPE c; + if (z < CLASS0_SIZE * 8) c = MV_CLASS_0; + else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1; + else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2; + else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3; + else if (z < CLASS0_SIZE * 128) c = MV_CLASS_4; + else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5; + else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6; + else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7; + else assert(0); + if (offset) + *offset = z - mv_class_base(c); + return c; +} + +int vp9_use_nmv_hp(const MV *ref) { + if ((abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && + (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH) + return 1; + else + return 0; +} + +int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { + return mv_class_base(c) + offset; +} + +static void increment_nmv_component_count(int v, + nmv_component_counts *mvcomp, + int incr, + int usehp) { + assert (v != 0); /* should not be zero */ + mvcomp->mvcount[MV_MAX + v] += incr; +} + +static void increment_nmv_component(int v, + nmv_component_counts *mvcomp, + int incr, + int usehp) { + int s, z, c, o, d, e, f; + assert (v != 0); /* should not be zero */ + s = v < 0; + mvcomp->sign[s] += incr; + z = (s ? 
-v : v) - 1; /* magnitude - 1 */ + + c = vp9_get_mv_class(z, &o); + mvcomp->classes[c] += incr; + + d = (o >> 3); /* int mv data */ + f = (o >> 1) & 3; /* fractional pel mv data */ + e = (o & 1); /* high precision mv data */ + if (c == MV_CLASS_0) { + mvcomp->class0[d] += incr; + } else { + int i, b; + b = c + CLASS0_BITS - 1; /* number of bits */ + for (i = 0; i < b; ++i) + mvcomp->bits[i][((d >> i) & 1)] += incr; + } + + /* Code the fractional pel bits */ + if (c == MV_CLASS_0) { + mvcomp->class0_fp[d][f] += incr; + } else { + mvcomp->fp[f] += incr; + } + + /* Code the high precision bit */ + if (usehp) { + if (c == MV_CLASS_0) { + mvcomp->class0_hp[e] += incr; + } else { + mvcomp->hp[e] += incr; + } + } +} + +#ifdef SMOOTH_MV_COUNTS +static void smooth_counts(nmv_component_counts *mvcomp) { + static const int flen = 3; // (filter_length + 1) / 2 + static const int fval[] = {8, 3, 1}; + static const int fvalbits = 4; + int i; + unsigned int smvcount[MV_VALS]; + vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount)); + smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1; + for (i = flen - 1; i <= MV_VALS - flen; ++i) { + int j, s = smvcount[i] * fval[0]; + for (j = 1; j < flen; ++j) + s += (smvcount[i - j] + smvcount[i + j]) * fval[j]; + mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits; + } +} +#endif + +static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { + int v; + vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount)); + for (v = 1; v <= MV_MAX; v++) { + increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); + increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); + } +} + +void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, + int usehp) { + MV_JOINT_TYPE j = vp9_get_mv_joint(*mv); + mvctx->joints[j]++; + usehp = usehp && vp9_use_nmv_hp(ref); + if (j == MV_JOINT_HZVNZ || j == MV_JOINT_HNZVNZ) { + increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp); + } + if (j == MV_JOINT_HNZVZ || j == MV_JOINT_HNZVNZ) { + increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp); + } +} + +static void adapt_prob(vp9_prob *dest, vp9_prob prep, vp9_prob newp, + unsigned int ct[2]) { + int factor; + int prob; + int count = ct[0] + ct[1]; + if (count) { + count = count > MV_COUNT_SAT ? MV_COUNT_SAT : count; + factor = (MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT); + prob = ((int)prep * (256 - factor) + (int)(newp) * factor + 128) >> 8; + prob += !prob; + prob = (prob > 255 ? 
255 : prob); + *dest = prob; + } +} + +void vp9_counts_process(nmv_context_counts *NMVcount, int usehp) { + counts_to_context(&NMVcount->comps[0], usehp); + counts_to_context(&NMVcount->comps[1], usehp); +} + +void vp9_counts_to_nmv_context( + nmv_context_counts *NMVcount, + nmv_context *prob, + int usehp, + unsigned int (*branch_ct_joint)[2], + unsigned int (*branch_ct_sign)[2], + unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], + unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], + unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], + unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], + unsigned int (*branch_ct_fp)[4 - 1][2], + unsigned int (*branch_ct_class0_hp)[2], + unsigned int (*branch_ct_hp)[2]) { + int i, j, k; + vp9_counts_process(NMVcount, usehp); + vp9_tree_probs_from_distribution(MV_JOINTS, + vp9_mv_joint_encodings, + vp9_mv_joint_tree, + prob->joints, + branch_ct_joint, + NMVcount->joints, + 256, 1); + for (i = 0; i < 2; ++i) { + prob->comps[i].sign = + vp9_bin_prob_from_distribution(NMVcount->comps[i].sign); + branch_ct_sign[i][0] = NMVcount->comps[i].sign[0]; + branch_ct_sign[i][1] = NMVcount->comps[i].sign[1]; + vp9_tree_probs_from_distribution(MV_CLASSES, + vp9_mv_class_encodings, + vp9_mv_class_tree, + prob->comps[i].classes, + branch_ct_classes[i], + NMVcount->comps[i].classes, + 256, 1); + vp9_tree_probs_from_distribution(CLASS0_SIZE, + vp9_mv_class0_encodings, + vp9_mv_class0_tree, + prob->comps[i].class0, + branch_ct_class0[i], + NMVcount->comps[i].class0, + 256, 1); + for (j = 0; j < MV_OFFSET_BITS; ++j) { + prob->comps[i].bits[j] = vp9_bin_prob_from_distribution( + NMVcount->comps[i].bits[j]); + branch_ct_bits[i][j][0] = NMVcount->comps[i].bits[j][0]; + branch_ct_bits[i][j][1] = NMVcount->comps[i].bits[j][1]; + } + } + for (i = 0; i < 2; ++i) { + for (k = 0; k < CLASS0_SIZE; ++k) { + vp9_tree_probs_from_distribution(4, + vp9_mv_fp_encodings, + vp9_mv_fp_tree, + prob->comps[i].class0_fp[k], + branch_ct_class0_fp[i][k], + NMVcount->comps[i].class0_fp[k], + 256, 1); + } + vp9_tree_probs_from_distribution(4, + vp9_mv_fp_encodings, + vp9_mv_fp_tree, + prob->comps[i].fp, + branch_ct_fp[i], + NMVcount->comps[i].fp, + 256, 1); + } + if (usehp) { + for (i = 0; i < 2; ++i) { + prob->comps[i].class0_hp = vp9_bin_prob_from_distribution( + NMVcount->comps[i].class0_hp); + branch_ct_class0_hp[i][0] = NMVcount->comps[i].class0_hp[0]; + branch_ct_class0_hp[i][1] = NMVcount->comps[i].class0_hp[1]; + + prob->comps[i].hp = + vp9_bin_prob_from_distribution(NMVcount->comps[i].hp); + branch_ct_hp[i][0] = NMVcount->comps[i].hp[0]; + branch_ct_hp[i][1] = NMVcount->comps[i].hp[1]; + } + } +} + +void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) { + int i, j, k; + nmv_context prob; + unsigned int branch_ct_joint[MV_JOINTS - 1][2]; + unsigned int branch_ct_sign[2][2]; + unsigned int branch_ct_classes[2][MV_CLASSES - 1][2]; + unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2]; + unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2]; + unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2]; + unsigned int branch_ct_fp[2][4 - 1][2]; + unsigned int branch_ct_class0_hp[2][2]; + unsigned int branch_ct_hp[2][2]; +#ifdef MV_COUNT_TESTING + printf("joints count: "); + for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]); + printf("\n"); fflush(stdout); + printf("signs count:\n"); + for (i = 0; i < 2; ++i) + printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]); + printf("\n"); fflush(stdout); + printf("classes count:\n"); + for (i 
= 0; i < 2; ++i) { + for (j = 0; j < MV_CLASSES; ++j) + printf("%d ", cm->fc.NMVcount.comps[i].classes[j]); + printf("\n"); fflush(stdout); + } + printf("class0 count:\n"); + for (i = 0; i < 2; ++i) { + for (j = 0; j < CLASS0_SIZE; ++j) + printf("%d ", cm->fc.NMVcount.comps[i].class0[j]); + printf("\n"); fflush(stdout); + } + printf("bits count:\n"); + for (i = 0; i < 2; ++i) { + for (j = 0; j < MV_OFFSET_BITS; ++j) + printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0], + cm->fc.NMVcount.comps[i].bits[j][1]); + printf("\n"); fflush(stdout); + } + printf("class0_fp count:\n"); + for (i = 0; i < 2; ++i) { + for (j = 0; j < CLASS0_SIZE; ++j) { + printf("{"); + for (k = 0; k < 4; ++k) + printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]); + printf("}, "); + } + printf("\n"); fflush(stdout); + } + printf("fp count:\n"); + for (i = 0; i < 2; ++i) { + for (j = 0; j < 4; ++j) + printf("%d ", cm->fc.NMVcount.comps[i].fp[j]); + printf("\n"); fflush(stdout); + } + if (usehp) { + printf("class0_hp count:\n"); + for (i = 0; i < 2; ++i) + printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0], + cm->fc.NMVcount.comps[i].class0_hp[1]); + printf("\n"); fflush(stdout); + printf("hp count:\n"); + for (i = 0; i < 2; ++i) + printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0], + cm->fc.NMVcount.comps[i].hp[1]); + printf("\n"); fflush(stdout); + } +#endif +#ifdef SMOOTH_MV_COUNTS + smooth_counts(&cm->fc.NMVcount.comps[0]); + smooth_counts(&cm->fc.NMVcount.comps[1]); +#endif + vp9_counts_to_nmv_context(&cm->fc.NMVcount, + &prob, + usehp, + branch_ct_joint, + branch_ct_sign, + branch_ct_classes, + branch_ct_class0, + branch_ct_bits, + branch_ct_class0_fp, + branch_ct_fp, + branch_ct_class0_hp, + branch_ct_hp); + + for (j = 0; j < MV_JOINTS - 1; ++j) { + adapt_prob(&cm->fc.nmvc.joints[j], + cm->fc.pre_nmvc.joints[j], + prob.joints[j], + branch_ct_joint[j]); + } + for (i = 0; i < 2; ++i) { + adapt_prob(&cm->fc.nmvc.comps[i].sign, + cm->fc.pre_nmvc.comps[i].sign, + prob.comps[i].sign, + branch_ct_sign[i]); + for (j = 0; j < MV_CLASSES - 1; ++j) { + adapt_prob(&cm->fc.nmvc.comps[i].classes[j], + cm->fc.pre_nmvc.comps[i].classes[j], + prob.comps[i].classes[j], + branch_ct_classes[i][j]); + } + for (j = 0; j < CLASS0_SIZE - 1; ++j) { + adapt_prob(&cm->fc.nmvc.comps[i].class0[j], + cm->fc.pre_nmvc.comps[i].class0[j], + prob.comps[i].class0[j], + branch_ct_class0[i][j]); + } + for (j = 0; j < MV_OFFSET_BITS; ++j) { + adapt_prob(&cm->fc.nmvc.comps[i].bits[j], + cm->fc.pre_nmvc.comps[i].bits[j], + prob.comps[i].bits[j], + branch_ct_bits[i][j]); + } + } + for (i = 0; i < 2; ++i) { + for (j = 0; j < CLASS0_SIZE; ++j) { + for (k = 0; k < 3; ++k) { + adapt_prob(&cm->fc.nmvc.comps[i].class0_fp[j][k], + cm->fc.pre_nmvc.comps[i].class0_fp[j][k], + prob.comps[i].class0_fp[j][k], + branch_ct_class0_fp[i][j][k]); + } + } + for (j = 0; j < 3; ++j) { + adapt_prob(&cm->fc.nmvc.comps[i].fp[j], + cm->fc.pre_nmvc.comps[i].fp[j], + prob.comps[i].fp[j], + branch_ct_fp[i][j]); + } + } + if (usehp) { + for (i = 0; i < 2; ++i) { + adapt_prob(&cm->fc.nmvc.comps[i].class0_hp, + cm->fc.pre_nmvc.comps[i].class0_hp, + prob.comps[i].class0_hp, + branch_ct_class0_hp[i]); + adapt_prob(&cm->fc.nmvc.comps[i].hp, + cm->fc.pre_nmvc.comps[i].hp, + prob.comps[i].hp, + branch_ct_hp[i]); + } + } +} + +void vp9_entropy_mv_init() { + vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree); + vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree); + vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree); + 
vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree); +} + +void vp9_init_mv_probs(VP9_COMMON *cm) { + vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context)); +} diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h new file mode 100644 index 000000000..3be95117c --- /dev/null +++ b/vp9/common/vp9_entropymv.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_ENTROPYMV_H +#define __INC_ENTROPYMV_H + +#include "vp9_treecoder.h" +#include "vpx_config.h" +#include "vp9_blockd.h" + +struct VP9Common; + +void vp9_entropy_mv_init(); +void vp9_init_mv_probs(struct VP9Common *cm); + +void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp); +int vp9_use_nmv_hp(const MV *ref); + +#define VP9_NMV_UPDATE_PROB 255 +//#define MV_GROUP_UPDATE + +#define LOW_PRECISION_MV_UPDATE /* Use 7 bit forward update */ + +/* Symbols for coding which components are zero jointly */ +#define MV_JOINTS 4 +typedef enum { + MV_JOINT_ZERO = 0, /* Zero vector */ + MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ + MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ + MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ +} MV_JOINT_TYPE; + +extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2]; +extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS]; + +/* Symbols for coding magnitude class of nonzero components */ +#define MV_CLASSES 8 +typedef enum { + MV_CLASS_0 = 0, /* (0, 2] integer pel */ + MV_CLASS_1 = 1, /* (2, 4] integer pel */ + MV_CLASS_2 = 2, /* (4, 8] integer pel */ + MV_CLASS_3 = 3, /* (8, 16] integer pel */ + MV_CLASS_4 = 4, /* (16, 32] integer pel */ + MV_CLASS_5 = 5, /* (32, 64] integer pel */ + MV_CLASS_6 = 6, /* (64, 128] integer pel */ + MV_CLASS_7 = 7, /* (128, 256] integer pel */ +} MV_CLASS_TYPE; + +extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2]; +extern struct vp9_token_struct vp9_mv_class_encodings [MV_CLASSES]; + +#define CLASS0_BITS 1 /* bits at integer precision for class 0 */ +#define CLASS0_SIZE (1 << CLASS0_BITS) +#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) + +#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) +#define MV_MAX ((1 << MV_MAX_BITS) - 1) +#define MV_VALS ((MV_MAX << 1) + 1) + +extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2]; +extern struct vp9_token_struct vp9_mv_class0_encodings[CLASS0_SIZE]; + +extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2]; +extern struct vp9_token_struct vp9_mv_fp_encodings[4]; + +typedef struct { + vp9_prob sign; + vp9_prob classes[MV_CLASSES - 1]; + vp9_prob class0[CLASS0_SIZE - 1]; + vp9_prob bits[MV_OFFSET_BITS]; + vp9_prob class0_fp[CLASS0_SIZE][4 - 1]; + vp9_prob fp[4 - 1]; + vp9_prob class0_hp; + vp9_prob hp; +} nmv_component; + +typedef struct { + vp9_prob joints[MV_JOINTS - 1]; + nmv_component comps[2]; +} nmv_context; + +MV_JOINT_TYPE vp9_get_mv_joint(MV mv); +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset); +int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset); + + +typedef struct { + unsigned int mvcount[MV_VALS]; + unsigned int sign[2]; + unsigned int classes[MV_CLASSES]; + unsigned int class0[CLASS0_SIZE]; + unsigned int bits[MV_OFFSET_BITS][2]; + 
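/* Editor's note (illustrative, not part of the original patch): the fp/class0_fp arrays below count the four quarter-pel fraction symbols and hp/class0_hp the extra eighth-pel high-precision bit, mirroring the fp/hp probability fields of nmv_component above. */ +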
unsigned int class0_fp[CLASS0_SIZE][4]; + unsigned int fp[4]; + unsigned int class0_hp[2]; + unsigned int hp[2]; +} nmv_component_counts; + +typedef struct { + unsigned int joints[MV_JOINTS]; + nmv_component_counts comps[2]; +} nmv_context_counts; + +void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, + int usehp); +extern const nmv_context vp9_default_nmv_context; +void vp9_counts_to_nmv_context( + nmv_context_counts *NMVcount, + nmv_context *prob, + int usehp, + unsigned int (*branch_ct_joint)[2], + unsigned int (*branch_ct_sign)[2], + unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], + unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], + unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], + unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], + unsigned int (*branch_ct_fp)[4 - 1][2], + unsigned int (*branch_ct_class0_hp)[2], + unsigned int (*branch_ct_hp)[2]); +void vp9_counts_process(nmv_context_counts *NMVcount, int usehp); +#endif diff --git a/vp9/common/vp9_extend.c b/vp9/common/vp9_extend.c new file mode 100644 index 000000000..4f2aa0e0c --- /dev/null +++ b/vp9/common/vp9_extend.c @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9_extend.h" +#include "vpx_mem/vpx_mem.h" + +static void copy_and_extend_plane(unsigned char *s, /* source */ + int sp, /* source pitch */ + unsigned char *d, /* destination */ + int dp, /* destination pitch */ + int h, /* height */ + int w, /* width */ + int et, /* extend top border */ + int el, /* extend left border */ + int eb, /* extend bottom border */ + int er) { /* extend right border */ + int i; + unsigned char *src_ptr1, *src_ptr2; + unsigned char *dest_ptr1, *dest_ptr2; + int linesize; + + /* copy the left and right most columns out */ + src_ptr1 = s; + src_ptr2 = s + w - 1; + dest_ptr1 = d - el; + dest_ptr2 = d + w; + + for (i = 0; i < h; i++) { + vpx_memset(dest_ptr1, src_ptr1[0], el); + vpx_memcpy(dest_ptr1 + el, src_ptr1, w); + vpx_memset(dest_ptr2, src_ptr2[0], er); + src_ptr1 += sp; + src_ptr2 += sp; + dest_ptr1 += dp; + dest_ptr2 += dp; + } + + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = d - el; + src_ptr2 = d + dp * (h - 1) - el; + dest_ptr1 = d + dp * (-et) - el; + dest_ptr2 = d + dp * (h) - el; + linesize = el + er + w; + + for (i = 0; i < et; i++) { + vpx_memcpy(dest_ptr1, src_ptr1, linesize); + dest_ptr1 += dp; + } + + for (i = 0; i < eb; i++) { + vpx_memcpy(dest_ptr2, src_ptr2, linesize); + dest_ptr2 += dp; + } +} + +void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + int et = dst->border; + int el = dst->border; + int eb = dst->border + dst->y_height - src->y_height; + int er = dst->border + dst->y_width - src->y_width; + + copy_and_extend_plane(src->y_buffer, src->y_stride, + dst->y_buffer, dst->y_stride, + src->y_height, src->y_width, + et, el, eb, er); + + et = dst->border >> 1; + el = dst->border >> 1; + eb = (dst->border >> 1) + dst->uv_height - src->uv_height; + er = (dst->border >> 1) + dst->uv_width - src->uv_width; + + copy_and_extend_plane(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, + 
src->uv_height, src->uv_width, + et, el, eb, er); + + copy_and_extend_plane(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, + src->uv_height, src->uv_width, + et, el, eb, er); +} + +void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int srcy, int srcx, + int srch, int srcw) { + int et = dst->border; + int el = dst->border; + int eb = dst->border + dst->y_height - src->y_height; + int er = dst->border + dst->y_width - src->y_width; + int src_y_offset = srcy * src->y_stride + srcx; + int dst_y_offset = srcy * dst->y_stride + srcx; + int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); + int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); + + // If the side is not touching the boundary then don't extend. + if (srcy) + et = 0; + if (srcx) + el = 0; + if (srcy + srch != src->y_height) + eb = 0; + if (srcx + srcw != src->y_width) + er = 0; + + copy_and_extend_plane(src->y_buffer + src_y_offset, + src->y_stride, + dst->y_buffer + dst_y_offset, + dst->y_stride, + srch, srcw, + et, el, eb, er); + + et = (et + 1) >> 1; + el = (el + 1) >> 1; + eb = (eb + 1) >> 1; + er = (er + 1) >> 1; + srch = (srch + 1) >> 1; + srcw = (srcw + 1) >> 1; + + copy_and_extend_plane(src->u_buffer + src_uv_offset, + src->uv_stride, + dst->u_buffer + dst_uv_offset, + dst->uv_stride, + srch, srcw, + et, el, eb, er); + + copy_and_extend_plane(src->v_buffer + src_uv_offset, + src->uv_stride, + dst->v_buffer + dst_uv_offset, + dst->uv_stride, + srch, srcw, + et, el, eb, er); +} + +/* Note: the extension is only for the last row, for intra prediction purposes. */ +void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, + unsigned char *UPtr, unsigned char *VPtr) { + int i; + + YPtr += ybf->y_stride * 14; + UPtr += ybf->uv_stride * 6; + VPtr += ybf->uv_stride * 6; + + for (i = 0; i < 4; i++) { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } + + YPtr += ybf->y_stride; + UPtr += ybf->uv_stride; + VPtr += ybf->uv_stride; + + for (i = 0; i < 4; i++) { + YPtr[i] = YPtr[-1]; + UPtr[i] = UPtr[-1]; + VPtr[i] = VPtr[-1]; + } +} diff --git a/vp9/common/vp9_extend.h b/vp9/common/vp9_extend.h new file mode 100644 index 000000000..c3c590479 --- /dev/null +++ b/vp9/common/vp9_extend.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef __INC_EXTEND_H +#define __INC_EXTEND_H + +#include "vpx_scale/yv12config.h" + +void vp9_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, + unsigned char *UPtr, unsigned char *VPtr); + +void vp9_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); + +void vp9_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int srcy, int srcx, + int srch, int srcw); + +#endif // __INC_EXTEND_H diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c new file mode 100644 index 000000000..8d5eb9333 --- /dev/null +++ b/vp9/common/vp9_filter.c @@ -0,0 +1,1159 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include <stdlib.h> +#include "vp9_filter.h" +#include "vpx_ports/mem.h" +#include "vp9_rtcd.h" + +DECLARE_ALIGNED(16, const short, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = { + { 128, 0 }, + { 120, 8 }, + { 112, 16 }, + { 104, 24 }, + { 96, 32 }, + { 88, 40 }, + { 80, 48 }, + { 72, 56 }, + { 64, 64 }, + { 56, 72 }, + { 48, 80 }, + { 40, 88 }, + { 32, 96 }, + { 24, 104 }, + { 16, 112 }, + { 8, 120 } +}; + +#define FILTER_ALPHA 0 +#define FILTER_ALPHA_SHARP 1 +DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { +#if FILTER_ALPHA == 0 + /* Lagrangian interpolation filter */ + { 0, 0, 0, 128, 0, 0, 0, 0}, + { 0, 1, -5, 126, 8, -3, 1, 0}, + { -1, 3, -10, 122, 18, -6, 2, 0}, + { -1, 4, -13, 118, 27, -9, 3, -1}, + { -1, 4, -16, 112, 37, -11, 4, -1}, + { -1, 5, -18, 105, 48, -14, 4, -1}, + { -1, 5, -19, 97, 58, -16, 5, -1}, + { -1, 6, -19, 88, 68, -18, 5, -1}, + { -1, 6, -19, 78, 78, -19, 6, -1}, + { -1, 5, -18, 68, 88, -19, 6, -1}, + { -1, 5, -16, 58, 97, -19, 5, -1}, + { -1, 4, -14, 48, 105, -18, 5, -1}, + { -1, 4, -11, 37, 112, -16, 4, -1}, + { -1, 3, -9, 27, 118, -13, 4, -1}, + { 0, 2, -6, 18, 122, -10, 3, -1}, + { 0, 1, -3, 8, 126, -5, 1, 0} +#elif FILTER_ALPHA == 50 + /* Generated using MATLAB: + * alpha = 0.5; + * b=intfilt(8,4,alpha); + * bi=round(128*b); + * ba=flipud(reshape([bi 0], 8, 8)); + * disp(num2str(ba, '%d,')) + */ + { 0, 0, 0, 128, 0, 0, 0, 0}, + { 0, 1, -5, 126, 8, -3, 1, 0}, + { 0, 2, -10, 122, 18, -6, 2, 0}, + { -1, 3, -13, 118, 27, -9, 3, 0}, + { -1, 4, -16, 112, 37, -11, 3, 0}, + { -1, 5, -17, 104, 48, -14, 4, -1}, + { -1, 5, -18, 96, 58, -16, 5, -1}, + { -1, 5, -19, 88, 68, -17, 5, -1}, + { -1, 5, -18, 78, 78, -18, 5, -1}, + { -1, 5, -17, 68, 88, -19, 5, -1}, + { -1, 5, -16, 58, 96, -18, 5, -1}, + { -1, 4, -14, 48, 104, -17, 5, -1}, + { 0, 3, -11, 37, 112, -16, 4, -1}, + { 0, 3, -9, 27, 118, -13, 3, -1}, + { 0, 2, -6, 18, 122, -10, 2, 0}, + { 0, 1, -3, 8, 126, -5, 1, 0} +#endif /* FILTER_ALPHA */ +}; + +DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) = { +#if FILTER_ALPHA_SHARP == 1 + /* dct based filter */ + {0, 0, 0, 128, 0, 0, 0, 0}, + {-1, 3, -7, 127, 8, -3, 1, 0}, + {-2, 5, -13, 125, 17, -6, 3, -1}, + {-3, 7, -17, 121, 27, -10, 5, -2}, + {-4, 9, -20, 115, 37, -13, 6, -2}, + {-4, 10, -23, 108, 48, -16, 8, -3}, + {-4, 10, -24, 100, 59, -19, 9, -3}, + {-4, 11, -24, 90, 70, -21, 10, -4}, + {-4, 11, -23, 80, 80, -23, 11, -4}, + {-4, 10, -21, 70, 90, -24, 11, -4}, + {-3, 9, -19, 59, 100, -24, 10, -4}, + {-3, 8, -16, 48, 108, -23, 10, -4}, + {-2, 6, -13, 37, 115, -20, 9, -4}, + {-2, 5, -10, 27, 121, -17, 7, -3}, + {-1, 3, -6, 17, 125, -13, 5, -2}, + {0, 1, -3, 8, 127, -7, 3, -1} +#elif FILTER_ALPHA_SHARP == 75 + /* alpha = 0.75 */ + {0, 0, 0, 128, 0, 0, 0, 0}, + {-1, 2, -6, 126, 9, -3, 2, -1}, + {-1, 4, -11, 123, 18, -7, 3, -1}, + {-2, 6, -16, 119, 28, -10, 5, -2}, + {-2, 7, -19, 113, 38, -13, 6, -2}, + {-3, 8, -21, 106, 49, -16, 7, -2}, + {-3, 9, -22, 99, 59, -19, 8, -3}, + {-3, 9, -23, 90, 70, -21, 9, -3}, + {-3, 9, -22, 80, 80, -22, 9, -3}, + {-3, 9, -21, 70, 90, -23, 9, -3}, + {-3, 8, -19, 59, 99, -22, 9, -3}, + {-2, 7, -16, 49, 106, -21, 8, -3}, + {-2, 6, -13, 38, 113, -19, 7, -2}, + {-2,
5, -10, 28, 119, -16, 6, -2}, + {-1, 3, -7, 18, 123, -11, 4, -1}, + {-1, 2, -3, 9, 126, -6, 2, -1} +#endif /* FILTER_ALPHA_SHARP */ +}; + +DECLARE_ALIGNED(16, const short, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = { + {0, 0, 128, 0, 0, 0}, + {1, -5, 125, 8, -2, 1}, + {1, -8, 122, 17, -5, 1}, + {2, -11, 116, 27, -8, 2}, + {3, -14, 110, 37, -10, 2}, + {3, -15, 103, 47, -12, 2}, + {3, -16, 95, 57, -14, 3}, + {3, -16, 86, 67, -15, 3}, + {3, -16, 77, 77, -16, 3}, + {3, -15, 67, 86, -16, 3}, + {3, -14, 57, 95, -16, 3}, + {2, -12, 47, 103, -15, 3}, + {2, -10, 37, 110, -14, 3}, + {2, -8, 27, 116, -11, 2}, + {1, -5, 17, 122, -8, 1}, + {1, -2, 8, 125, -5, 1} +}; + +static void filter_block2d_first_pass_6(unsigned char *src_ptr, + int *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp9_filter) { + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + + ((int)src_ptr[0] * vp9_filter[2]) + + ((int)src_ptr[pixel_step] * vp9_filter[3]) + + ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + + ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + + (VP9_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP9_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = Temp; + src_ptr++; + } + + /* Next row... */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_width; + } +} + +static void filter_block2d_second_pass_6(int *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp9_filter) { + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + /* Apply filter */ + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + + ((int)src_ptr[0] * vp9_filter[2]) + + ((int)src_ptr[pixel_step] * vp9_filter[3]) + + ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + + ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + + (VP9_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP9_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = (unsigned char)Temp; + src_ptr++; + } + + /* Start next row */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} + +/* + * The only functional difference between filter_block2d_second_pass() + * and this function is that filter_block2d_second_pass() does a sixtap + * filter on the input and stores it in the output. This function + * (filter_block2d_second_pass_avg()) does a sixtap filter on the input, + * and then averages that with the content already present in the output + * ((filter_result + dest + 1) >> 1) and stores that in the output. 
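+ * For example, a filtered value of 101 written over an existing destination + * value of 50 stores (101 + 50 + 1) >> 1 = 76.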
+ */ +static void filter_block2d_second_pass_avg_6(int *src_ptr, + unsigned char *output_ptr, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const short *vp9_filter) { + unsigned int i, j; + int Temp; + + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + /* Apply filter */ + Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + + ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + + ((int)src_ptr[0] * vp9_filter[2]) + + ((int)src_ptr[pixel_step] * vp9_filter[3]) + + ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + + ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + + (VP9_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP9_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[j] = (unsigned char)((output_ptr[j] + Temp + 1) >> 1); + src_ptr++; + } + + /* Start next row */ + src_ptr += src_pixels_per_line - output_width; + output_ptr += output_pitch; + } +} + +#define Interp_Extend 3 +static void filter_block2d_6(unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + int output_pitch, + const short *HFilter, + const short *VFilter) { + int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, + 3 + Interp_Extend * 2, 4, HFilter); + + /* then filter vertically... */ + filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, output_pitch, 4, 4, 4, 4, VFilter); +} + + +void vp9_sixtap_predict_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ + VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ + + filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter); +} + +/* + * The difference between filter_block2d_6() and filter_block2d_avg_6() is + * that filter_block2d_6() does a 6-tap filter and stores it in the output + * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and + * then averages that with the content already present in the output + * ((filter_result + dest + 1) >> 1) and stores that in the output. + */ +static void filter_block2d_avg_6(unsigned char *src_ptr, + unsigned char *output_ptr, + unsigned int src_pixels_per_line, + int output_pitch, + const short *HFilter, + const short *VFilter) { + int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), + FData, src_pixels_per_line, 1, + 3 + Interp_Extend * 2, 4, HFilter); + + /* then filter vertically...
*/ + filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr, + output_pitch, 4, 4, 4, 4, VFilter); +} + +void vp9_sixtap_predict_avg_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ + VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ + + filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, + dst_pitch, HFilter, VFilter); +} + +void vp9_sixtap_predict8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) { + const short *HFilter; + const short *VFilter; + // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */ + int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */ + + HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ + VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, + 7 + Interp_Extend * 2, 8, HFilter); + + + /* then filter vertically... */ + filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter); + +} + +void vp9_sixtap_predict_avg8x8_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) { + const short *HFilter; + const short *VFilter; + // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */ + int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */ + + HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ + VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, + 7 + Interp_Extend * 2, 8, HFilter); + + /* then filter vertically... */ + filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 8, 8, VFilter); +} + +void vp9_sixtap_predict8x4_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) { + const short *HFilter; + const short *VFilter; + // int FData[(7+Interp_Extend*2)*16]; /* Temp data buffer used in filtering */ + int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer used in filtering */ + + HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ + VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, + 3 + Interp_Extend * 2, 8, HFilter); + + + /* then filter vertically...
*/ + filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, dst_pitch, 8, 8, 4, 8, VFilter); + +} + +void vp9_sixtap_predict16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) { + const short *HFilter; + const short *VFilter; + // int FData[(15+Interp_Extend*2)*24]; /* Temp data buffer used in filtering */ + int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */ + + + HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ + VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, src_pixels_per_line, 1, + 15 + Interp_Extend * 2, 16, HFilter); + + /* then filter vertically... */ + filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, 16, 16, 16, 16, VFilter); + +} + +void vp9_sixtap_predict_avg16x16_c +( + unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch +) { + const short *HFilter; + const short *VFilter; + // int FData[(15+Interp_Extend*2)*24]; /* Temp data buffer used in filtering */ + int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer used in filtering */ + + HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ + VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ + + /* First filter 1-D horizontally... */ + filter_block2d_first_pass_6(src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, + src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter); + + /* then filter vertically... */ + filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, dst_pitch, + 16, 16, 16, 16, VFilter); +} + +typedef enum { + VPX_FILTER_4x4 = 0, + VPX_FILTER_8x8 = 1, + VPX_FILTER_8x4 = 2, + VPX_FILTER_16x16 = 3, +} filter_size_t; + +static const unsigned int filter_size_to_wh[][2] = { + {4, 4}, + {8, 8}, + {8, 4}, + {16, 16}, +}; + +static const unsigned int filter_max_height = 16; +static const unsigned int filter_max_width = 16; + +static void filter_block2d_8_c(const unsigned char *src_ptr, + const unsigned int src_stride, + const short *HFilter, + const short *VFilter, + const filter_size_t filter_size, + unsigned char *dst_ptr, + unsigned int dst_stride) { + const unsigned int output_width = filter_size_to_wh[filter_size][0]; + const unsigned int output_height = filter_size_to_wh[filter_size][1]; + + // Between passes, we use an intermediate buffer whose height is extended to + // have enough horizontally filtered values as input for the vertical pass. + // This buffer is allocated to be big enough for the largest block type we + // support. + const int kInterp_Extend = 4; + const unsigned int intermediate_height = + (kInterp_Extend - 1) + output_height + kInterp_Extend; + const unsigned int max_intermediate_height = + (kInterp_Extend - 1) + filter_max_height + kInterp_Extend; +#ifdef _MSC_VER + // MSVC does not support C99 style declaration + unsigned char intermediate_buffer[23 * 16]; +#else + unsigned char intermediate_buffer[max_intermediate_height * filter_max_width]; +#endif + const int intermediate_next_stride = 1 - intermediate_height * output_width; + + // Horizontal pass (src -> transposed intermediate).
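+  // Editor's note (worked numbers, derived from the sizing above): with the + // 8-tap filter, kInterp_Extend is 4, so a 16x16 block needs + // (4 - 1) + 16 + 4 = 23 intermediate rows -- exactly the fixed 23 * 16 + // fallback buffer used for MSVC -- while a 4x4 block fills only + // (4 - 1) + 4 + 4 = 11 rows of it.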
+ { + unsigned char *output_ptr = intermediate_buffer; + const int src_next_row_stride = src_stride - output_width; + unsigned int i, j; + src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); + for (i = 0; i < intermediate_height; i++) { + for (j = 0; j < output_width; j++) { + // Apply filter... + int temp = ((int)src_ptr[0] * HFilter[0]) + + ((int)src_ptr[1] * HFilter[1]) + + ((int)src_ptr[2] * HFilter[2]) + + ((int)src_ptr[3] * HFilter[3]) + + ((int)src_ptr[4] * HFilter[4]) + + ((int)src_ptr[5] * HFilter[5]) + + ((int)src_ptr[6] * HFilter[6]) + + ((int)src_ptr[7] * HFilter[7]) + + (VP9_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255... + temp >>= VP9_FILTER_SHIFT; + if (temp < 0) { + temp = 0; + } else if (temp > 255) { + temp = 255; + } + src_ptr++; + *output_ptr = temp; + output_ptr += intermediate_height; + } + src_ptr += src_next_row_stride; + output_ptr += intermediate_next_stride; + } + } + + // Vertical pass (transposed intermediate -> dst). + { + unsigned char *src_ptr = intermediate_buffer; + const int dst_next_row_stride = dst_stride - output_width; + unsigned int i, j; + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + // Apply filter... + int temp = ((int)src_ptr[0] * VFilter[0]) + + ((int)src_ptr[1] * VFilter[1]) + + ((int)src_ptr[2] * VFilter[2]) + + ((int)src_ptr[3] * VFilter[3]) + + ((int)src_ptr[4] * VFilter[4]) + + ((int)src_ptr[5] * VFilter[5]) + + ((int)src_ptr[6] * VFilter[6]) + + ((int)src_ptr[7] * VFilter[7]) + + (VP9_FILTER_WEIGHT >> 1); // Rounding + + // Normalize back to 0-255... + temp >>= VP9_FILTER_SHIFT; + if (temp < 0) { + temp = 0; + } else if (temp > 255) { + temp = 255; + } + + src_ptr += intermediate_height; + *dst_ptr++ = (unsigned char)temp; + } + src_ptr += intermediate_next_stride; + dst_ptr += dst_next_row_stride; + } + } +} + +void vp9_filter_block2d_4x4_8_c(const unsigned char *src_ptr, + const unsigned int src_stride, + const short *HFilter_aligned16, + const short *VFilter_aligned16, + unsigned char *dst_ptr, + unsigned int dst_stride) { + filter_block2d_8_c(src_ptr, src_stride, + HFilter_aligned16, VFilter_aligned16, + VPX_FILTER_4x4, dst_ptr, dst_stride); +} + +void vp9_filter_block2d_8x4_8_c(const unsigned char *src_ptr, + const unsigned int src_stride, + const short *HFilter_aligned16, + const short *VFilter_aligned16, + unsigned char *dst_ptr, + unsigned int dst_stride) { + filter_block2d_8_c(src_ptr, src_stride, + HFilter_aligned16, VFilter_aligned16, + VPX_FILTER_8x4, dst_ptr, dst_stride); +} + +void vp9_filter_block2d_8x8_8_c(const unsigned char *src_ptr, + const unsigned int src_stride, + const short *HFilter_aligned16, + const short *VFilter_aligned16, + unsigned char *dst_ptr, + unsigned int dst_stride) { + filter_block2d_8_c(src_ptr, src_stride, + HFilter_aligned16, VFilter_aligned16, + VPX_FILTER_8x8, dst_ptr, dst_stride); +} + +void vp9_filter_block2d_16x16_8_c(const unsigned char *src_ptr, + const unsigned int src_stride, + const short *HFilter_aligned16, + const short *VFilter_aligned16, + unsigned char *dst_ptr, + unsigned int dst_stride) { + filter_block2d_8_c(src_ptr, src_stride, + HFilter_aligned16, VFilter_aligned16, + VPX_FILTER_16x16, dst_ptr, dst_stride); +} + +static void block2d_average_c(unsigned char *src, + unsigned int src_stride, + unsigned char *output_ptr, + unsigned int output_stride, + const filter_size_t filter_size) { + const unsigned int output_width = filter_size_to_wh[filter_size][0]; + const unsigned int output_height = 
filter_size_to_wh[filter_size][1]; + + unsigned int i, j; + for (i = 0; i < output_height; i++) { + for (j = 0; j < output_width; j++) { + output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1; + } + output_ptr += output_stride; + } +} + +#define block2d_average block2d_average_c + +void vp9_eighttap_predict_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_sub_pel_filters_8[xoffset]; + VFilter = vp9_sub_pel_filters_8[yoffset]; + + vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); +} + +void vp9_eighttap_predict_avg4x4_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter = vp9_sub_pel_filters_8[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8[yoffset]; + unsigned char tmp[4 * 4]; + + vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + tmp, 4); + block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); +} + +void vp9_eighttap_predict_sharp_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_sub_pel_filters_8s[xoffset]; + VFilter = vp9_sub_pel_filters_8s[yoffset]; + + vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); +} + +void vp9_eighttap_predict_avg4x4_sharp_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter = vp9_sub_pel_filters_8s[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8s[yoffset]; + unsigned char tmp[4 * 4]; + + vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + tmp, 4); + block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); +} + +void vp9_eighttap_predict8x8_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter = vp9_sub_pel_filters_8[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8[yoffset]; + + vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); +} + +void vp9_eighttap_predict8x8_sharp_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter = vp9_sub_pel_filters_8s[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8s[yoffset]; + + vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); +} + +void vp9_eighttap_predict_avg8x8_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char tmp[8 * 8]; + const short *HFilter = vp9_sub_pel_filters_8[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8[yoffset]; + + vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + tmp, 8); + block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); +} + +void vp9_eighttap_predict_avg8x8_sharp_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + unsigned char tmp[8 * 8]; + const short *HFilter = vp9_sub_pel_filters_8s[xoffset]; + const short *VFilter = 
vp9_sub_pel_filters_8s[yoffset]; + + vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + tmp, 8); + block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); +} + +void vp9_eighttap_predict8x4_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter = vp9_sub_pel_filters_8[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8[yoffset]; + + vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); +} + +void vp9_eighttap_predict8x4_sharp_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter = vp9_sub_pel_filters_8s[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8s[yoffset]; + + vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); +} + +void vp9_eighttap_predict16x16_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter = vp9_sub_pel_filters_8[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8[yoffset]; + + vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); +} + +void vp9_eighttap_predict16x16_sharp_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter = vp9_sub_pel_filters_8s[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8s[yoffset]; + + vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + dst_ptr, dst_pitch); +} + +void vp9_eighttap_predict_avg16x16_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16); + const short *HFilter = vp9_sub_pel_filters_8[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8[yoffset]; + + vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + tmp, 16); + block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); +} + +void vp9_eighttap_predict_avg16x16_sharp_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 16 * 16); + const short *HFilter = vp9_sub_pel_filters_8s[xoffset]; + const short *VFilter = vp9_sub_pel_filters_8s[yoffset]; + + vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, + HFilter, VFilter, + tmp, 16); + block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_first_pass + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_stride : Stride of source block. + * UINT32 height : Block height. + * UINT32 width : Block width. + * INT16 *vp9_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : UINT16 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block + * in the horizontal direction to produce the filtered output + * block. Used to implement first-pass of 2-D separable filter. + * + * SPECIAL NOTES : Produces UINT16 output to retain precision for next pass. + * Two filter taps should sum to VP9_FILTER_WEIGHT.
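+ * For example, with taps { 96, 32 } (offset 4 of 16) and neighbouring + * source pixels 10 and 20: (10 * 96 + 20 * 32 + 64) >> 7 = 1664 >> 7 = 13.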
+ * + ****************************************************************************/ +static void filter_block2d_bil_first_pass(unsigned char *src_ptr, + unsigned short *dst_ptr, + unsigned int src_stride, + unsigned int height, + unsigned int width, + const short *vp9_filter) { + unsigned int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + /* Apply bilinear filter */ + dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) + + ((int)src_ptr[1] * vp9_filter[1]) + + (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; + src_ptr++; + } + + /* Next row... */ + src_ptr += src_stride - width; + dst_ptr += width; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil_second_pass + * + * INPUTS : UINT16 *src_ptr : Pointer to source block. + * UINT32 dst_pitch : Destination block pitch. + * UINT32 height : Block height. + * UINT32 width : Block width. + * INT16 *vp9_filter : Array of 2 bi-linear filter taps. + * + * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block + * in the vertical direction to produce the filtered output + * block. Used to implement second-pass of 2-D separable filter. + * + * SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass. + * Two filter taps should sum to VP9_FILTER_WEIGHT. + * + ****************************************************************************/ +static void filter_block2d_bil_second_pass(unsigned short *src_ptr, + unsigned char *dst_ptr, + int dst_pitch, + unsigned int height, + unsigned int width, + const short *vp9_filter) { + unsigned int i, j; + int Temp; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + /* Apply filter */ + Temp = ((int)src_ptr[0] * vp9_filter[0]) + + ((int)src_ptr[width] * vp9_filter[1]) + + (VP9_FILTER_WEIGHT / 2); + dst_ptr[j] = (unsigned int)(Temp >> VP9_FILTER_SHIFT); + src_ptr++; + } + + /* Next row... */ + dst_ptr += dst_pitch; + } +} + +/* + * As before for filter_block2d_second_pass_avg(), the functional difference + * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg() + * is that filter_block2d_bil_second_pass() does a bilinear filter on input + * and stores the result in output; filter_block2d_bil_second_pass_avg(), + * instead, does a bilinear filter on input, averages the resulting value + * with the values already present in the output and stores the result of + * that back into the output ((filter_result + dest + 1) >> 1). + */ +static void filter_block2d_bil_second_pass_avg(unsigned short *src_ptr, + unsigned char *dst_ptr, + int dst_pitch, + unsigned int height, + unsigned int width, + const short *vp9_filter) { + unsigned int i, j; + int Temp; + + for (i = 0; i < height; i++) { + for (j = 0; j < width; j++) { + /* Apply filter */ + Temp = ((int)src_ptr[0] * vp9_filter[0]) + + ((int)src_ptr[width] * vp9_filter[1]) + + (VP9_FILTER_WEIGHT / 2); + dst_ptr[j] = (unsigned int)(((Temp >> VP9_FILTER_SHIFT) + dst_ptr[j] + 1) >> 1); + src_ptr++; + } + + /* Next row... */ + dst_ptr += dst_pitch; + } +} + +/**************************************************************************** + * + * ROUTINE : filter_block2d_bil + * + * INPUTS : UINT8 *src_ptr : Pointer to source block. + * UINT32 src_pitch : Stride of source block. + * UINT32 dst_pitch : Stride of destination block. + * INT16 *HFilter : Array of 2 horizontal filter taps.
+ * INT16 *VFilter : Array of 2 vertical filter taps. + * INT32 Width : Block width + * INT32 Height : Block height + * + * OUTPUTS : UINT8 *dst_ptr : Pointer to filtered block. + * + * RETURNS : void + * + * FUNCTION : 2-D filters an input block by applying a 2-tap + * bi-linear filter horizontally followed by a 2-tap + * bi-linear filter vertically on the result. + * + * SPECIAL NOTES : The largest block size that can be handled here is 16x16 + * + ****************************************************************************/ +static void filter_block2d_bil(unsigned char *src_ptr, + unsigned char *dst_ptr, + unsigned int src_pitch, + unsigned int dst_pitch, + const short *HFilter, + const short *VFilter, + int Width, + int Height) { + + unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); + + /* then 1-D vertically... */ + filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter); +} + +static void filter_block2d_bil_avg(unsigned char *src_ptr, + unsigned char *dst_ptr, + unsigned int src_pitch, + unsigned int dst_pitch, + const short *HFilter, + const short *VFilter, + int Width, + int Height) { + unsigned short FData[17 * 16]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); + + /* then 1-D vertically... */ + filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter); +} + +void vp9_bilinear_predict4x4_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_bilinear_filters[xoffset]; + VFilter = vp9_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); +} + +void vp9_bilinear_predict_avg4x4_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_bilinear_filters[xoffset]; + VFilter = vp9_bilinear_filters[yoffset]; + + filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, + dst_pitch, HFilter, VFilter, 4, 4); +} + +void vp9_bilinear_predict8x8_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_bilinear_filters[xoffset]; + VFilter = vp9_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); + +} + +void vp9_bilinear_predict_avg8x8_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_bilinear_filters[xoffset]; + VFilter = vp9_bilinear_filters[yoffset]; + + filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, + dst_pitch, HFilter, VFilter, 8, 8); +} + +void vp9_bilinear_predict8x4_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_bilinear_filters[xoffset]; + VFilter = vp9_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr,
dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); + +} + +void vp9_bilinear_predict16x16_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_bilinear_filters[xoffset]; + VFilter = vp9_bilinear_filters[yoffset]; + + filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); +} + +void vp9_bilinear_predict_avg16x16_c(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + const short *HFilter; + const short *VFilter; + + HFilter = vp9_bilinear_filters[xoffset]; + VFilter = vp9_bilinear_filters[yoffset]; + + filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, + dst_pitch, HFilter, VFilter, 16, 16); +} diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h new file mode 100644 index 000000000..c194887dc --- /dev/null +++ b/vp9/common/vp9_filter.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef FILTER_H +#define FILTER_H + +#include "vpx_config.h" +#include "vpx_scale/yv12config.h" + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP9_FILTER_WEIGHT 128 +#define VP9_FILTER_SHIFT 7 + +#define SUBPEL_SHIFTS 16 + +extern const short vp9_bilinear_filters[SUBPEL_SHIFTS][2]; +extern const short vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]; +extern const short vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]; +extern const short vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]; + +#endif // FILTER_H diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c new file mode 100644 index 000000000..bfbac4e18 --- /dev/null +++ b/vp9/common/vp9_findnearmv.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9_findnearmv.h" +#include "vp9/common/vp9_sadmxn.h" +#include "vp9/common/vp9_subpelvar.h" +#include <limits.h> + +const unsigned char vp9_mbsplit_offset[4][16] = { + { 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} +}; + +static void lower_mv_precision(int_mv *mv, int usehp) +{ + if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) { + if (mv->as_mv.row & 1) + mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); + if (mv->as_mv.col & 1) + mv->as_mv.col += (mv->as_mv.col > 0 ?
-1 : 1); + } +} + +vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc, + vp9_prob p[4], const int context + ) { + p[0] = pc->fc.vp9_mode_contexts[context][0]; + p[1] = pc->fc.vp9_mode_contexts[context][1]; + p[2] = pc->fc.vp9_mode_contexts[context][2]; + p[3] = pc->fc.vp9_mode_contexts[context][3]; + return p; +} + +#define SP(x) (((x) & 7) << 1) +unsigned int vp9_sad3x16_c(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 3, 16); +} +unsigned int vp9_sad16x3_c(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 3); +} + +#if CONFIG_SUBPELREFMV +unsigned int vp9_variance2x16_c(const unsigned char *src_ptr, + const int source_stride, + const unsigned char *ref_ptr, + const int recon_stride, + unsigned int *sse) { + int sum; + variance(src_ptr, source_stride, ref_ptr, recon_stride, 2, 16, sse, &sum); + return (*sse - (((unsigned int)sum * sum) >> 5)); +} + +unsigned int vp9_variance16x2_c(const unsigned char *src_ptr, + const int source_stride, + const unsigned char *ref_ptr, + const int recon_stride, + unsigned int *sse) { + int sum; + variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 2, sse, &sum); + return (*sse - (((unsigned int)sum * sum) >> 5)); +} + +unsigned int vp9_sub_pixel_variance16x2_c(const unsigned char *src_ptr, + const int src_pixels_per_line, + const int xoffset, + const int yoffset, + const unsigned char *dst_ptr, + const int dst_pixels_per_line, + unsigned int *sse) { + unsigned short FData3[16 * 3]; // Temp data buffer used in filtering + unsigned char temp2[2 * 16]; + const short *HFilter, *VFilter; + + HFilter = vp9_bilinear_filters[xoffset]; + VFilter = vp9_bilinear_filters[yoffset]; + + var_filter_block2d_bil_first_pass(src_ptr, FData3, + src_pixels_per_line, 1, 3, 16, HFilter); + var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 2, 16, VFilter); + + return vp9_variance16x2_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); +} + +unsigned int vp9_sub_pixel_variance2x16_c(const unsigned char *src_ptr, + const int src_pixels_per_line, + const int xoffset, + const int yoffset, + const unsigned char *dst_ptr, + const int dst_pixels_per_line, + unsigned int *sse) { + unsigned short FData3[2 * 17]; // Temp data buffer used in filtering + unsigned char temp2[2 * 16]; + const short *HFilter, *VFilter; + + HFilter = vp9_bilinear_filters[xoffset]; + VFilter = vp9_bilinear_filters[yoffset]; + + var_filter_block2d_bil_first_pass(src_ptr, FData3, + src_pixels_per_line, 1, 17, 2, HFilter); + var_filter_block2d_bil_second_pass(FData3, temp2, 2, 2, 16, 2, VFilter); + + return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse); +} +#endif + +/* check a list of motion vectors by sad score, using a number of rows of + * pixels above and a number of columns of pixels to the left, to select the + * one with the best score to use as the ref motion vector + */ +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, + unsigned char *ref_y_buffer, + int ref_y_stride, + int_mv *mvlist, + int_mv *best_mv, + int_mv *nearest, + int_mv *near) { + int i, j; + unsigned char *above_src; + unsigned char *left_src; + unsigned char *above_ref; + unsigned char *left_ref; + unsigned int score; + unsigned int sse; + unsigned int ref_scores[MAX_MV_REFS] = {0}; + int_mv sorted_mvs[MAX_MV_REFS]; + int zero_seen = FALSE; + + // Default all to 0,0 if nothing else available + best_mv->as_int =
nearest->as_int = near->as_int = 0; + vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs)); + +#if CONFIG_SUBPELREFMV + above_src = xd->dst.y_buffer - xd->dst.y_stride * 2; + left_src = xd->dst.y_buffer - 2; + above_ref = ref_y_buffer - ref_y_stride * 2; + left_ref = ref_y_buffer - 2; +#else + above_src = xd->dst.y_buffer - xd->dst.y_stride * 3; + left_src = xd->dst.y_buffer - 3; + above_ref = ref_y_buffer - ref_y_stride * 3; + left_ref = ref_y_buffer - 3; +#endif + + //for(i = 0; i < MAX_MV_REFS; ++i) { + // Limit search to the predicted best 4 + for(i = 0; i < 4; ++i) { + int_mv this_mv; + int offset = 0; + int row_offset, col_offset; + + this_mv.as_int = mvlist[i].as_int; + + // If we see a 0,0 vector for a second time we have reached the end of + // the list of valid candidate vectors. + if (!this_mv.as_int && zero_seen) + break; + + zero_seen = zero_seen || !this_mv.as_int; + + clamp_mv(&this_mv, + xd->mb_to_left_edge - LEFT_TOP_MARGIN + 24, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN + 24, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); + +#if CONFIG_SUBPELREFMV + row_offset = this_mv.as_mv.row >> 3; + col_offset = this_mv.as_mv.col >> 3; + offset = ref_y_stride * row_offset + col_offset; + score = 0; + if (xd->up_available) { + vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride, + SP(this_mv.as_mv.col), + SP(this_mv.as_mv.row), + above_src, xd->dst.y_stride, &sse); + score += sse; +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + vp9_sub_pixel_variance16x2_c(above_ref + offset + 16, + ref_y_stride, + SP(this_mv.as_mv.col), + SP(this_mv.as_mv.row), + above_src + 16, xd->dst.y_stride, &sse); + score += sse; + } +#endif + } + if (xd->left_available) { + vp9_sub_pixel_variance2x16_c(left_ref + offset, ref_y_stride, + SP(this_mv.as_mv.col), + SP(this_mv.as_mv.row), + left_src, xd->dst.y_stride, &sse); + score += sse; +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + vp9_sub_pixel_variance2x16_c(left_ref + offset + ref_y_stride * 16, + ref_y_stride, + SP(this_mv.as_mv.col), + SP(this_mv.as_mv.row), + left_src + xd->dst.y_stride * 16, + xd->dst.y_stride, &sse); + score += sse; + } +#endif + } +#else + row_offset = (this_mv.as_mv.row > 0) ? + ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3); + col_offset = (this_mv.as_mv.col > 0) ? + ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3); + offset = ref_y_stride * row_offset + col_offset; + score = 0; + if (xd->up_available) { + score += vp9_sad16x3(above_src, xd->dst.y_stride, + above_ref + offset, ref_y_stride); +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + score += vp9_sad16x3(above_src + 16, xd->dst.y_stride, + above_ref + offset + 16, ref_y_stride); + } +#endif + } + if (xd->left_available) { + score += vp9_sad3x16(left_src, xd->dst.y_stride, + left_ref + offset, ref_y_stride); +#if CONFIG_SUPERBLOCKS + if (xd->mode_info_context->mbmi.encoded_as_sb) { + score += vp9_sad3x16(left_src + xd->dst.y_stride * 16, + xd->dst.y_stride, + left_ref + offset + ref_y_stride * 16, + ref_y_stride); + } +#endif + } +#endif + // Add the entry to our list and then resort the list on score. 
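+    // Editor's note (descriptive, not part of the original patch): the + // statements below are one insertion-sort step -- the new candidate + // bubbles toward the front of sorted_mvs[] until it meets an entry with + // a lower score, so the list stays ordered by ascending score as + // candidates arrive.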
+ ref_scores[i] = score; + sorted_mvs[i].as_int = this_mv.as_int; + j = i; + while (j > 0) { + if (ref_scores[j] < ref_scores[j-1]) { + ref_scores[j] = ref_scores[j-1]; + sorted_mvs[j].as_int = sorted_mvs[j-1].as_int; + ref_scores[j-1] = score; + sorted_mvs[j-1].as_int = this_mv.as_int; + j--; + } else + break; + } + } + + // Make sure all the candidates are properly clamped etc. + for (i = 0; i < 4; ++i) { + lower_mv_precision(&sorted_mvs[i], xd->allow_high_precision_mv); + clamp_mv2(&sorted_mvs[i], xd); + } + + // Set the best mv to the first entry in the sorted list + best_mv->as_int = sorted_mvs[0].as_int; + + // Provided that there are non-zero vectors available, there will not + // be more than one 0,0 entry in the sorted list. + // The best ref mv is always set to the first entry (which gave the best + // results). The nearest is set to the first non-zero vector if available, + // and near to the second non-zero vector if available. + // We do not use 0,0 as a nearest or near, as 0,0 has its own mode. + if ( sorted_mvs[0].as_int ) { + nearest->as_int = sorted_mvs[0].as_int; + if ( sorted_mvs[1].as_int ) + near->as_int = sorted_mvs[1].as_int; + else + near->as_int = sorted_mvs[2].as_int; + } else { + nearest->as_int = sorted_mvs[1].as_int; + near->as_int = sorted_mvs[2].as_int; + } + + // Copy back the re-ordered mv list + vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs)); +} diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h new file mode 100644 index 000000000..759bdbaff --- /dev/null +++ b/vp9/common/vp9_findnearmv.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_FINDNEARMV_H +#define __INC_FINDNEARMV_H + +#include "vp9_mv.h" +#include "vp9_blockd.h" +#include "vp9_treecoder.h" +#include "vp9_onyxc_int.h" + +/* check a list of motion vectors by sad score, using a number of rows of + * pixels above and a number of columns of pixels to the left, to select the + * one with the best score to use as the ref motion vector + */ +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, + unsigned char *ref_y_buffer, + int ref_y_stride, + int_mv *mvlist, + int_mv *best_mv, + int_mv *nearest, + int_mv *near); + +static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias) { + MV xmv; + xmv = mvp->as_mv; + + if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) { + xmv.row *= -1; + xmv.col *= -1; + } + + mvp->as_mv = xmv; +} + +#define LEFT_TOP_MARGIN (16 << 3) +#define RIGHT_BOTTOM_MARGIN (16 << 3) + +static void clamp_mv(int_mv *mv, + int mb_to_left_edge, + int mb_to_right_edge, + int mb_to_top_edge, + int mb_to_bottom_edge) { + mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ? + mb_to_left_edge : mv->as_mv.col; + mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ? + mb_to_right_edge : mv->as_mv.col; + mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ? + mb_to_top_edge : mv->as_mv.row; + mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
+ mb_to_bottom_edge : mv->as_mv.row; +} + +static void clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, + xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); +} + +static unsigned int check_mv_bounds(int_mv *mv, + int mb_to_left_edge, + int mb_to_right_edge, + int mb_to_top_edge, + int mb_to_bottom_edge) { + return (mv->as_mv.col < mb_to_left_edge) || + (mv->as_mv.col > mb_to_right_edge) || + (mv->as_mv.row < mb_to_top_edge) || + (mv->as_mv.row > mb_to_bottom_edge); +} + +vp9_prob *vp9_mv_ref_probs(VP9_COMMON *pc, + vp9_prob p[VP9_MVREFS - 1], + const int context); + +extern const unsigned char vp9_mbsplit_offset[4][16]; + +static int left_block_mv(const MODE_INFO *cur_mb, int b) { + if (!(b & 3)) { + /* On L edge, get from MB to left of us */ + --cur_mb; + + if (cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.mv[0].as_int; + b += 4; + } + + return (cur_mb->bmi + b - 1)->as_mv.first.as_int; +} + +static int left_block_second_mv(const MODE_INFO *cur_mb, int b) { + if (!(b & 3)) { + /* On L edge, get from MB to left of us */ + --cur_mb; + + if (cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.second_ref_frame > 0 ? + cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int; + b += 4; + } + + return cur_mb->mbmi.second_ref_frame > 0 ? + (cur_mb->bmi + b - 1)->as_mv.second.as_int : + (cur_mb->bmi + b - 1)->as_mv.first.as_int; +} + +static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { + if (!(b >> 2)) { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + if (cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.mv[0].as_int; + b += 16; + } + + return (cur_mb->bmi + b - 4)->as_mv.first.as_int; +} + +static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { + if (!(b >> 2)) { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + if (cur_mb->mbmi.mode != SPLITMV) + return cur_mb->mbmi.second_ref_frame > 0 ? + cur_mb->mbmi.mv[1].as_int : cur_mb->mbmi.mv[0].as_int; + b += 16; + } + + return cur_mb->mbmi.second_ref_frame > 0 ? 
+           (cur_mb->bmi + b - 4)->as_mv.second.as_int :
+           (cur_mb->bmi + b - 4)->as_mv.first.as_int;
+}
+
+static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
+  if (!(b & 3)) {
+    /* On L edge, get from MB to left of us */
+    --cur_mb;
+
+    if (cur_mb->mbmi.mode < I8X8_PRED) {
+      return pred_mode_conv(cur_mb->mbmi.mode);
+    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
+      return pred_mode_conv((cur_mb->bmi + 3 + b)->as_mode.first);
+    } else if (cur_mb->mbmi.mode == B_PRED) {
+      return ((cur_mb->bmi + 3 + b)->as_mode.first);
+    } else {
+      return B_DC_PRED;
+    }
+  }
+  return (cur_mb->bmi + b - 1)->as_mode.first;
+}
+
+static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
+                                          int b, int mi_stride) {
+  if (!(b >> 2)) {
+    /* On top edge, get from MB above us */
+    cur_mb -= mi_stride;
+
+    if (cur_mb->mbmi.mode < I8X8_PRED) {
+      return pred_mode_conv(cur_mb->mbmi.mode);
+    } else if (cur_mb->mbmi.mode == I8X8_PRED) {
+      return pred_mode_conv((cur_mb->bmi + 12 + b)->as_mode.first);
+    } else if (cur_mb->mbmi.mode == B_PRED) {
+      return ((cur_mb->bmi + 12 + b)->as_mode.first);
+    } else {
+      return B_DC_PRED;
+    }
+  }
+
+  return (cur_mb->bmi + b - 4)->as_mode.first;
+}
+
+#endif
diff --git a/vp9/common/vp9_header.h b/vp9/common/vp9_header.h
new file mode 100644
index 000000000..a88b6e3e3
--- /dev/null
+++ b/vp9/common/vp9_header.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_HEADER_H
+#define __INC_HEADER_H
+
+/* 24 bits total */
+typedef struct {
+  unsigned int type: 1;
+  unsigned int version: 3;
+  unsigned int show_frame: 1;
+
+  /* Allow 2^20 bytes = 8 megabits for first partition */
+
+  unsigned int first_partition_length_in_bytes: 19;
+
+#ifdef PACKET_TESTING
+  unsigned int frame_number;
+  unsigned int update_gold: 1;
+  unsigned int uses_gold: 1;
+  unsigned int update_last: 1;
+  unsigned int uses_last: 1;
+#endif
+
+} VP9_HEADER;
+
+#ifdef PACKET_TESTING
+#define VP9_HEADER_SIZE 8
+#else
+#define VP9_HEADER_SIZE 3
+#endif
+
+
+#endif
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
new file mode 100644
index 000000000..9d559ed22
--- /dev/null
+++ b/vp9/common/vp9_idctllm.c
@@ -0,0 +1,1784 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+ * Notes:
+ *
+ * This implementation makes use of 16 bit fixed point versions of two
+ * multiply constants:
+ *    1. sqrt(2) * cos (pi/8)
+ *    2. sqrt(2) * sin (pi/8)
+ * Because the first constant is bigger than 1, to maintain the same 16 bit
+ * fixed point precision as the second one, we use a trick of
+ *    x * a = x + x*(a-1)
+ * so
+ *    x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
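+ *
+ * Worked through with the actual values: sqrt(2) * cos(pi/8) = 1.30656...,
+ * so only the fractional part is stored in Q16 format:
+ *    round((1.30656... - 1) * 65536) = 20091,
+ * which is cospi8sqrt2minus1 below. The second constant is less than 1 and
+ * fits directly:
+ *    round(sqrt(2) * sin(pi/8) * 65536) = round(0.54120... * 65536) = 35468,
+ * which is sinpi8sqrt2.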
+ **************************************************************************/ +#include +#include +#include "vpx_ports/config.h" +#include "vp9/common/vp9_systemdependent.h" + +#include "vp9/common/vp9_blockd.h" + +static const int cospi8sqrt2minus1 = 20091; +static const int sinpi8sqrt2 = 35468; +static const int rounding = 0; + +// TODO: these transforms can be further converted into integer forms +// for complexity optimization +static const float idct_4[16] = { + 0.500000000000000, 0.653281482438188, 0.500000000000000, 0.270598050073099, + 0.500000000000000, 0.270598050073099, -0.500000000000000, -0.653281482438188, + 0.500000000000000, -0.270598050073099, -0.500000000000000, 0.653281482438188, + 0.500000000000000, -0.653281482438188, 0.500000000000000, -0.270598050073099 +}; + +static const float iadst_4[16] = { + 0.228013428883779, 0.577350269189626, 0.656538502008139, 0.428525073124360, + 0.428525073124360, 0.577350269189626, -0.228013428883779, -0.656538502008139, + 0.577350269189626, 0, -0.577350269189626, 0.577350269189626, + 0.656538502008139, -0.577350269189626, 0.428525073124359, -0.228013428883779 +}; + +static const float idct_8[64] = { + 0.353553390593274, 0.490392640201615, 0.461939766255643, 0.415734806151273, + 0.353553390593274, 0.277785116509801, 0.191341716182545, 0.097545161008064, + 0.353553390593274, 0.415734806151273, 0.191341716182545, -0.097545161008064, + -0.353553390593274, -0.490392640201615, -0.461939766255643, -0.277785116509801, + 0.353553390593274, 0.277785116509801, -0.191341716182545, -0.490392640201615, + -0.353553390593274, 0.097545161008064, 0.461939766255643, 0.415734806151273, + 0.353553390593274, 0.097545161008064, -0.461939766255643, -0.277785116509801, + 0.353553390593274, 0.415734806151273, -0.191341716182545, -0.490392640201615, + 0.353553390593274, -0.097545161008064, -0.461939766255643, 0.277785116509801, + 0.353553390593274, -0.415734806151273, -0.191341716182545, 0.490392640201615, + 0.353553390593274, -0.277785116509801, -0.191341716182545, 0.490392640201615, + -0.353553390593274, -0.097545161008064, 0.461939766255643, -0.415734806151273, + 0.353553390593274, -0.415734806151273, 0.191341716182545, 0.097545161008064, + -0.353553390593274, 0.490392640201615, -0.461939766255643, 0.277785116509801, + 0.353553390593274, -0.490392640201615, 0.461939766255643, -0.415734806151273, + 0.353553390593274, -0.277785116509801, 0.191341716182545, -0.097545161008064 +}; + +static const float iadst_8[64] = { + 0.089131608307533, 0.255357107325376, 0.387095214016349, 0.466553967085785, + 0.483002021635509, 0.434217976756762, 0.326790388032145, 0.175227946595735, + 0.175227946595735, 0.434217976756762, 0.466553967085785, 0.255357107325376, + -0.089131608307533, -0.387095214016348, -0.483002021635509, -0.326790388032145, + 0.255357107325376, 0.483002021635509, 0.175227946595735, -0.326790388032145, + -0.466553967085785, -0.089131608307533, 0.387095214016349, 0.434217976756762, + 0.326790388032145, 0.387095214016349, -0.255357107325376, -0.434217976756762, + 0.175227946595735, 0.466553967085786, -0.089131608307534, -0.483002021635509, + 0.387095214016349, 0.175227946595735, -0.483002021635509, 0.089131608307533, + 0.434217976756762, -0.326790388032145, -0.255357107325377, 0.466553967085785, + 0.434217976756762, -0.089131608307533, -0.326790388032145, 0.483002021635509, + -0.255357107325376, -0.175227946595735, 0.466553967085785, -0.387095214016348, + 0.466553967085785, -0.326790388032145, 0.089131608307533, 0.175227946595735, + -0.387095214016348, 
0.483002021635509, -0.434217976756762, 0.255357107325376, + 0.483002021635509, -0.466553967085785, 0.434217976756762, -0.387095214016348, + 0.326790388032145, -0.255357107325375, 0.175227946595736, -0.089131608307532 +}; + +static const int16_t idct_i4[16] = { + 8192, 10703, 8192, 4433, + 8192, 4433, -8192, -10703, + 8192, -4433, -8192, 10703, + 8192, -10703, 8192, -4433 +}; + +static const int16_t iadst_i4[16] = { + 3736, 9459, 10757, 7021, + 7021, 9459, -3736, -10757, + 9459, 0, -9459, 9459, + 10757, -9459, 7021, -3736 +}; + +static const int16_t idct_i8[64] = { + 5793, 8035, 7568, 6811, + 5793, 4551, 3135, 1598, + 5793, 6811, 3135, -1598, + -5793, -8035, -7568, -4551, + 5793, 4551, -3135, -8035, + -5793, 1598, 7568, 6811, + 5793, 1598, -7568, -4551, + 5793, 6811, -3135, -8035, + 5793, -1598, -7568, 4551, + 5793, -6811, -3135, 8035, + 5793, -4551, -3135, 8035, + -5793, -1598, 7568, -6811, + 5793, -6811, 3135, 1598, + -5793, 8035, -7568, 4551, + 5793, -8035, 7568, -6811, + 5793, -4551, 3135, -1598 +}; + +static const int16_t iadst_i8[64] = { + 1460, 4184, 6342, 7644, + 7914, 7114, 5354, 2871, + 2871, 7114, 7644, 4184, + -1460, -6342, -7914, -5354, + 4184, 7914, 2871, -5354, + -7644, -1460, 6342, 7114, + 5354, 6342, -4184, -7114, + 2871, 7644, -1460, -7914, + 6342, 2871, -7914, 1460, + 7114, -5354, -4184, 7644, + 7114, -1460, -5354, 7914, + -4184, -2871, 7644, -6342, + 7644, -5354, 1460, 2871, + -6342, 7914, -7114, 4184, + 7914, -7644, 7114, -6342, + 5354, -4184, 2871, -1460 +}; + +static float idct_16[256] = { + 0.250000, 0.351851, 0.346760, 0.338330, 0.326641, 0.311806, 0.293969, 0.273300, + 0.250000, 0.224292, 0.196424, 0.166664, 0.135299, 0.102631, 0.068975, 0.034654, + 0.250000, 0.338330, 0.293969, 0.224292, 0.135299, 0.034654, -0.068975, -0.166664, + -0.250000, -0.311806, -0.346760, -0.351851, -0.326641, -0.273300, -0.196424, -0.102631, + 0.250000, 0.311806, 0.196424, 0.034654, -0.135299, -0.273300, -0.346760, -0.338330, + -0.250000, -0.102631, 0.068975, 0.224292, 0.326641, 0.351851, 0.293969, 0.166664, + 0.250000, 0.273300, 0.068975, -0.166664, -0.326641, -0.338330, -0.196424, 0.034654, + 0.250000, 0.351851, 0.293969, 0.102631, -0.135299, -0.311806, -0.346760, -0.224292, + 0.250000, 0.224292, -0.068975, -0.311806, -0.326641, -0.102631, 0.196424, 0.351851, + 0.250000, -0.034654, -0.293969, -0.338330, -0.135299, 0.166664, 0.346760, 0.273300, + 0.250000, 0.166664, -0.196424, -0.351851, -0.135299, 0.224292, 0.346760, 0.102631, + -0.250000, -0.338330, -0.068975, 0.273300, 0.326641, 0.034654, -0.293969, -0.311806, + 0.250000, 0.102631, -0.293969, -0.273300, 0.135299, 0.351851, 0.068975, -0.311806, + -0.250000, 0.166664, 0.346760, 0.034654, -0.326641, -0.224292, 0.196424, 0.338330, + 0.250000, 0.034654, -0.346760, -0.102631, 0.326641, 0.166664, -0.293969, -0.224292, + 0.250000, 0.273300, -0.196424, -0.311806, 0.135299, 0.338330, -0.068975, -0.351851, + 0.250000, -0.034654, -0.346760, 0.102631, 0.326641, -0.166664, -0.293969, 0.224292, + 0.250000, -0.273300, -0.196424, 0.311806, 0.135299, -0.338330, -0.068975, 0.351851, + 0.250000, -0.102631, -0.293969, 0.273300, 0.135299, -0.351851, 0.068975, 0.311806, + -0.250000, -0.166664, 0.346760, -0.034654, -0.326641, 0.224292, 0.196424, -0.338330, + 0.250000, -0.166664, -0.196424, 0.351851, -0.135299, -0.224292, 0.346760, -0.102631, + -0.250000, 0.338330, -0.068975, -0.273300, 0.326641, -0.034654, -0.293969, 0.311806, + 0.250000, -0.224292, -0.068975, 0.311806, -0.326641, 0.102631, 0.196424, -0.351851, + 0.250000, 0.034654, -0.293969, 0.338330, 
-0.135299, -0.166664, 0.346760, -0.273300, + 0.250000, -0.273300, 0.068975, 0.166664, -0.326641, 0.338330, -0.196424, -0.034654, + 0.250000, -0.351851, 0.293969, -0.102631, -0.135299, 0.311806, -0.346760, 0.224292, + 0.250000, -0.311806, 0.196424, -0.034654, -0.135299, 0.273300, -0.346760, 0.338330, + -0.250000, 0.102631, 0.068975, -0.224292, 0.326641, -0.351851, 0.293969, -0.166664, + 0.250000, -0.338330, 0.293969, -0.224292, 0.135299, -0.034654, -0.068975, 0.166664, + -0.250000, 0.311806, -0.346760, 0.351851, -0.326641, 0.273300, -0.196424, 0.102631, + 0.250000, -0.351851, 0.346760, -0.338330, 0.326641, -0.311806, 0.293969, -0.273300, + 0.250000, -0.224292, 0.196424, -0.166664, 0.135299, -0.102631, 0.068975, -0.034654 +}; + +static float iadst_16[256] = { + 0.033094, 0.098087, 0.159534, 0.215215, 0.263118, 0.301511, 0.329007, 0.344612, + 0.347761, 0.338341, 0.316693, 0.283599, 0.240255, 0.188227, 0.129396, 0.065889, + 0.065889, 0.188227, 0.283599, 0.338341, 0.344612, 0.301511, 0.215215, 0.098087, + -0.033094, -0.159534, -0.263118, -0.329007, -0.347761, -0.316693, -0.240255, -0.129396, + 0.098087, 0.263118, 0.344612, 0.316693, 0.188227, 0.000000, -0.188227, -0.316693, + -0.344612, -0.263118, -0.098087, 0.098087, 0.263118, 0.344612, 0.316693, 0.188227, + 0.129396, 0.316693, 0.329007, 0.159534, -0.098087, -0.301511, -0.338341, -0.188227, + 0.065889, 0.283599, 0.344612, 0.215215, -0.033094, -0.263118, -0.347761, -0.240255, + 0.159534, 0.344612, 0.240255, -0.065889, -0.316693, -0.301511, -0.033094, 0.263118, + 0.338341, 0.129396, -0.188227, -0.347761, -0.215215, 0.098087, 0.329007, 0.283599, + 0.188227, 0.344612, 0.098087, -0.263118, -0.316693, -0.000000, 0.316693, 0.263118, + -0.098087, -0.344612, -0.188227, 0.188227, 0.344612, 0.098087, -0.263118, -0.316693, + 0.215215, 0.316693, -0.065889, -0.347761, -0.098087, 0.301511, 0.240255, -0.188227, + -0.329007, 0.033094, 0.344612, 0.129396, -0.283599, -0.263118, 0.159534, 0.338341, + 0.240255, 0.263118, -0.215215, -0.283599, 0.188227, 0.301511, -0.159534, -0.316693, + 0.129396, 0.329007, -0.098087, -0.338341, 0.065889, 0.344612, -0.033094, -0.347761, + 0.263118, 0.188227, -0.316693, -0.098087, 0.344612, 0.000000, -0.344612, 0.098087, + 0.316693, -0.188227, -0.263118, 0.263118, 0.188227, -0.316693, -0.098087, 0.344612, + 0.283599, 0.098087, -0.347761, 0.129396, 0.263118, -0.301511, -0.065889, 0.344612, + -0.159534, -0.240255, 0.316693, 0.033094, -0.338341, 0.188227, 0.215215, -0.329007, + 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, + -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, 0.000000, -0.301511, 0.301511, + 0.316693, -0.098087, -0.188227, 0.344612, -0.263118, -0.000000, 0.263118, -0.344612, + 0.188227, 0.098087, -0.316693, 0.316693, -0.098087, -0.188227, 0.344612, -0.263118, + 0.329007, -0.188227, -0.033094, 0.240255, -0.344612, 0.301511, -0.129396, -0.098087, + 0.283599, -0.347761, 0.263118, -0.065889, -0.159534, 0.316693, -0.338341, 0.215215, + 0.338341, -0.263118, 0.129396, 0.033094, -0.188227, 0.301511, -0.347761, 0.316693, + -0.215215, 0.065889, 0.098087, -0.240255, 0.329007, -0.344612, 0.283599, -0.159534, + 0.344612, -0.316693, 0.263118, -0.188227, 0.098087, 0.000000, -0.098087, 0.188227, + -0.263118, 0.316693, -0.344612, 0.344612, -0.316693, 0.263118, -0.188227, 0.098087, + 0.347761, -0.344612, 0.338341, -0.329007, 0.316693, -0.301511, 0.283599, -0.263118, + 0.240255, -0.215215, 0.188227, -0.159534, 0.129396, -0.098087, 0.065889, -0.033094 +}; + +static const int16_t idct_i16[256] = { + 4096, 
5765, 5681, 5543, 5352, 5109, 4816, 4478, + 4096, 3675, 3218, 2731, 2217, 1682, 1130, 568, + 4096, 5543, 4816, 3675, 2217, 568, -1130, -2731, + -4096, -5109, -5681, -5765, -5352, -4478, -3218, -1682, + 4096, 5109, 3218, 568, -2217, -4478, -5681, -5543, + -4096, -1682, 1130, 3675, 5352, 5765, 4816, 2731, + 4096, 4478, 1130, -2731, -5352, -5543, -3218, 568, + 4096, 5765, 4816, 1682, -2217, -5109, -5681, -3675, + 4096, 3675, -1130, -5109, -5352, -1682, 3218, 5765, + 4096, -568, -4816, -5543, -2217, 2731, 5681, 4478, + 4096, 2731, -3218, -5765, -2217, 3675, 5681, 1682, + -4096, -5543, -1130, 4478, 5352, 568, -4816, -5109, + 4096, 1682, -4816, -4478, 2217, 5765, 1130, -5109, + -4096, 2731, 5681, 568, -5352, -3675, 3218, 5543, + 4096, 568, -5681, -1682, 5352, 2731, -4816, -3675, + 4096, 4478, -3218, -5109, 2217, 5543, -1130, -5765, + 4096, -568, -5681, 1682, 5352, -2731, -4816, 3675, + 4096, -4478, -3218, 5109, 2217, -5543, -1130, 5765, + 4096, -1682, -4816, 4478, 2217, -5765, 1130, 5109, + -4096, -2731, 5681, -568, -5352, 3675, 3218, -5543, + 4096, -2731, -3218, 5765, -2217, -3675, 5681, -1682, + -4096, 5543, -1130, -4478, 5352, -568, -4816, 5109, + 4096, -3675, -1130, 5109, -5352, 1682, 3218, -5765, + 4096, 568, -4816, 5543, -2217, -2731, 5681, -4478, + 4096, -4478, 1130, 2731, -5352, 5543, -3218, -568, + 4096, -5765, 4816, -1682, -2217, 5109, -5681, 3675, + 4096, -5109, 3218, -568, -2217, 4478, -5681, 5543, + -4096, 1682, 1130, -3675, 5352, -5765, 4816, -2731, + 4096, -5543, 4816, -3675, 2217, -568, -1130, 2731, + -4096, 5109, -5681, 5765, -5352, 4478, -3218, 1682, + 4096, -5765, 5681, -5543, 5352, -5109, 4816, -4478, + 4096, -3675, 3218, -2731, 2217, -1682, 1130, -568 +}; + +static const int16_t iadst_i16[256] = { + 542, 1607, 2614, 3526, 4311, 4940, 5390, 5646, + 5698, 5543, 5189, 4646, 3936, 3084, 2120, 1080, + 1080, 3084, 4646, 5543, 5646, 4940, 3526, 1607, + -542, -2614, -4311, -5390, -5698, -5189, -3936, -2120, + 1607, 4311, 5646, 5189, 3084, 0, -3084, -5189, + -5646, -4311, -1607, 1607, 4311, 5646, 5189, 3084, + 2120, 5189, 5390, 2614, -1607, -4940, -5543, -3084, + 1080, 4646, 5646, 3526, -542, -4311, -5698, -3936, + 2614, 5646, 3936, -1080, -5189, -4940, -542, 4311, + 5543, 2120, -3084, -5698, -3526, 1607, 5390, 4646, + 3084, 5646, 1607, -4311, -5189, 0, 5189, 4311, + -1607, -5646, -3084, 3084, 5646, 1607, -4311, -5189, + 3526, 5189, -1080, -5698, -1607, 4940, 3936, -3084, + -5390, 542, 5646, 2120, -4646, -4311, 2614, 5543, + 3936, 4311, -3526, -4646, 3084, 4940, -2614, -5189, + 2120, 5390, -1607, -5543, 1080, 5646, -542, -5698, + 4311, 3084, -5189, -1607, 5646, 0, -5646, 1607, + 5189, -3084, -4311, 4311, 3084, -5189, -1607, 5646, + 4646, 1607, -5698, 2120, 4311, -4940, -1080, 5646, + -2614, -3936, 5189, 542, -5543, 3084, 3526, -5390, + 4940, 0, -4940, 4940, 0, -4940, 4940, 0, + -4940, 4940, 0, -4940, 4940, 0, -4940, 4940, + 5189, -1607, -3084, 5646, -4311, 0, 4311, -5646, + 3084, 1607, -5189, 5189, -1607, -3084, 5646, -4311, + 5390, -3084, -542, 3936, -5646, 4940, -2120, -1607, + 4646, -5698, 4311, -1080, -2614, 5189, -5543, 3526, + 5543, -4311, 2120, 542, -3084, 4940, -5698, 5189, + -3526, 1080, 1607, -3936, 5390, -5646, 4646, -2614, + 5646, -5189, 4311, -3084, 1607, 0, -1607, 3084, + -4311, 5189, -5646, 5646, -5189, 4311, -3084, 1607, + 5698, -5646, 5543, -5390, 5189, -4940, 4646, -4311, + 3936, -3526, 3084, -2614, 2120, -1607, 1080, -542 +}; + +/* For test */ +#define TEST_INT 1 +#if TEST_INT +#define vp9_ihtllm_int_c vp9_ihtllm_c +#else +#define vp9_ihtllm_float_c vp9_ihtllm_c 
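Whichever branch of the TEST_INT switch is active renames that implementation to the public vp9_ihtllm_c entry point at preprocessing time; the #endif just below closes the switch. A standalone illustration of this compile-time aliasing pattern, with all names hypothetical:

```c
#include <stdio.h>

#define USE_INT 1
#if USE_INT
#define transform_int transform     /* int definition below is compiled as transform() */
#else
#define transform_float transform   /* float definition would take the name instead */
#endif

static void transform_int(void)   { puts("integer path"); }
static void transform_float(void) { puts("float path"); }

int main(void) {
  transform();  /* resolves to whichever definition took the public name */
  return 0;
}
```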
+#endif + +void vp9_ihtllm_float_c(const int16_t *input, int16_t *output, int pitch, + TX_TYPE tx_type, int tx_dim) { + vp9_clear_system_state(); // Make it simd safe : __asm emms; + { + int i, j, k; + float bufa[256], bufb[256]; // buffers are for floating-point test purpose + // the implementation could be simplified in + // conjunction with integer transform + const int16_t *ip = input; + int16_t *op = output; + int shortpitch = pitch >> 1; + + float *pfa = &bufa[0]; + float *pfb = &bufb[0]; + + // pointers to vertical and horizontal transforms + const float *ptv, *pth; + + assert(tx_type != DCT_DCT); + // load and convert residual array into floating-point + for(j = 0; j < tx_dim; j++) { + for(i = 0; i < tx_dim; i++) { + pfa[i] = (float)ip[i]; + } + pfa += tx_dim; + ip += tx_dim; + } + + // vertical transformation + pfa = &bufa[0]; + pfb = &bufb[0]; + + switch(tx_type) { + case ADST_ADST : + case ADST_DCT : + ptv = (tx_dim == 4) ? &iadst_4[0] : + ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]); + break; + + default : + ptv = (tx_dim == 4) ? &idct_4[0] : + ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]); + break; + } + + for(j = 0; j < tx_dim; j++) { + for(i = 0; i < tx_dim; i++) { + pfb[i] = 0 ; + for(k = 0; k < tx_dim; k++) { + pfb[i] += ptv[k] * pfa[(k * tx_dim)]; + } + pfa += 1; + } + + pfb += tx_dim; + ptv += tx_dim; + pfa = &bufa[0]; + } + + // horizontal transformation + pfa = &bufa[0]; + pfb = &bufb[0]; + + switch(tx_type) { + case ADST_ADST : + case DCT_ADST : + pth = (tx_dim == 4) ? &iadst_4[0] : + ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]); + break; + + default : + pth = (tx_dim == 4) ? &idct_4[0] : + ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]); + break; + } + + for(j = 0; j < tx_dim; j++) { + for(i = 0; i < tx_dim; i++) { + pfa[i] = 0; + for(k = 0; k < tx_dim; k++) { + pfa[i] += pfb[k] * pth[k]; + } + pth += tx_dim; + } + + pfa += tx_dim; + pfb += tx_dim; + + switch(tx_type) { + case ADST_ADST : + case DCT_ADST : + pth = (tx_dim == 4) ? &iadst_4[0] : + ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]); + break; + + default : + pth = (tx_dim == 4) ? &idct_4[0] : + ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]); + break; + } + } + + // convert to short integer format and load BLOCKD buffer + op = output; + pfa = &bufa[0]; + + for(j = 0; j < tx_dim; j++) { + for(i = 0; i < tx_dim; i++) { + op[i] = (pfa[i] > 0 ) ? (int16_t)( pfa[i] / 8 + 0.49) : + -(int16_t)( - pfa[i] / 8 + 0.49); + } + + op += shortpitch; + pfa += tx_dim; + } + } + vp9_clear_system_state(); // Make it simd safe : __asm emms; +} + +/* Converted the transforms to integer form. */ +#define VERTICAL_SHIFT 14 // 16 +#define VERTICAL_ROUNDING ((1 << (VERTICAL_SHIFT - 1)) - 1) +#define HORIZONTAL_SHIFT 17 // 15 +#define HORIZONTAL_ROUNDING ((1 << (HORIZONTAL_SHIFT - 1)) - 1) +void vp9_ihtllm_int_c(const int16_t *input, int16_t *output, int pitch, + TX_TYPE tx_type, int tx_dim) { + int i, j, k; + int16_t imbuf[256]; + + const int16_t *ip = input; + int16_t *op = output; + int16_t *im = &imbuf[0]; + + /* pointers to vertical and horizontal transforms. */ + const int16_t *ptv = NULL, *pth = NULL; + int shortpitch = pitch >> 1; + + switch (tx_type) { + case ADST_ADST : + ptv = pth = (tx_dim == 4) ? &iadst_i4[0] + : ((tx_dim == 8) ? &iadst_i8[0] + : &iadst_i16[0]); + break; + case ADST_DCT : + ptv = (tx_dim == 4) ? &iadst_i4[0] + : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]); + pth = (tx_dim == 4) ? &idct_i4[0] + : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]); + break; + case DCT_ADST : + ptv = (tx_dim == 4) ? 
&idct_i4[0] + : ((tx_dim == 8) ? &idct_i8[0] : &idct_i16[0]); + pth = (tx_dim == 4) ? &iadst_i4[0] + : ((tx_dim == 8) ? &iadst_i8[0] : &iadst_i16[0]); + break; + case DCT_DCT : + ptv = pth = (tx_dim == 4) ? &idct_i4[0] + : ((tx_dim == 8) ? &idct_i8[0] + : &idct_i16[0]); + break; + default: + assert(0); + break; + } + + /* vertical transformation */ + for (j = 0; j < tx_dim; j++) { + for (i = 0; i < tx_dim; i++) { + int temp = 0; + + for (k = 0; k < tx_dim; k++) { + temp += ptv[k] * ip[(k * tx_dim)]; + } + + im[i] = (int16_t)((temp + VERTICAL_ROUNDING) >> VERTICAL_SHIFT); + ip++; + } + im += tx_dim; // 16 + ptv += tx_dim; + ip = input; + } + + /* horizontal transformation */ + im = &imbuf[0]; + + for (j = 0; j < tx_dim; j++) { + const int16_t *pthc = pth; + + for (i = 0; i < tx_dim; i++) { + int temp = 0; + + for (k = 0; k < tx_dim; k++) { + temp += im[k] * pthc[k]; + } + + op[i] = (int16_t)((temp + HORIZONTAL_ROUNDING) >> HORIZONTAL_SHIFT); + pthc += tx_dim; + } + + im += tx_dim; // 16 + op += shortpitch; + } +} + +void vp9_short_idct4x4llm_c(short *input, short *output, int pitch) { + int i; + int a1, b1, c1, d1; + + short *ip = input; + short *op = output; + int temp1, temp2; + int shortpitch = pitch >> 1; + + for (i = 0; i < 4; i++) { + a1 = ip[0] + ip[8]; + b1 = ip[0] - ip[8]; + + temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16; + temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16); + temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16; + d1 = temp1 + temp2; + + op[shortpitch * 0] = a1 + d1; + op[shortpitch * 3] = a1 - d1; + + op[shortpitch * 1] = b1 + c1; + op[shortpitch * 2] = b1 - c1; + + ip++; + op++; + } + + ip = output; + op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0] + ip[2]; + b1 = ip[0] - ip[2]; + + temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16; + temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16); + c1 = temp1 - temp2; + + temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16); + temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16; + d1 = temp1 + temp2; + + op[0] = (a1 + d1 + 16) >> 5; + op[3] = (a1 - d1 + 16) >> 5; + + op[1] = (b1 + c1 + 16) >> 5; + op[2] = (b1 - c1 + 16) >> 5; + + ip += shortpitch; + op += shortpitch; + } +} + +void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch) { + int i; + int a1; + short *op = output; + int shortpitch = pitch >> 1; + a1 = ((input[0] + 16) >> 5); + for (i = 0; i < 4; i++) { + op[0] = a1; + op[1] = a1; + op[2] = a1; + op[3] = a1; + op += shortpitch; + } +} + +void vp9_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, + unsigned char *dst_ptr, int pitch, int stride) { + int a1 = ((input_dc + 16) >> 5); + int r, c; + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) { + int a = a1 + pred_ptr[c]; + + if (a < 0) + a = 0; + + if (a > 255) + a = 255; + + dst_ptr[c] = (unsigned char) a; + } + + dst_ptr += stride; + pred_ptr += pitch; + } +} + +void vp9_short_inv_walsh4x4_c(short *input, short *output) { + int i; + int a1, b1, c1, d1; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; i++) { + a1 = ((ip[0] + ip[3])); + b1 = ((ip[1] + ip[2])); + c1 = ((ip[1] - ip[2])); + d1 = ((ip[0] - ip[3])); + + op[0] = (a1 + b1 + 1) >> 1; + op[1] = (c1 + d1) >> 1; + op[2] = (a1 - b1) >> 1; + op[3] = (d1 - c1) >> 1; + + ip += 4; + op += 4; + } + + ip = output; + op = output; + for (i = 0; i < 4; i++) { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + 
op[0] = (a1 + b1 + 1) >> 1; + op[4] = (c1 + d1) >> 1; + op[8] = (a1 - b1) >> 1; + op[12] = (d1 - c1) >> 1; + ip++; + op++; + } +} + +void vp9_short_inv_walsh4x4_1_c(short *in, short *out) { + int i; + short tmp[4]; + short *ip = in; + short *op = tmp; + + op[0] = (ip[0] + 1) >> 1; + op[1] = op[2] = op[3] = (ip[0] >> 1); + + ip = tmp; + op = out; + for (i = 0; i < 4; i++) { + op[0] = (ip[0] + 1) >> 1; + op[4] = op[8] = op[12] = (ip[0] >> 1); + ip++; + op++; + } +} + +#if CONFIG_LOSSLESS +void vp9_short_inv_walsh4x4_lossless_c(short *input, short *output) { + int i; + int a1, b1, c1, d1; + short *ip = input; + short *op = output; + + for (i = 0; i < 4; i++) { + a1 = ((ip[0] + ip[3])) >> Y2_WHT_UPSCALE_FACTOR; + b1 = ((ip[1] + ip[2])) >> Y2_WHT_UPSCALE_FACTOR; + c1 = ((ip[1] - ip[2])) >> Y2_WHT_UPSCALE_FACTOR; + d1 = ((ip[0] - ip[3])) >> Y2_WHT_UPSCALE_FACTOR; + + op[0] = (a1 + b1 + 1) >> 1; + op[1] = (c1 + d1) >> 1; + op[2] = (a1 - b1) >> 1; + op[3] = (d1 - c1) >> 1; + + ip += 4; + op += 4; + } + + ip = output; + op = output; + for (i = 0; i < 4; i++) { + a1 = ip[0] + ip[12]; + b1 = ip[4] + ip[8]; + c1 = ip[4] - ip[8]; + d1 = ip[0] - ip[12]; + + + op[0] = ((a1 + b1 + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; + op[4] = ((c1 + d1) >> 1) << Y2_WHT_UPSCALE_FACTOR; + op[8] = ((a1 - b1) >> 1) << Y2_WHT_UPSCALE_FACTOR; + op[12] = ((d1 - c1) >> 1) << Y2_WHT_UPSCALE_FACTOR; + + ip++; + op++; + } +} + +void vp9_short_inv_walsh4x4_1_lossless_c(short *in, short *out) { + int i; + short tmp[4]; + short *ip = in; + short *op = tmp; + + op[0] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) + 1) >> 1; + op[1] = op[2] = op[3] = ((ip[0] >> Y2_WHT_UPSCALE_FACTOR) >> 1); + + ip = tmp; + op = out; + for (i = 0; i < 4; i++) { + op[0] = ((ip[0] + 1) >> 1) << Y2_WHT_UPSCALE_FACTOR; + op[4] = op[8] = op[12] = ((ip[0] >> 1)) << Y2_WHT_UPSCALE_FACTOR; + ip++; + op++; + } +} + +void vp9_short_inv_walsh4x4_x8_c(short *input, short *output, int pitch) { + int i; + int a1, b1, c1, d1; + short *ip = input; + short *op = output; + int shortpitch = pitch >> 1; + + for (i = 0; i < 4; i++) { + a1 = ((ip[0] + ip[3])) >> WHT_UPSCALE_FACTOR; + b1 = ((ip[1] + ip[2])) >> WHT_UPSCALE_FACTOR; + c1 = ((ip[1] - ip[2])) >> WHT_UPSCALE_FACTOR; + d1 = ((ip[0] - ip[3])) >> WHT_UPSCALE_FACTOR; + + op[0] = (a1 + b1 + 1) >> 1; + op[1] = (c1 + d1) >> 1; + op[2] = (a1 - b1) >> 1; + op[3] = (d1 - c1) >> 1; + + ip += 4; + op += shortpitch; + } + + ip = output; + op = output; + for (i = 0; i < 4; i++) { + a1 = ip[shortpitch * 0] + ip[shortpitch * 3]; + b1 = ip[shortpitch * 1] + ip[shortpitch * 2]; + c1 = ip[shortpitch * 1] - ip[shortpitch * 2]; + d1 = ip[shortpitch * 0] - ip[shortpitch * 3]; + + + op[shortpitch * 0] = (a1 + b1 + 1) >> 1; + op[shortpitch * 1] = (c1 + d1) >> 1; + op[shortpitch * 2] = (a1 - b1) >> 1; + op[shortpitch * 3] = (d1 - c1) >> 1; + + ip++; + op++; + } +} + +void vp9_short_inv_walsh4x4_1_x8_c(short *in, short *out, int pitch) { + int i; + short tmp[4]; + short *ip = in; + short *op = tmp; + int shortpitch = pitch >> 1; + + op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1; + op[1] = op[2] = op[3] = ((ip[0] >> WHT_UPSCALE_FACTOR) >> 1); + + + ip = tmp; + op = out; + for (i = 0; i < 4; i++) { + op[shortpitch * 0] = (ip[0] + 1) >> 1; + op[shortpitch * 1] = op[shortpitch * 2] = op[shortpitch * 3] = ip[0] >> 1; + ip++; + op++; + } +} + +void vp9_dc_only_inv_walsh_add_c(short input_dc, unsigned char *pred_ptr, + unsigned char *dst_ptr, + int pitch, int stride) { + int r, c; + short tmp[16]; + vp9_short_inv_walsh4x4_1_x8_c(&input_dc, tmp, 4 << 1); + + 
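The reconstruction loop below, like several others in this file, adds an inverse-transform residual to the prediction and clips the result to the valid 8-bit pixel range. The same clamp written as a tiny helper (hypothetical name; the file spells the comparisons out inline):

```c
static unsigned char clip_add(unsigned char pred, int resid) {
  int v = pred + resid;   /* widen to int before adding to avoid wraparound */
  if (v < 0)
    v = 0;                /* clamp into the 8-bit pixel range [0, 255] */
  if (v > 255)
    v = 255;
  return (unsigned char)v;
}
```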
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = tmp[r * 4 + c] + pred_ptr[c];
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    pred_ptr += pitch;
+  }
+}
+#endif
+
+void vp9_dc_only_idct_add_8x8_c(short input_dc,
+                                unsigned char *pred_ptr,
+                                unsigned char *dst_ptr,
+                                int pitch, int stride) {
+  int a1 = ((input_dc + 16) >> 5);
+  int r, c, b;
+  unsigned char *orig_pred = pred_ptr;
+  unsigned char *orig_dst = dst_ptr;
+  for (b = 0; b < 4; b++) {
+    for (r = 0; r < 4; r++) {
+      for (c = 0; c < 4; c++) {
+        int a = a1 + pred_ptr[c];
+
+        if (a < 0)
+          a = 0;
+
+        if (a > 255)
+          a = 255;
+
+        dst_ptr[c] = (unsigned char) a;
+      }
+
+      dst_ptr += stride;
+      pred_ptr += pitch;
+    }
+    dst_ptr = orig_dst + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * stride;
+    pred_ptr = orig_pred + (b + 1) % 2 * 4 + (b + 1) / 2 * 4 * pitch;
+  }
+}
+
+#define W1 2841 /* 2048*sqrt(2)*cos(1*pi/16) */
+#define W2 2676 /* 2048*sqrt(2)*cos(2*pi/16) */
+#define W3 2408 /* 2048*sqrt(2)*cos(3*pi/16) */
+#define W5 1609 /* 2048*sqrt(2)*cos(5*pi/16) */
+#define W6 1108 /* 2048*sqrt(2)*cos(6*pi/16) */
+#define W7 565  /* 2048*sqrt(2)*cos(7*pi/16) */
+
+/* row (horizontal) IDCT
+ *
+ *   dst[k] = sum_{l=0}^{7} c[l] * src[l] * cos( pi/8 * ( k + 1/2 ) * l )
+ *
+ * where: c[0] = 128, c[1..7] = 128*sqrt(2)
+ */
+static void idctrow(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+  /* shortcut */
+  if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) |
+        (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) {
+    blk[0] = blk[1] = blk[2] = blk[3] = blk[4]
+           = blk[5] = blk[6] = blk[7] = blk[0] << 3;
+    return;
+  }
+
+  x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */
+  /* first stage */
+  x8 = W7 * (x4 + x5);
+  x4 = x8 + (W1 - W7) * x4;
+  x5 = x8 - (W1 + W7) * x5;
+  x8 = W3 * (x6 + x7);
+  x6 = x8 - (W3 - W5) * x6;
+  x7 = x8 - (W3 + W5) * x7;
+
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6 * (x3 + x2);
+  x2 = x1 - (W2 + W6) * x2;
+  x3 = x1 + (W2 - W6) * x3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /* third stage */
+  x7 = x8 + x3;
+  x8 -= x3;
+  x3 = x0 + x2;
+  x0 -= x2;
+  x2 = (181 * (x4 + x5) + 128) >> 8;
+  x4 = (181 * (x4 - x5) + 128) >> 8;
+
+  /* fourth stage */
+  blk[0] = (x7 + x1) >> 8;
+  blk[1] = (x3 + x2) >> 8;
+  blk[2] = (x0 + x4) >> 8;
+  blk[3] = (x8 + x6) >> 8;
+  blk[4] = (x8 - x6) >> 8;
+  blk[5] = (x0 - x4) >> 8;
+  blk[6] = (x3 - x2) >> 8;
+  blk[7] = (x7 - x1) >> 8;
+}
+
+/* column (vertical) IDCT
+ *
+ *   dst[8*k] = sum_{l=0}^{7} c[l] * src[8*l] * cos( pi/8 * ( k + 1/2 ) * l )
+ *
+ * where: c[0] = 1/1024, c[1..7] = (1/1024)*sqrt(2)
+ */
+static void idctcol(int *blk) {
+  int x0, x1, x2, x3, x4, x5, x6, x7, x8;
+
+  /* shortcut */
+  if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) |
+        (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) |
+        (x7 = blk[8 * 3]))) {
+    blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3]
+               = blk[8 * 4] = blk[8 * 5] = blk[8 * 6]
+               = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6);
+    return;
+  }
+
+  x0 = (blk[8 * 0] << 8) + 16384;
+
+  /* first stage */
+  x8 = W7 * (x4 + x5) + 4;
+  x4 = (x8 + (W1 - W7) * x4) >> 3;
+  x5 = (x8 - (W1 + W7) * x5) >> 3;
+  x8 = W3 * (x6 + x7) + 4;
+  x6 = (x8 - (W3 - W5) * x6) >> 3;
+  x7 = (x8 - (W3 + W5) * x7) >> 3;
+
+  /* second stage */
+  x8 = x0 + x1;
+  x0 -= x1;
+  x1 = W6 * (x3 + x2) + 4;
+  x2 = (x1 - (W2 + W6) * x2) >> 3;
+  x3 = (x1 + (W2 - W6) * x3) >> 3;
+  x1 = x4 + x6;
+  x4 -= x6;
+  x6 = x5 + x7;
+  x5 -= x7;
+
+  /*
third stage */ + x7 = x8 + x3; + x8 -= x3; + x3 = x0 + x2; + x0 -= x2; + x2 = (181 * (x4 + x5) + 128) >> 8; + x4 = (181 * (x4 - x5) + 128) >> 8; + + /* fourth stage */ + blk[8 * 0] = (x7 + x1) >> 14; + blk[8 * 1] = (x3 + x2) >> 14; + blk[8 * 2] = (x0 + x4) >> 14; + blk[8 * 3] = (x8 + x6) >> 14; + blk[8 * 4] = (x8 - x6) >> 14; + blk[8 * 5] = (x0 - x4) >> 14; + blk[8 * 6] = (x3 - x2) >> 14; + blk[8 * 7] = (x7 - x1) >> 14; +} + +#define TX_DIM 8 +void vp9_short_idct8x8_c(short *coefs, short *block, int pitch) { + int X[TX_DIM * TX_DIM]; + int i, j; + int shortpitch = pitch >> 1; + + for (i = 0; i < TX_DIM; i++) { + for (j = 0; j < TX_DIM; j++) { + X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 + + (coefs[i * TX_DIM + j] < 0)) >> 2; + } + } + for (i = 0; i < 8; i++) + idctrow(X + 8 * i); + + for (i = 0; i < 8; i++) + idctcol(X + i); + + for (i = 0; i < TX_DIM; i++) { + for (j = 0; j < TX_DIM; j++) { + block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1; + } + } +} + +/* Row IDCT when only first 4 coefficients are non-zero. */ +static void idctrow10(int *blk) { + int x0, x1, x2, x3, x4, x5, x6, x7, x8; + + /* shortcut */ + if (!((x1 = blk[4] << 11) | (x2 = blk[6]) | (x3 = blk[2]) | + (x4 = blk[1]) | (x5 = blk[7]) | (x6 = blk[5]) | (x7 = blk[3]))) { + blk[0] = blk[1] = blk[2] = blk[3] = blk[4] + = blk[5] = blk[6] = blk[7] = blk[0] << 3; + return; + } + + x0 = (blk[0] << 11) + 128; /* for proper rounding in the fourth stage */ + /* first stage */ + x5 = W7 * x4; + x4 = W1 * x4; + x6 = W3 * x7; + x7 = -W5 * x7; + + /* second stage */ + x2 = W6 * x3; + x3 = W2 * x3; + x1 = x4 + x6; + x4 -= x6; + x6 = x5 + x7; + x5 -= x7; + + /* third stage */ + x7 = x0 + x3; + x8 = x0 - x3; + x3 = x0 + x2; + x0 -= x2; + x2 = (181 * (x4 + x5) + 128) >> 8; + x4 = (181 * (x4 - x5) + 128) >> 8; + + /* fourth stage */ + blk[0] = (x7 + x1) >> 8; + blk[1] = (x3 + x2) >> 8; + blk[2] = (x0 + x4) >> 8; + blk[3] = (x8 + x6) >> 8; + blk[4] = (x8 - x6) >> 8; + blk[5] = (x0 - x4) >> 8; + blk[6] = (x3 - x2) >> 8; + blk[7] = (x7 - x1) >> 8; +} + +/* Column (vertical) IDCT when only first 4 coefficients are non-zero. 
*/ +static void idctcol10(int *blk) { + int x0, x1, x2, x3, x4, x5, x6, x7, x8; + + /* shortcut */ + if (!((x1 = (blk[8 * 4] << 8)) | (x2 = blk[8 * 6]) | (x3 = blk[8 * 2]) | + (x4 = blk[8 * 1]) | (x5 = blk[8 * 7]) | (x6 = blk[8 * 5]) | + (x7 = blk[8 * 3]))) { + blk[8 * 0] = blk[8 * 1] = blk[8 * 2] = blk[8 * 3] + = blk[8 * 4] = blk[8 * 5] = blk[8 * 6] + = blk[8 * 7] = ((blk[8 * 0] + 32) >> 6); + return; + } + + x0 = (blk[8 * 0] << 8) + 16384; + + /* first stage */ + x5 = (W7 * x4 + 4) >> 3; + x4 = (W1 * x4 + 4) >> 3; + x6 = (W3 * x7 + 4) >> 3; + x7 = (-W5 * x7 + 4) >> 3; + + /* second stage */ + x2 = (W6 * x3 + 4) >> 3; + x3 = (W2 * x3 + 4) >> 3; + x1 = x4 + x6; + x4 -= x6; + x6 = x5 + x7; + x5 -= x7; + + /* third stage */ + x7 = x0 + x3; + x8 = x0 - x3; + x3 = x0 + x2; + x0 -= x2; + x2 = (181 * (x4 + x5) + 128) >> 8; + x4 = (181 * (x4 - x5) + 128) >> 8; + + /* fourth stage */ + blk[8 * 0] = (x7 + x1) >> 14; + blk[8 * 1] = (x3 + x2) >> 14; + blk[8 * 2] = (x0 + x4) >> 14; + blk[8 * 3] = (x8 + x6) >> 14; + blk[8 * 4] = (x8 - x6) >> 14; + blk[8 * 5] = (x0 - x4) >> 14; + blk[8 * 6] = (x3 - x2) >> 14; + blk[8 * 7] = (x7 - x1) >> 14; +} + +void vp9_short_idct10_8x8_c(short *coefs, short *block, int pitch) { + int X[TX_DIM * TX_DIM]; + int i, j; + int shortpitch = pitch >> 1; + + for (i = 0; i < TX_DIM; i++) { + for (j = 0; j < TX_DIM; j++) { + X[i * TX_DIM + j] = (int)(coefs[i * TX_DIM + j] + 1 + + (coefs[i * TX_DIM + j] < 0)) >> 2; + } + } + + /* Do first 4 row idct only since non-zero dct coefficients are all in + * upper-left 4x4 area. */ + for (i = 0; i < 4; i++) + idctrow10(X + 8 * i); + + for (i = 0; i < 8; i++) + idctcol10(X + i); + + for (i = 0; i < TX_DIM; i++) { + for (j = 0; j < TX_DIM; j++) { + block[i * shortpitch + j] = X[i * TX_DIM + j] >> 1; + } + } +} + +void vp9_short_ihaar2x2_c(short *input, short *output, int pitch) { + int i; + short *ip = input; // 0,1, 4, 8 + short *op = output; + for (i = 0; i < 16; i++) { + op[i] = 0; + } + + op[0] = (ip[0] + ip[1] + ip[4] + ip[8] + 1) >> 1; + op[1] = (ip[0] - ip[1] + ip[4] - ip[8]) >> 1; + op[4] = (ip[0] + ip[1] - ip[4] - ip[8]) >> 1; + op[8] = (ip[0] - ip[1] - ip[4] + ip[8]) >> 1; +} + + +#if 0 +// Keep a really bad float version as reference for now. 
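The disabled float reference below, and the integer 16x16 paths after it, all rely on separability: the 2-D inverse transform is a 1-D pass over every row followed by a 1-D pass over every column. A generic skeleton of that driver structure; idct1d_16 here is an identity stand-in for a real 1-D kernel such as butterfly_16x16_idct_1d:

```c
#define N 16

/* Identity stand-in for a real 1-D inverse transform kernel. */
static void idct1d_16(const double in[N], double out[N]) {
  int k;
  for (k = 0; k < N; ++k)
    out[k] = in[k];
}

static void idct2d_16x16(const double in[N * N], double out[N * N]) {
  double tmp[N * N], col_in[N], col_out[N];
  int i, j;

  for (i = 0; i < N; ++i)              /* 1-D transform on each row */
    idct1d_16(&in[i * N], &tmp[i * N]);

  for (i = 0; i < N; ++i) {            /* then on each column */
    for (j = 0; j < N; ++j)
      col_in[j] = tmp[j * N + i];
    idct1d_16(col_in, col_out);
    for (j = 0; j < N; ++j)
      out[j * N + i] = col_out[j];
  }
}
```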
+void vp9_short_idct16x16_c(short *input, short *output, int pitch) { + + vp9_clear_system_state(); // Make it simd safe : __asm emms; + { + double x; + const int short_pitch = pitch >> 1; + int i, j, k, l; + for (l = 0; l < 16; ++l) { + for (k = 0; k < 16; ++k) { + double s = 0; + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) { + x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32; + if (i != 0) + x *= sqrt(2.0); + if (j != 0) + x *= sqrt(2.0); + s += x; + } + } + output[k*short_pitch+l] = (short)round(s); + } + } + } + vp9_clear_system_state(); // Make it simd safe : __asm emms; +} +#endif + +#define TEST_INT_16x16_IDCT 1 +#if !TEST_INT_16x16_IDCT +static const double C1 = 0.995184726672197; +static const double C2 = 0.98078528040323; +static const double C3 = 0.956940335732209; +static const double C4 = 0.923879532511287; +static const double C5 = 0.881921264348355; +static const double C6 = 0.831469612302545; +static const double C7 = 0.773010453362737; +static const double C8 = 0.707106781186548; +static const double C9 = 0.634393284163646; +static const double C10 = 0.555570233019602; +static const double C11 = 0.471396736825998; +static const double C12 = 0.38268343236509; +static const double C13 = 0.290284677254462; +static const double C14 = 0.195090322016128; +static const double C15 = 0.098017140329561; + + +static void butterfly_16x16_idct_1d(double input[16], double output[16]) { + + vp9_clear_system_state(); // Make it simd safe : __asm emms; + { + double step[16]; + double intermediate[16]; + double temp1, temp2; + + + // step 1 and 2 + step[ 0] = input[0] + input[8]; + step[ 1] = input[0] - input[8]; + + temp1 = input[4]*C12; + temp2 = input[12]*C4; + + temp1 -= temp2; + temp1 *= C8; + + step[ 2] = 2*(temp1); + + temp1 = input[4]*C4; + temp2 = input[12]*C12; + temp1 += temp2; + temp1 = (temp1); + temp1 *= C8; + step[ 3] = 2*(temp1); + + temp1 = input[2]*C8; + temp1 = 2*(temp1); + temp2 = input[6] + input[10]; + + step[ 4] = temp1 + temp2; + step[ 5] = temp1 - temp2; + + temp1 = input[14]*C8; + temp1 = 2*(temp1); + temp2 = input[6] - input[10]; + + step[ 6] = temp2 - temp1; + step[ 7] = temp2 + temp1; + + // for odd input + temp1 = input[3]*C12; + temp2 = input[13]*C4; + temp1 += temp2; + temp1 = (temp1); + temp1 *= C8; + intermediate[ 8] = 2*(temp1); + + temp1 = input[3]*C4; + temp2 = input[13]*C12; + temp2 -= temp1; + temp2 = (temp2); + temp2 *= C8; + intermediate[ 9] = 2*(temp2); + + intermediate[10] = 2*(input[9]*C8); + intermediate[11] = input[15] - input[1]; + intermediate[12] = input[15] + input[1]; + intermediate[13] = 2*((input[7]*C8)); + + temp1 = input[11]*C12; + temp2 = input[5]*C4; + temp2 -= temp1; + temp2 = (temp2); + temp2 *= C8; + intermediate[14] = 2*(temp2); + + temp1 = input[11]*C4; + temp2 = input[5]*C12; + temp1 += temp2; + temp1 = (temp1); + temp1 *= C8; + intermediate[15] = 2*(temp1); + + step[ 8] = intermediate[ 8] + intermediate[14]; + step[ 9] = intermediate[ 9] + intermediate[15]; + step[10] = intermediate[10] + intermediate[11]; + step[11] = intermediate[10] - intermediate[11]; + step[12] = intermediate[12] + intermediate[13]; + step[13] = intermediate[12] - intermediate[13]; + step[14] = intermediate[ 8] - intermediate[14]; + step[15] = intermediate[ 9] - intermediate[15]; + + // step 3 + output[0] = step[ 0] + step[ 3]; + output[1] = step[ 1] + step[ 2]; + output[2] = step[ 1] - step[ 2]; + output[3] = step[ 0] - step[ 3]; + + temp1 = step[ 4]*C14; + temp2 = step[ 7]*C2; + temp1 -= temp2; + output[4] = (temp1); + + temp1 = 
step[ 4]*C2; + temp2 = step[ 7]*C14; + temp1 += temp2; + output[7] = (temp1); + + temp1 = step[ 5]*C10; + temp2 = step[ 6]*C6; + temp1 -= temp2; + output[5] = (temp1); + + temp1 = step[ 5]*C6; + temp2 = step[ 6]*C10; + temp1 += temp2; + output[6] = (temp1); + + output[8] = step[ 8] + step[11]; + output[9] = step[ 9] + step[10]; + output[10] = step[ 9] - step[10]; + output[11] = step[ 8] - step[11]; + output[12] = step[12] + step[15]; + output[13] = step[13] + step[14]; + output[14] = step[13] - step[14]; + output[15] = step[12] - step[15]; + + // output 4 + step[ 0] = output[0] + output[7]; + step[ 1] = output[1] + output[6]; + step[ 2] = output[2] + output[5]; + step[ 3] = output[3] + output[4]; + step[ 4] = output[3] - output[4]; + step[ 5] = output[2] - output[5]; + step[ 6] = output[1] - output[6]; + step[ 7] = output[0] - output[7]; + + temp1 = output[8]*C7; + temp2 = output[15]*C9; + temp1 -= temp2; + step[ 8] = (temp1); + + temp1 = output[9]*C11; + temp2 = output[14]*C5; + temp1 += temp2; + step[ 9] = (temp1); + + temp1 = output[10]*C3; + temp2 = output[13]*C13; + temp1 -= temp2; + step[10] = (temp1); + + temp1 = output[11]*C15; + temp2 = output[12]*C1; + temp1 += temp2; + step[11] = (temp1); + + temp1 = output[11]*C1; + temp2 = output[12]*C15; + temp2 -= temp1; + step[12] = (temp2); + + temp1 = output[10]*C13; + temp2 = output[13]*C3; + temp1 += temp2; + step[13] = (temp1); + + temp1 = output[9]*C5; + temp2 = output[14]*C11; + temp2 -= temp1; + step[14] = (temp2); + + temp1 = output[8]*C9; + temp2 = output[15]*C7; + temp1 += temp2; + step[15] = (temp1); + + // step 5 + output[0] = (step[0] + step[15]); + output[1] = (step[1] + step[14]); + output[2] = (step[2] + step[13]); + output[3] = (step[3] + step[12]); + output[4] = (step[4] + step[11]); + output[5] = (step[5] + step[10]); + output[6] = (step[6] + step[ 9]); + output[7] = (step[7] + step[ 8]); + + output[15] = (step[0] - step[15]); + output[14] = (step[1] - step[14]); + output[13] = (step[2] - step[13]); + output[12] = (step[3] - step[12]); + output[11] = (step[4] - step[11]); + output[10] = (step[5] - step[10]); + output[9] = (step[6] - step[ 9]); + output[8] = (step[7] - step[ 8]); + } + vp9_clear_system_state(); // Make it simd safe : __asm emms; +} + +// Remove once an int version of iDCT is written +#if 0 +void reference_16x16_idct_1d(double input[16], double output[16]) { + + vp9_clear_system_state(); // Make it simd safe : __asm emms; + { + const double kPi = 3.141592653589793238462643383279502884; + const double kSqrt2 = 1.414213562373095048801688724209698; + for (int k = 0; k < 16; k++) { + output[k] = 0.0; + for (int n = 0; n < 16; n++) { + output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0); + if (n == 0) + output[k] = output[k]/kSqrt2; + } + } + } + vp9_clear_system_state(); // Make it simd safe : __asm emms; +} +#endif + +void vp9_short_idct16x16_c(short *input, short *output, int pitch) { + + vp9_clear_system_state(); // Make it simd safe : __asm emms; + { + double out[16*16], out2[16*16]; + const int short_pitch = pitch >> 1; + int i, j; + // First transform rows + for (i = 0; i < 16; ++i) { + double temp_in[16], temp_out[16]; + for (j = 0; j < 16; ++j) + temp_in[j] = input[j + i*short_pitch]; + butterfly_16x16_idct_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + out[j + i*16] = temp_out[j]; + } + // Then transform columns + for (i = 0; i < 16; ++i) { + double temp_in[16], temp_out[16]; + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + butterfly_16x16_idct_1d(temp_in, temp_out); + for (j = 0; j < 16; 
++j) + out2[j*16 + i] = temp_out[j]; + } + for (i = 0; i < 16*16; ++i) + output[i] = round(out2[i]/128); + } + vp9_clear_system_state(); // Make it simd safe : __asm emms; +} + +#else +static const int16_t C1 = 16305; +static const int16_t C2 = 16069; +static const int16_t C3 = 15679; +static const int16_t C4 = 15137; +static const int16_t C5 = 14449; +static const int16_t C6 = 13623; +static const int16_t C7 = 12665; +static const int16_t C8 = 11585; +static const int16_t C9 = 10394; +static const int16_t C10 = 9102; +static const int16_t C11 = 7723; +static const int16_t C12 = 6270; +static const int16_t C13 = 4756; +static const int16_t C14 = 3196; +static const int16_t C15 = 1606; + +#define INITIAL_SHIFT 2 +#define INITIAL_ROUNDING (1 << (INITIAL_SHIFT - 1)) +#define RIGHT_SHIFT 14 +#define RIGHT_ROUNDING (1 << (RIGHT_SHIFT - 1)) + +static void butterfly_16x16_idct_1d(int16_t input[16], int16_t output[16], + int last_shift_bits) { + int16_t step[16]; + int intermediate[16]; + int temp1, temp2; + + int step1_shift = RIGHT_SHIFT + INITIAL_SHIFT; + int step1_rounding = 1 << (step1_shift - 1); + int last_rounding = 0; + + if (last_shift_bits > 0) + last_rounding = 1 << (last_shift_bits - 1); + + // step 1 and 2 + step[ 0] = (input[0] + input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[ 1] = (input[0] - input[8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + + temp1 = input[4] * C12; + temp2 = input[12] * C4; + temp1 = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp1 *= C8; + step[ 2] = (2 * (temp1) + step1_rounding) >> step1_shift; + + temp1 = input[4] * C4; + temp2 = input[12] * C12; + temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp1 *= C8; + step[ 3] = (2 * (temp1) + step1_rounding) >> step1_shift; + + temp1 = input[2] * C8; + temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp2 = input[6] + input[10]; + step[ 4] = (temp1 + temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[ 5] = (temp1 - temp2 + INITIAL_ROUNDING) >> INITIAL_SHIFT; + + temp1 = input[14] * C8; + temp1 = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp2 = input[6] - input[10]; + step[ 6] = (temp2 - temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[ 7] = (temp2 + temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; + + // for odd input + temp1 = input[3] * C12; + temp2 = input[13] * C4; + temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp1 *= C8; + intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = input[3] * C4; + temp2 = input[13] * C12; + temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp2 *= C8; + intermediate[ 9] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + intermediate[10] = (2 * (input[9] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + intermediate[11] = input[15] - input[1]; + intermediate[12] = input[15] + input[1]; + intermediate[13] = (2 * (input[7] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = input[11] * C12; + temp2 = input[5] * C4; + temp2 = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp2 *= C8; + intermediate[14] = (2 * (temp2) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = input[11] * C4; + temp2 = input[5] * C12; + temp1 = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp1 *= C8; + intermediate[15] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + step[ 8] = (intermediate[ 8] + intermediate[14] + INITIAL_ROUNDING) + >> INITIAL_SHIFT; + step[ 9] = (intermediate[ 9] + intermediate[15] + INITIAL_ROUNDING) + >> INITIAL_SHIFT; + step[10] = (intermediate[10] + intermediate[11] + INITIAL_ROUNDING) + >> 
INITIAL_SHIFT; + step[11] = (intermediate[10] - intermediate[11] + INITIAL_ROUNDING) + >> INITIAL_SHIFT; + step[12] = (intermediate[12] + intermediate[13] + INITIAL_ROUNDING) + >> INITIAL_SHIFT; + step[13] = (intermediate[12] - intermediate[13] + INITIAL_ROUNDING) + >> INITIAL_SHIFT; + step[14] = (intermediate[ 8] - intermediate[14] + INITIAL_ROUNDING) + >> INITIAL_SHIFT; + step[15] = (intermediate[ 9] - intermediate[15] + INITIAL_ROUNDING) + >> INITIAL_SHIFT; + + // step 3 + output[0] = step[ 0] + step[ 3]; + output[1] = step[ 1] + step[ 2]; + output[2] = step[ 1] - step[ 2]; + output[3] = step[ 0] - step[ 3]; + + temp1 = step[ 4] * C14; + temp2 = step[ 7] * C2; + output[4] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = step[ 4] * C2; + temp2 = step[ 7] * C14; + output[7] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = step[ 5] * C10; + temp2 = step[ 6] * C6; + output[5] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = step[ 5] * C6; + temp2 = step[ 6] * C10; + output[6] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + output[8] = step[ 8] + step[11]; + output[9] = step[ 9] + step[10]; + output[10] = step[ 9] - step[10]; + output[11] = step[ 8] - step[11]; + output[12] = step[12] + step[15]; + output[13] = step[13] + step[14]; + output[14] = step[13] - step[14]; + output[15] = step[12] - step[15]; + + // output 4 + step[ 0] = output[0] + output[7]; + step[ 1] = output[1] + output[6]; + step[ 2] = output[2] + output[5]; + step[ 3] = output[3] + output[4]; + step[ 4] = output[3] - output[4]; + step[ 5] = output[2] - output[5]; + step[ 6] = output[1] - output[6]; + step[ 7] = output[0] - output[7]; + + temp1 = output[8] * C7; + temp2 = output[15] * C9; + step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[9] * C11; + temp2 = output[14] * C5; + step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[10] * C3; + temp2 = output[13] * C13; + step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[11] * C15; + temp2 = output[12] * C1; + step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[11] * C1; + temp2 = output[12] * C15; + step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[10] * C13; + temp2 = output[13] * C3; + step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[9] * C5; + temp2 = output[14] * C11; + step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[8] * C9; + temp2 = output[15] * C7; + step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + // step 5 + output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits; + output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits; + output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits; + output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits; + output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits; + output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits; + output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits; + output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits; + + output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits; + output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits; + output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits; + output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits; + output[11] = (step[4] - step[11] + last_rounding) >> 
last_shift_bits; + output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits; + output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits; + output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits; +} + +void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[16 * 16]; + int16_t *outptr = &out[0]; + const int short_pitch = pitch >> 1; + int i, j; + int16_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + butterfly_16x16_idct_1d(input, outptr, 0); + input += short_pitch; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + butterfly_16x16_idct_1d(temp_in, temp_out, 3); + for (j = 0; j < 16; ++j) + output[j * 16 + i] = temp_out[j]; + } +} + +/* The following function is called when we know the maximum number of non-zero + * dct coefficients is less or equal 10. + */ +static void butterfly_16x16_idct10_1d(int16_t input[16], int16_t output[16], + int last_shift_bits) { + int16_t step[16] = {0}; + int intermediate[16] = {0}; + int temp1, temp2; + int last_rounding = 0; + + if (last_shift_bits > 0) + last_rounding = 1 << (last_shift_bits - 1); + + // step 1 and 2 + step[ 0] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[ 1] = (input[0] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + + temp1 = (2 * (input[2] * C8) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + step[ 4] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[ 5] = (temp1 + INITIAL_ROUNDING) >> INITIAL_SHIFT; + + // for odd input + temp1 = (input[3] * C12 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp1 *= C8; + intermediate[ 8] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = (-input[3] * C4 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + temp1 *= C8; + intermediate[ 9] = (2 * (temp1) + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + step[ 8] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[ 9] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[10] = (-input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[11] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[12] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[13] = (input[1] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[14] = (intermediate[ 8] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + step[15] = (intermediate[ 9] + INITIAL_ROUNDING) >> INITIAL_SHIFT; + + // step 3 + output[0] = step[ 0]; + output[1] = step[ 1]; + output[2] = step[ 1]; + output[3] = step[ 0]; + + temp1 = step[ 4] * C14; + output[4] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = step[ 4] * C2; + output[7] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = step[ 5] * C10; + output[5] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = step[ 5] * C6; + output[6] = (temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + output[8] = step[ 8] + step[11]; + output[9] = step[ 9] + step[10]; + output[10] = step[ 9] - step[10]; + output[11] = step[ 8] - step[11]; + output[12] = step[12] + step[15]; + output[13] = step[13] + step[14]; + output[14] = step[13] - step[14]; + output[15] = step[12] - step[15]; + + // output 4 + step[ 0] = output[0] + output[7]; + step[ 1] = output[1] + output[6]; + step[ 2] = output[2] + output[5]; + step[ 3] = output[3] + output[4]; + step[ 4] = output[3] - output[4]; + step[ 5] = output[2] - output[5]; + step[ 6] = output[1] - output[6]; + step[ 7] = output[0] - output[7]; + + temp1 = output[8] * C7; + temp2 = output[15] * C9; + step[ 8] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + 
temp1 = output[9] * C11; + temp2 = output[14] * C5; + step[ 9] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[10] * C3; + temp2 = output[13] * C13; + step[10] = (temp1 - temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[11] * C15; + temp2 = output[12] * C1; + step[11] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[11] * C1; + temp2 = output[12] * C15; + step[12] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[10] * C13; + temp2 = output[13] * C3; + step[13] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[9] * C5; + temp2 = output[14] * C11; + step[14] = (temp2 - temp1 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + temp1 = output[8] * C9; + temp2 = output[15] * C7; + step[15] = (temp1 + temp2 + RIGHT_ROUNDING) >> RIGHT_SHIFT; + + // step 5 + output[0] = (step[0] + step[15] + last_rounding) >> last_shift_bits; + output[1] = (step[1] + step[14] + last_rounding) >> last_shift_bits; + output[2] = (step[2] + step[13] + last_rounding) >> last_shift_bits; + output[3] = (step[3] + step[12] + last_rounding) >> last_shift_bits; + output[4] = (step[4] + step[11] + last_rounding) >> last_shift_bits; + output[5] = (step[5] + step[10] + last_rounding) >> last_shift_bits; + output[6] = (step[6] + step[ 9] + last_rounding) >> last_shift_bits; + output[7] = (step[7] + step[ 8] + last_rounding) >> last_shift_bits; + + output[15] = (step[0] - step[15] + last_rounding) >> last_shift_bits; + output[14] = (step[1] - step[14] + last_rounding) >> last_shift_bits; + output[13] = (step[2] - step[13] + last_rounding) >> last_shift_bits; + output[12] = (step[3] - step[12] + last_rounding) >> last_shift_bits; + output[11] = (step[4] - step[11] + last_rounding) >> last_shift_bits; + output[10] = (step[5] - step[10] + last_rounding) >> last_shift_bits; + output[9] = (step[6] - step[ 9] + last_rounding) >> last_shift_bits; + output[8] = (step[7] - step[ 8] + last_rounding) >> last_shift_bits; +} + +void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) { + int16_t out[16 * 16]; + int16_t *outptr = &out[0]; + const int short_pitch = pitch >> 1; + int i, j; + int16_t temp_in[16], temp_out[16]; + + /* First transform rows. Since all non-zero dct coefficients are in + * upper-left 4x4 area, we only need to calculate first 4 rows here. + */ + vpx_memset(out, 0, sizeof(out)); + for (i = 0; i < 4; ++i) { + butterfly_16x16_idct10_1d(input, outptr, 0); + input += short_pitch; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + butterfly_16x16_idct10_1d(temp_in, temp_out, 3); + for (j = 0; j < 16; ++j) + output[j*16 + i] = temp_out[j]; + } +} +#undef INITIAL_SHIFT +#undef INITIAL_ROUNDING +#undef RIGHT_SHIFT +#undef RIGHT_ROUNDING +#endif diff --git a/vp9/common/vp9_implicit_segmentation.c b/vp9/common/vp9_implicit_segmentation.c new file mode 100644 index 000000000..472c3d1a5 --- /dev/null +++ b/vp9/common/vp9_implicit_segmentation.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "vp9/common/vp9_onyxc_int.h"
+
+#define MAX_REGIONS 24000
+#ifndef NULL
+#define NULL 0
+#endif
+
+#define min_mbs_in_region 3
+
+// this linked list structure holds equivalences for connected
+// component labeling
+struct list_el {
+  int label;
+  int seg_value;
+  int count;
+  struct list_el *next;
+};
+typedef struct list_el item;
+
+// connected color segments
+typedef struct {
+  int min_x;
+  int min_y;
+  int max_x;
+  int max_y;
+  long long sum_x;
+  long long sum_y;
+  int pixels;
+  int seg_value;
+  int label;
+} segment_info;
+
+
+typedef enum {
+  SEGMENT_MODE,
+  SEGMENT_MV,
+  SEGMENT_REFFRAME,
+  SEGMENT_SKIPPED
+} SEGMENT_TYPE;
+
+
+// this merges the two equivalence lists and
+// then makes sure that every label points to the same
+// equivalence list
+void merge(item *labels, int u, int v) {
+  item *a = labels[u].next;
+  item *b = labels[v].next;
+  item c;
+  item *it = &c;
+  int count;
+
+  // check if they are already merged
+  if (u == v || a == b)
+    return;
+
+  count = a->count + b->count;
+
+  // merge 2 sorted linked lists.
+  while (a != NULL && b != NULL) {
+    if (a->label < b->label) {
+      it->next = a;
+      a = a->next;
+    } else {
+      it->next = b;
+      b = b->next;
+    }
+
+    it = it->next;
+  }
+
+  if (a == NULL)
+    it->next = b;
+  else
+    it->next = a;
+
+  it = c.next;
+
+  // make sure every equivalence in the linked list points to this new ll
+  while (it != NULL) {
+    labels[it->label].next = c.next;
+    it = it->next;
+  }
+  c.next->count = count;
+
+}
+
+void segment_via_mode_info(VP9_COMMON *oci, int how) {
+  MODE_INFO *mi = oci->mi;
+  int i, j;
+  int mb_index = 0;
+
+  int label = 1;
+  int pitch = oci->mb_cols;
+
+  // holds linked list equivalences
+  // the max should probably be allocated at a higher level in oci
+  item equivalences[MAX_REGIONS];
+  int eq_ptr = 0;
+  item labels[MAX_REGIONS];
+  segment_info segments[MAX_REGIONS];
+  int label_count = 1;
+  int labeling[400 * 300];
+  int *lp = labeling;
+
+  label_count = 1;
+  memset(labels, 0, sizeof(labels));
+  memset(segments, 0, sizeof(segments));
+
+  /* Go through each macroblock, first-pass labelling */
+  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+    for (j = 0; j < oci->mb_cols; j++) {
+      // int above seg_value, left seg_value, this seg_value...
+      int a = -1, l = -1, n = -1;
+
+      // above label, left label
+      int al = -1, ll = -1;
+      if (i) {
+        al = lp[j - pitch];
+        a = labels[al].next->seg_value;
+      }
+      if (j) {
+        ll = lp[j - 1];
+        l = labels[ll].next->seg_value;
+      }
+
+      // what setting are we going to do the implicit segmentation on
+      switch (how) {
+        case SEGMENT_MODE:
+          n = mi[mb_index].mbmi.mode;
+          break;
+        case SEGMENT_MV:
+          n = mi[mb_index].mbmi.mv[0].as_int;
+          if (mi[mb_index].mbmi.ref_frame == INTRA_FRAME)
+            n = -9999999;
+          break;
+        case SEGMENT_REFFRAME:
+          n = mi[mb_index].mbmi.ref_frame;
+          break;
+        case SEGMENT_SKIPPED:
+          n = mi[mb_index].mbmi.mb_skip_coeff;
+          break;
+      }
+
+      // above and left both have the same seg_value
+      if (n == a && n == l) {
+        // pick the lowest label
+        lp[j] = (al < ll ?
al : ll);
+        labels[lp[j]].next->count++;
+
+        // merge the above and left equivalencies
+        merge(labels, al, ll);
+      }
+      // this matches above seg_value
+      else if (n == a) {
+        // give it the same label as above
+        lp[j] = al;
+        labels[al].next->count++;
+      }
+      // this matches left seg_value
+      else if (n == l) {
+        // give it the same label as left
+        lp[j] = ll;
+        labels[ll].next->count++;
+      } else {
+        // new label doesn't match either
+        item *e = &labels[label];
+        item *nl = &equivalences[eq_ptr++];
+        lp[j] = label;
+        nl->label = label;
+        nl->next = 0;
+        nl->seg_value = n;
+        nl->count = 1;
+        e->next = nl;
+        label++;
+      }
+      mb_index++;
+    }
+    mb_index++;
+  }
+  lp = labeling;
+
+  // give new labels to regions
+  for (i = 1; i < label; i++)
+    if (labels[i].next->count > min_mbs_in_region && labels[labels[i].next->label].label == 0) {
+      segment_info *cs = &segments[label_count];
+      cs->label = label_count;
+      labels[labels[i].next->label].label = label_count++;
+      labels[labels[i].next->label].seg_value = labels[i].next->seg_value;
+      cs->seg_value = labels[labels[i].next->label].seg_value;
+      cs->min_x = oci->mb_cols;
+      cs->min_y = oci->mb_rows;
+      cs->max_x = 0;
+      cs->max_y = 0;
+      cs->sum_x = 0;
+      cs->sum_y = 0;
+      cs->pixels = 0;
+
+    }
+  lp = labeling;
+
+  // this is just to gather stats...
+  for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+    for (j = 0; j < oci->mb_cols; j++) {
+      segment_info *cs;
+      int oldlab = labels[lp[j]].next->label;
+      int lab = labels[oldlab].label;
+      lp[j] = lab;
+
+      cs = &segments[lab];
+
+      cs->min_x = (j < cs->min_x ? j : cs->min_x);
+      cs->max_x = (j > cs->max_x ? j : cs->max_x);
+      cs->min_y = (i < cs->min_y ? i : cs->min_y);
+      cs->max_y = (i > cs->max_y ? i : cs->max_y);
+      cs->sum_x += j;
+      cs->sum_y += i;
+      cs->pixels++;
+
+      lp[j] = lab;
+      mb_index++;
+    }
+    mb_index++;
+  }
+
+  {
+    lp = labeling;
+    printf("labelling \n");
+    mb_index = 0;
+    for (i = 0; i < oci->mb_rows; i++, lp += pitch) {
+      for (j = 0; j < oci->mb_cols; j++) {
+        printf("%4d", lp[j]);
+      }
+      printf(" ");
+      for (j = 0; j < oci->mb_cols; j++, mb_index++) {
+        // printf("%3d",mi[mb_index].mbmi.mode );
+        printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row,
+               mi[mb_index].mbmi.mv[0].as_mv.col);
+      }
+      printf("\n");
+      ++mb_index;
+    }
+    printf("\n");
+  }
+}
+
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
new file mode 100644
index 000000000..76eb10307
--- /dev/null
+++ b/vp9/common/vp9_invtrans.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "vp9_invtrans.h" +#include "./vp9_rtcd.h" + +static void recon_dcblock(MACROBLOCKD *xd) { + BLOCKD *b = &xd->block[24]; + int i; + + for (i = 0; i < 16; i++) { + xd->block[i].dqcoeff[0] = b->diff[i]; + } +} + +static void recon_dcblock_8x8(MACROBLOCKD *xd) { + BLOCKD *b = &xd->block[24]; // for coeff 0, 2, 8, 10 + + xd->block[0].dqcoeff[0] = b->diff[0]; + xd->block[4].dqcoeff[0] = b->diff[1]; + xd->block[8].dqcoeff[0] = b->diff[4]; + xd->block[12].dqcoeff[0] = b->diff[8]; +} + +void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch) { + BLOCKD *b = &xd->block[block]; + if (b->eob <= 1) + xd->inv_xform4x4_1_x8(b->dqcoeff, b->diff, pitch); + else + xd->inv_xform4x4_x8(b->dqcoeff, b->diff, pitch); +} + +void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) { + int i; + BLOCKD *blockd = xd->block; + + if (xd->mode_info_context->mbmi.mode != SPLITMV) { + /* do 2nd order transform on the dc block */ + vp9_short_inv_walsh4x4(blockd[24].dqcoeff, blockd[24].diff); + recon_dcblock(xd); + } + + for (i = 0; i < 16; i++) { + vp9_inverse_transform_b_4x4(xd, i, 32); + } +} + +void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd) { + int i; + for (i = 16; i < 24; i++) { + vp9_inverse_transform_b_4x4(xd, i, 16); + } +} + +void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd) { + vp9_inverse_transform_mby_4x4(xd); + vp9_inverse_transform_mbuv_4x4(xd); +} + +void vp9_inverse_transform_b_8x8(short *input_dqcoeff, short *output_coeff, + int pitch) { + vp9_short_idct8x8(input_dqcoeff, output_coeff, pitch); +} + +void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { + int i; + BLOCKD *blockd = xd->block; + + if (xd->mode_info_context->mbmi.mode != SPLITMV) { + // do 2nd order transform on the dc block + vp9_short_ihaar2x2(blockd[24].dqcoeff, blockd[24].diff, 8); + recon_dcblock_8x8(xd); // need to change for 8x8 + } + + for (i = 0; i < 9; i += 8) { + vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0], + &blockd[i].diff[0], 32); + } + for (i = 2; i < 11; i += 8) { + vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0], + &blockd[i].diff[0], 32); + } +} + +void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd) { + int i; + BLOCKD *blockd = xd->block; + + for (i = 16; i < 24; i += 4) { + vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0], + &blockd[i].diff[0], 16); + } +} + +void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd) { + vp9_inverse_transform_mby_8x8(xd); + vp9_inverse_transform_mbuv_8x8(xd); +} + +void vp9_inverse_transform_b_16x16(short *input_dqcoeff, + short *output_coeff, int pitch) { + vp9_short_idct16x16(input_dqcoeff, output_coeff, pitch); +} + +void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) { + vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0], + &xd->block[0].diff[0], 32); +} + +void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) { + vp9_inverse_transform_mby_16x16(xd); + vp9_inverse_transform_mbuv_8x8(xd); +} diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h new file mode 100644 index 000000000..0e50c45d7 --- /dev/null +++ b/vp9/common/vp9_invtrans.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef __INC_INVTRANS_H
+#define __INC_INVTRANS_H
+
+#include "vpx_ports/config.h"
+#include "vp9_blockd.h"
+
+extern void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch);
+
+extern void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_b_8x8(short *input_dqcoeff,
+                                        short *output_coeff, int pitch);
+
+extern void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_b_16x16(short *input_dqcoeff,
+                                          short *output_coeff, int pitch);
+
+extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
+
+extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
+
+#endif  // __INC_INVTRANS_H
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
new file mode 100644
index 000000000..73b343974
--- /dev/null
+++ b/vp9/common/vp9_loopfilter.c
@@ -0,0 +1,524 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp9_loopfilter.h"
+#include "vp9_onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+static void lf_init_lut(loop_filter_info_n *lfi) {
+  int filt_lvl;
+
+  for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++) {
+    if (filt_lvl >= 40) {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
+    } else if (filt_lvl >= 20) {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
+    } else if (filt_lvl >= 15) {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
+    } else {
+      lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
+      lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
+    }
+  }
+
+  lfi->mode_lf_lut[DC_PRED] = 1;
+  lfi->mode_lf_lut[D45_PRED] = 1;
+  lfi->mode_lf_lut[D135_PRED] = 1;
+  lfi->mode_lf_lut[D117_PRED] = 1;
+  lfi->mode_lf_lut[D153_PRED] = 1;
+  lfi->mode_lf_lut[D27_PRED] = 1;
+  lfi->mode_lf_lut[D63_PRED] = 1;
+  lfi->mode_lf_lut[V_PRED] = 1;
+  lfi->mode_lf_lut[H_PRED] = 1;
+  lfi->mode_lf_lut[TM_PRED] = 1;
+  lfi->mode_lf_lut[B_PRED] = 0;
+  lfi->mode_lf_lut[I8X8_PRED] = 0;
+  lfi->mode_lf_lut[ZEROMV] = 1;
+  lfi->mode_lf_lut[NEARESTMV] = 2;
+  lfi->mode_lf_lut[NEARMV] = 2;
+  lfi->mode_lf_lut[NEWMV] = 2;
+  lfi->mode_lf_lut[SPLITMV] = 3;
+}
+
+void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl) {
+  int i;
+
+  /* For each possible value for the loop filter fill out limits */
+  for (i = 0; i <= MAX_LOOP_FILTER; i++) {
+    int filt_lvl = i;
+    int block_inside_limit = 0;
+
+    /* Set loop filter parameters that control sharpness.
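+     * (Worked example, for illustration only: with filt_lvl = 32 and
+     * sharpness_lvl = 5, block_inside_limit = (32 >> 1) >> 1 = 8, which
+     * is then capped at 9 - 5 = 4; the memsets below then give lim = 4,
+     * blim = 2 * 32 + 4 = 68 and mblim = 2 * (32 + 2) + 4 = 72.)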
*/ + block_inside_limit = filt_lvl >> (sharpness_lvl > 0); + block_inside_limit = block_inside_limit >> (sharpness_lvl > 4); + + if (sharpness_lvl > 0) { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) + block_inside_limit = 1; + + vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH); + vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit), + SIMD_WIDTH); + vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit), + SIMD_WIDTH); + } +} + +void vp9_loop_filter_init(VP9_COMMON *cm) { + loop_filter_info_n *lfi = &cm->lf_info; + int i; + + /* init limits for given sharpness*/ + vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + + /* init LUT for lvl and hev thr picking */ + lf_init_lut(lfi); + + /* init hev threshold const vectors */ + for (i = 0; i < 4; i++) { + vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH); + } +} + +void vp9_loop_filter_frame_init(VP9_COMMON *cm, + MACROBLOCKD *xd, + int default_filt_lvl) { + int seg, /* segment number */ + ref, /* index in ref_lf_deltas */ + mode; /* index in mode_lf_deltas */ + + loop_filter_info_n *lfi = &cm->lf_info; + + /* update limits if sharpness has changed */ + if (cm->last_sharpness_level != cm->sharpness_level) { + vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level); + cm->last_sharpness_level = cm->sharpness_level; + } + + for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) { + int lvl_seg = default_filt_lvl; + int lvl_ref, lvl_mode; + + + // Set the baseline filter values for each segment + if (vp9_segfeature_active(xd, seg, SEG_LVL_ALT_LF)) { + /* Abs value */ + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { + lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); + } else { /* Delta Value */ + lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); + lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0; + } + } + + if (!xd->mode_ref_lf_delta_enabled) { + /* we could get rid of this if we assume that deltas are set to + * zero when not in use; encoder always uses deltas + */ + vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4); + continue; + } + + lvl_ref = lvl_seg; + + /* INTRA_FRAME */ + ref = INTRA_FRAME; + + /* Apply delta for reference frame */ + lvl_ref += xd->ref_lf_deltas[ref]; + + /* Apply delta for Intra modes */ + mode = 0; /* B_PRED */ + /* Only the split mode BPRED has a further special case */ + lvl_mode = lvl_ref + xd->mode_lf_deltas[mode]; + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */ + + lfi->lvl[seg][ref][mode] = lvl_mode; + + mode = 1; /* all the rest of Intra modes */ + lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */ + lfi->lvl[seg][ref][mode] = lvl_mode; + + /* LAST, GOLDEN, ALT */ + for (ref = 1; ref < MAX_REF_FRAMES; ref++) { + int lvl_ref = lvl_seg; + + /* Apply delta for reference frame */ + lvl_ref += xd->ref_lf_deltas[ref]; + + /* Apply delta for Inter modes */ + for (mode = 1; mode < 4; mode++) { + lvl_mode = lvl_ref + xd->mode_lf_deltas[mode]; + lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 
63 : lvl_mode) : 0; /* clamp */ + + lfi->lvl[seg][ref][mode] = lvl_mode; + } + } + } +} + +void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd) { + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + loop_filter_info_n *lfi_n = &cm->lf_info; + struct loop_filter_info lfi; + + FRAME_TYPE frame_type = cm->frame_type; + + int mb_row; + int mb_col; + + int filter_level; + + unsigned char *y_ptr, *u_ptr, *v_ptr; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + + /* Initialize the loop filter for this frame. */ + vp9_loop_filter_frame_init(cm, xd, cm->filter_level); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + u_ptr = post->u_buffer; + v_ptr = post->v_buffer; + + /* vp9_filter each macro block */ + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != I8X8_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + int tx_type = mode_info_context->mbmi.txfm_size; + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) { + if (cm->filter_type == NORMAL_LOOPFILTER) { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0 +#if CONFIG_SUPERBLOCKS + && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb && + mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-1].mbmi.mb_skip_coeff) +#endif + ) + vp9_loop_filter_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, + post->uv_stride, &lfi); + + if (!skip_lf && tx_type != TX_16X16) { + if (tx_type == TX_8X8) + vp9_loop_filter_bv8x8(y_ptr, u_ptr, v_ptr, post->y_stride, + post->uv_stride, &lfi); + else + vp9_loop_filter_bv(y_ptr, u_ptr, v_ptr, post->y_stride, + post->uv_stride, &lfi); + + } + + /* don't apply across umv border */ + if (mb_row > 0 +#if CONFIG_SUPERBLOCKS + && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb && + mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff) +#endif + ) + vp9_loop_filter_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, + post->uv_stride, &lfi); + + if (!skip_lf && tx_type != TX_16X16) { + if (tx_type == TX_8X8) + vp9_loop_filter_bh8x8(y_ptr, u_ptr, v_ptr, post->y_stride, + post->uv_stride, &lfi); + else + vp9_loop_filter_bh(y_ptr, u_ptr, v_ptr, post->y_stride, + post->uv_stride, &lfi); + } + } else { + // FIXME: Not 8x8 aware + if (mb_col > 0 +#if CONFIG_SUPERBLOCKS + && !((mb_col & 1) && mode_info_context->mbmi.encoded_as_sb && + mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-1].mbmi.mb_skip_coeff) +#endif + ) + vp9_loop_filter_simple_mbv(y_ptr, post->y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp9_loop_filter_simple_bv(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0 +#if CONFIG_SUPERBLOCKS + && !((mb_row & 1) && mode_info_context->mbmi.encoded_as_sb && + mode_info_context[0].mbmi.mb_skip_coeff && + mode_info_context[-cm->mode_info_stride].mbmi.mb_skip_coeff) +#endif + ) + vp9_loop_filter_simple_mbh(y_ptr, post->y_stride, 
+ lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp9_loop_filter_simple_bh(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + u_ptr += 8; + v_ptr += 8; + + mode_info_context++; /* step to next MB */ + } + + y_ptr += post->y_stride * 16 - post->y_width; + u_ptr += post->uv_stride * 8 - post->uv_width; + v_ptr += post->uv_stride * 8 - post->uv_width; + + mode_info_context++; /* Skip border mb */ + } +} + +void vp9_loop_filter_frame_yonly(VP9_COMMON *cm, MACROBLOCKD *xd, + int default_filt_lvl) { + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + + loop_filter_info_n *lfi_n = &cm->lf_info; + struct loop_filter_info lfi; + + int filter_level; + FRAME_TYPE frame_type = cm->frame_type; + + /* Point at base of Mb MODE_INFO list */ + const MODE_INFO *mode_info_context = cm->mi; + +#if 0 + if (default_filt_lvl == 0) /* no filter applied */ + return; +#endif + + /* Initialize the loop filter for this frame. */ + vp9_loop_filter_frame_init(cm, xd, default_filt_lvl); + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer; + + /* vp9_filter each macro block */ + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != I8X8_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode]; + const int seg = mode_info_context->mbmi.segment_id; + const int ref_frame = mode_info_context->mbmi.ref_frame; + int tx_type = mode_info_context->mbmi.txfm_size; + filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) { + if (cm->filter_type == NORMAL_LOOPFILTER) { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf && tx_type != TX_16X16) { + if (tx_type == TX_8X8) + vp9_loop_filter_bv8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi); + else + vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + + /* don't apply across umv border */ + if (mb_row > 0) + vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf && tx_type != TX_16X16) { + if (tx_type == TX_8X8) + vp9_loop_filter_bh8x8(y_ptr, 0, 0, post->y_stride, 0, &lfi); + else + vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi); + } + } else { + // FIXME: Not 8x8 aware + if (mb_col > 0) + vp9_loop_filter_simple_mbv(y_ptr, post->y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp9_loop_filter_simple_bv(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + + /* don't apply across umv border */ + if (mb_row > 0) + vp9_loop_filter_simple_mbh(y_ptr, post->y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp9_loop_filter_simple_bh(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context++; /* step to next MB */ + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context++; /* Skip border mb */ + } +} + +void vp9_loop_filter_partial_frame(VP9_COMMON *cm, MACROBLOCKD *xd, + int default_filt_lvl) { + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + + unsigned char *y_ptr; + int mb_row; + int mb_col; + int mb_cols = post->y_width >> 4; + + int 
linestocopy, i; + + loop_filter_info_n *lfi_n = &cm->lf_info; + struct loop_filter_info lfi; + + int filter_level; + int alt_flt_enabled = xd->segmentation_enabled; + FRAME_TYPE frame_type = cm->frame_type; + + const MODE_INFO *mode_info_context; + + int lvl_seg[MAX_MB_SEGMENTS]; + + mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); + + /* 3 is a magic number. 4 is probably magic too */ + linestocopy = (post->y_height >> (4 + 3)); + + if (linestocopy < 1) + linestocopy = 1; + + linestocopy <<= 4; + + /* Note the baseline filter values for each segment */ + /* See vp9_loop_filter_frame_init. Rather than call that for each change + * to default_filt_lvl, copy the relevant calculation here. + */ + if (alt_flt_enabled) { + for (i = 0; i < MAX_MB_SEGMENTS; i++) { + /* Abs value */ + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { + lvl_seg[i] = vp9_get_segdata(xd, i, SEG_LVL_ALT_LF); + } + /* Delta Value */ + else { + lvl_seg[i] = default_filt_lvl + + vp9_get_segdata(xd, i, SEG_LVL_ALT_LF); + lvl_seg[i] = (lvl_seg[i] > 0) ? + ((lvl_seg[i] > 63) ? 63 : lvl_seg[i]) : 0; + } + } + } + + /* Set up the buffer pointers */ + y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride; + + /* vp9_filter each macro block */ + for (mb_row = 0; mb_row < (linestocopy >> 4); mb_row++) { + for (mb_col = 0; mb_col < mb_cols; mb_col++) { + int skip_lf = (mode_info_context->mbmi.mode != B_PRED && + mode_info_context->mbmi.mode != I8X8_PRED && + mode_info_context->mbmi.mode != SPLITMV && + mode_info_context->mbmi.mb_skip_coeff); + + if (alt_flt_enabled) + filter_level = lvl_seg[mode_info_context->mbmi.segment_id]; + else + filter_level = default_filt_lvl; + + if (filter_level) { + if (cm->filter_type == NORMAL_LOOPFILTER) { + const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level]; + lfi.mblim = lfi_n->mblim[filter_level]; + lfi.blim = lfi_n->blim[filter_level]; + lfi.lim = lfi_n->lim[filter_level]; + lfi.hev_thr = lfi_n->hev_thr[hev_index]; + + if (mb_col > 0) + vp9_loop_filter_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp9_loop_filter_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + vp9_loop_filter_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi); + + if (!skip_lf) + vp9_loop_filter_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi); + } else { + if (mb_col > 0) + vp9_loop_filter_simple_mbv (y_ptr, post->y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp9_loop_filter_simple_bv(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + + vp9_loop_filter_simple_mbh(y_ptr, post->y_stride, + lfi_n->mblim[filter_level]); + + if (!skip_lf) + vp9_loop_filter_simple_bh(y_ptr, post->y_stride, + lfi_n->blim[filter_level]); + } + } + + y_ptr += 16; + mode_info_context += 1; /* step to next MB */ + } + + y_ptr += post->y_stride * 16 - post->y_width; + mode_info_context += 1; /* Skip border mb */ + } +} diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h new file mode 100644 index 000000000..b3254f692 --- /dev/null +++ b/vp9/common/vp9_loopfilter.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef loopfilter_h
+#define loopfilter_h
+
+#include "vpx_ports/mem.h"
+#include "vpx_config.h"
+#include "vp9_blockd.h"
+
+#define MAX_LOOP_FILTER 63
+
+typedef enum {
+  NORMAL_LOOPFILTER = 0,
+  SIMPLE_LOOPFILTER = 1
+} LOOPFILTERTYPE;
+
+#if ARCH_ARM
+#define SIMD_WIDTH 1
+#else
+#define SIMD_WIDTH 16
+#endif
+
+/* Need to align this structure so when it is declared and
+ * passed it can be loaded into vector registers.
+ */
+typedef struct {
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
+  DECLARE_ALIGNED(SIMD_WIDTH, unsigned char,
+                  hev_thr[4][SIMD_WIDTH]);
+  unsigned char lvl[4][4][4];
+  unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
+  unsigned char mode_lf_lut[MB_MODE_COUNT];
+} loop_filter_info_n;
+
+struct loop_filter_info {
+  const unsigned char *mblim;
+  const unsigned char *blim;
+  const unsigned char *lim;
+  const unsigned char *hev_thr;
+};
+
+#define prototype_loopfilter(sym) \
+  void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+           const unsigned char *limit, const unsigned char *thresh, int count)
+
+#define prototype_loopfilter_block(sym) \
+  void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
+           int ystride, int uv_stride, struct loop_filter_info *lfi)
+
+#define prototype_simple_loopfilter(sym) \
+  void sym(unsigned char *y, int ystride, const unsigned char *blimit)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/vp9_loopfilter_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/vp9_loopfilter_arm.h"
+#endif
+
+typedef void loop_filter_uvfunction(unsigned char *u,   /* source pointer */
+                                    int p,              /* pitch */
+                                    const unsigned char *blimit,
+                                    const unsigned char *limit,
+                                    const unsigned char *thresh,
+                                    unsigned char *v);
+
+/* assorted loopfilter functions which get used elsewhere */
+struct VP9Common;
+struct macroblockd;
+
+void vp9_loop_filter_init(struct VP9Common *cm);
+
+void vp9_loop_filter_frame_init(struct VP9Common *cm,
+                                struct macroblockd *mbd,
+                                int default_filt_lvl);
+
+void vp9_loop_filter_frame(struct VP9Common *cm, struct macroblockd *mbd);
+
+void vp9_loop_filter_partial_frame(struct VP9Common *cm,
+                                   struct macroblockd *mbd,
+                                   int default_filt_lvl);
+
+void vp9_loop_filter_frame_yonly(struct VP9Common *cm,
+                                 struct macroblockd *mbd,
+                                 int default_filt_lvl);
+
+void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl);
+
+#endif  // loopfilter_h
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
new file mode 100644
index 000000000..9fd6c5fa6
--- /dev/null
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include "vpx_config.h"
+#include "vp9_loopfilter.h"
+#include "vp9_onyxc_int.h"
+
+typedef unsigned char uc;
+
+static __inline signed char signed_char_clamp(int t) {
+  t = (t < -128 ? -128 : t);
+  t = (t > 127 ?
127 : t); + return (signed char) t; +} + + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +static __inline signed char filter_mask(uc limit, uc blimit, + uc p3, uc p2, uc p1, uc p0, + uc q0, uc q1, uc q2, uc q3) { + signed char mask = 0; + mask |= (abs(p3 - p2) > limit) * -1; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(q3 - q2) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = ~mask; + return mask; +} + +/* is there high variance internal edge ( 11111111 yes, 00000000 no) */ +static __inline signed char hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1) { + signed char hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static __inline void filter(signed char mask, uc hev, uc *op1, + uc *op0, uc *oq0, uc *oq1) + +{ + signed char ps0, qs0; + signed char ps1, qs1; + signed char filter, Filter1, Filter2; + signed char u; + + ps1 = (signed char) * op1 ^ 0x80; + ps0 = (signed char) * op0 ^ 0x80; + qs0 = (signed char) * oq0 ^ 0x80; + qs1 = (signed char) * oq1 ^ 0x80; + + /* add outer taps if we have high edge variance */ + filter = signed_char_clamp(ps1 - qs1); + filter &= hev; + + /* inner taps */ + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)); + filter &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 + * if it equals 4 we'll set to adjust by -1 to account for the fact + * we'd round 3 the other way + */ + Filter1 = signed_char_clamp(filter + 4); + Filter2 = signed_char_clamp(filter + 3); + Filter1 >>= 3; + Filter2 >>= 3; + u = signed_char_clamp(qs0 - Filter1); + *oq0 = u ^ 0x80; + u = signed_char_clamp(ps0 + Filter2); + *op0 = u ^ 0x80; + filter = Filter1; + + /* outer tap adjustments */ + filter += 1; + filter >>= 1; + filter &= ~hev; + + u = signed_char_clamp(qs1 - filter); + *oq1 = u ^ 0x80; + u = signed_char_clamp(ps1 + filter); + *op1 = u ^ 0x80; + +} + +void vp9_loop_filter_horizontal_edge_c +( + unsigned char *s, + int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) { + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. + */ + do { + mask = filter_mask(limit[0], blimit[0], + s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], + s[0 * p], s[1 * p], s[2 * p], s[3 * p]); + + hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]); + + filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + + ++s; + } while (++i < count * 8); +} + +void vp9_loop_filter_vertical_edge_c(unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. 
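+   * (A note on the convention used by these helpers: each boolean
+   * comparison is multiplied by -1 so that "true" becomes 0xff and
+   * "false" 0x00; the byte masks returned by filter_mask() and hevmask()
+   * can then be ANDed directly with the filter taps, mirroring what the
+   * SIMD versions do.)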
+ */ + do { + mask = filter_mask(limit[0], blimit[0], + s[-4], s[-3], s[-2], s[-1], + s[0], s[1], s[2], s[3]); + + hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + + filter(mask, hev, s - 2, s - 1, s, s + 1); + + s += p; + } while (++i < count * 8); +} +static __inline signed char flatmask(uc thresh, + uc p4, uc p3, uc p2, uc p1, uc p0, + uc q0, uc q1, uc q2, uc q3, uc q4) { + signed char flat = 0; + flat |= (abs(p1 - p0) > 1) * -1; + flat |= (abs(q1 - q0) > 1) * -1; + flat |= (abs(p0 - p2) > 1) * -1; + flat |= (abs(q0 - q2) > 1) * -1; + flat |= (abs(p3 - p0) > 1) * -1; + flat |= (abs(q3 - q0) > 1) * -1; + flat |= (abs(p4 - p0) > 1) * -1; + flat |= (abs(q4 - q0) > 1) * -1; + flat = ~flat; + return flat; +} + +static __inline void mbfilter(signed char mask, uc hev, uc flat, + uc *op4, uc *op3, uc *op2, uc *op1, uc *op0, + uc *oq0, uc *oq1, uc *oq2, uc *oq3, uc *oq4) { + /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */ + if (flat && mask) { + unsigned char p0, q0; + unsigned char p1, q1; + unsigned char p2, q2; + unsigned char p3, q3; + unsigned char p4, q4; + + p4 = *op4; + p3 = *op3; + p2 = *op2; + p1 = *op1; + p0 = *op0; + q0 = *oq0; + q1 = *oq1; + q2 = *oq2; + q3 = *oq3; + q4 = *oq4; + + *op2 = (p4 + p4 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3; + *op1 = (p4 + p3 + p2 + p1 + p1 + p0 + q0 + q1 + 4) >> 3; + *op0 = (p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2 + 4) >> 3; + *oq0 = (p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3 + 4) >> 3; + *oq1 = (p1 + p0 + q0 + q1 + q1 + q2 + q3 + q4 + 4) >> 3; + *oq2 = (p0 + q0 + q1 + q2 + q2 + q3 + q4 + q4 + 4) >> 3; + } else { + signed char ps0, qs0; + signed char ps1, qs1; + signed char filter, Filter1, Filter2; + signed char u; + + ps1 = (signed char) * op1 ^ 0x80; + ps0 = (signed char) * op0 ^ 0x80; + qs0 = (signed char) * oq0 ^ 0x80; + qs1 = (signed char) * oq1 ^ 0x80; + + /* add outer taps if we have high edge variance */ + filter = signed_char_clamp(ps1 - qs1); + filter &= hev; + + /* inner taps */ + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)); + filter &= mask; + + Filter1 = signed_char_clamp(filter + 4); + Filter2 = signed_char_clamp(filter + 3); + Filter1 >>= 3; + Filter2 >>= 3; + + u = signed_char_clamp(qs0 - Filter1); + *oq0 = u ^ 0x80; + u = signed_char_clamp(ps0 + Filter2); + *op0 = u ^ 0x80; + filter = Filter1; + + /* outer tap adjustments */ + filter += 1; + filter >>= 1; + filter &= ~hev; + + u = signed_char_clamp(qs1 - filter); + *oq1 = u ^ 0x80; + u = signed_char_clamp(ps1 + filter); + *op1 = u ^ 0x80; + } +} +void vp9_mbloop_filter_horizontal_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) { + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + signed char flat = 0; + int i = 0; + + /* loop filter designed to work using chars so that we can make maximum use + * of 8 bit simd instructions. 
+ */ + do { + + mask = filter_mask(limit[0], blimit[0], + s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], + s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p]); + + hev = hevmask(thresh[0], s[-2 * p], s[-1 * p], s[0 * p], s[1 * p]); + + flat = flatmask(thresh[0], + s[-5 * p], s[-4 * p], s[-3 * p], s[-2 * p], s[-1 * p], + s[ 0 * p], s[ 1 * p], s[ 2 * p], s[ 3 * p], s[ 4 * p]); + mbfilter(mask, hev, flat, + s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p); + + ++s; + } while (++i < count * 8); + +} +void vp9_mbloop_filter_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count +) { + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + signed char flat = 0; + int i = 0; + + do { + + mask = filter_mask(limit[0], blimit[0], + s[-4], s[-3], s[-2], s[-1], + s[0], s[1], s[2], s[3]); + + hev = hevmask(thresh[0], s[-2], s[-1], s[0], s[1]); + flat = flatmask(thresh[0], + s[-5], s[-4], s[-3], s[-2], s[-1], + s[ 0], s[ 1], s[ 2], s[ 3], s[ 4]); + mbfilter(mask, hev, flat, + s - 5, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3, s + 4); + s += p; + } while (++i < count * 8); + +} + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +static __inline signed char simple_filter_mask(uc blimit, + uc p1, uc p0, + uc q0, uc q1) { + /* Why does this cause problems for win32? + * error C2143: syntax error : missing ';' before 'type' + * (void) limit; + */ + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1; + return mask; +} + +static __inline void simple_filter(signed char mask, + uc *op1, uc *op0, + uc *oq0, uc *oq1) { + signed char filter, Filter1, Filter2; + signed char p1 = (signed char) * op1 ^ 0x80; + signed char p0 = (signed char) * op0 ^ 0x80; + signed char q0 = (signed char) * oq0 ^ 0x80; + signed char q1 = (signed char) * oq1 ^ 0x80; + signed char u; + + filter = signed_char_clamp(p1 - q1); + filter = signed_char_clamp(filter + 3 * (q0 - p0)); + filter &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = signed_char_clamp(filter + 4); + Filter1 >>= 3; + u = signed_char_clamp(q0 - Filter1); + *oq0 = u ^ 0x80; + + Filter2 = signed_char_clamp(filter + 3); + Filter2 >>= 3; + u = signed_char_clamp(p0 + Filter2); + *op0 = u ^ 0x80; +} + +void vp9_loop_filter_simple_horizontal_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit +) { + signed char mask = 0; + int i = 0; + + do { + mask = simple_filter_mask(blimit[0], + s[-2 * p], s[-1 * p], + s[0 * p], s[1 * p]); + simple_filter(mask, + s - 2 * p, s - 1 * p, + s, s + 1 * p); + ++s; + } while (++i < 16); +} + +void vp9_loop_filter_simple_vertical_edge_c +( + unsigned char *s, + int p, + const unsigned char *blimit +) { + signed char mask = 0; + int i = 0; + + do { + mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); + simple_filter(mask, s - 2, s - 1, s, s + 1); + s += p; + } while (++i < 16); + +} + +/* Vertical MB Filtering */ +void vp9_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_mbloop_filter_vertical_edge_c(y_ptr, y_stride, + lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, + lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, + lfi->mblim, lfi->lim, 
lfi->hev_thr, 1); +} + +/* Vertical B Filtering */ +void vp9_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +/* Horizontal MB filtering */ +void vp9_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, + lfi->mblim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, + lfi->mblim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, + lfi->mblim, lfi->lim, lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp9_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp9_loop_filter_bh8x8_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_mbloop_filter_horizontal_edge_c( + y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +} + +void vp9_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, + y_stride, blimit); + vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, + y_stride, blimit); + vp9_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, + y_stride, blimit); +} + +void vp9_loop_filter_bv8x8_c(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_mbloop_filter_vertical_edge_c( + y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +} + +void vp9_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp9_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit); + vp9_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit); + vp9_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit); +} diff --git a/vp9/common/vp9_maskingmv.c b/vp9/common/vp9_maskingmv.c new file mode 100644 index 000000000..f1151e3dc --- /dev/null +++ b/vp9/common/vp9_maskingmv.c @@ -0,0 +1,806 @@ +/* + ============================================================================ + Name : vp9_maskingmv.c + Author : jimbankoski + 
Version :
+ Copyright : Your copyright notice
+ Description : Hello World in C, Ansi-style
+ ============================================================================
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+extern unsigned int vp9_sad16x16_sse3(
+  unsigned char *src_ptr,
+  int src_stride,
+  unsigned char *ref_ptr,
+  int ref_stride,
+  int max_err);
+
+extern void vp9_sad16x16x3_sse3(
+  unsigned char *src_ptr,
+  int src_stride,
+  unsigned char *ref_ptr,
+  int ref_stride,
+  int *results);
+
+extern int vp8_growmaskmb_sse3(
+  unsigned char *om,
+  unsigned char *nm);
+
+extern void vp8_makemask_sse3(
+  unsigned char *y,
+  unsigned char *u,
+  unsigned char *v,
+  unsigned char *ym,
+  int yp,
+  int uvp,
+  int ys,
+  int us,
+  int vs,
+  int yt,
+  int ut,
+  int vt);
+
+unsigned int vp9_sad16x16_unmasked_wmt(
+  unsigned char *src_ptr,
+  int src_stride,
+  unsigned char *ref_ptr,
+  int ref_stride,
+  unsigned char *mask);
+
+unsigned int vp9_sad16x16_masked_wmt(
+  unsigned char *src_ptr,
+  int src_stride,
+  unsigned char *ref_ptr,
+  int ref_stride,
+  unsigned char *mask);
+
+unsigned int vp8_masked_predictor_wmt(
+  unsigned char *masked,
+  unsigned char *unmasked,
+  int src_stride,
+  unsigned char *dst_ptr,
+  int dst_stride,
+  unsigned char *mask);
+unsigned int vp8_masked_predictor_uv_wmt(
+  unsigned char *masked,
+  unsigned char *unmasked,
+  int src_stride,
+  unsigned char *dst_ptr,
+  int dst_stride,
+  unsigned char *mask);
+unsigned int vp8_uv_from_y_mask(
+  unsigned char *ymask,
+  unsigned char *uvmask);
+int yp = 16;
+unsigned char sxy[] = {
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90,
+  40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90,
90, 90, 90, 90, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90 +}; + +unsigned char sts[] = { + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; +unsigned char str[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; + +unsigned char y[] = { + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, + 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, + 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, + 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, + 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, + 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, + 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40 +}; +int uvp = 8; +unsigned char u[] = { + 90, 80, 70, 70, 90, 90, 90, 17, + 90, 80, 70, 70, 90, 90, 90, 17, + 84, 70, 70, 90, 90, 90, 17, 17, + 84, 70, 70, 90, 90, 90, 17, 17, + 80, 70, 70, 90, 90, 90, 17, 17, + 90, 80, 70, 70, 90, 90, 90, 17, + 90, 80, 70, 70, 90, 90, 90, 17, + 90, 80, 70, 70, 90, 90, 90, 17 +}; + +unsigned char v[] = { + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80 +}; + +unsigned char ym[256]; +unsigned char 
uvm[64];
+typedef struct {
+  unsigned char y;
+  unsigned char yt;
+  unsigned char u;
+  unsigned char ut;
+  unsigned char v;
+  unsigned char vt;
+  unsigned char use;
+} COLOR_SEG_ELEMENT;
+
+/*
+COLOR_SEG_ELEMENT segmentation[]=
+{
+  { 60,4,80,17,80,10, 1},
+  { 40,4,15,10,80,10, 1},
+};
+*/
+
+COLOR_SEG_ELEMENT segmentation[] = {
+  { 79, 44, 92, 44, 237, 60, 1},
+};
+
+unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v,
+                         COLOR_SEG_ELEMENT sgm[],
+                         int c) {
+  COLOR_SEG_ELEMENT *s = sgm;
+  unsigned char m = 0;
+  int i;
+  for (i = 0; i < c; i++, s++)
+    m |= (abs(y - s->y) < s->yt &&
+          abs(u - s->u) < s->ut &&
+          abs(v - s->v) < s->vt ? 255 : 0);
+
+  return m;
+}
+int neighbors[256][8];
+int makeneighbors(void) {
+  int i, j;
+  for (i = 0; i < 256; i++) {
+    int r = (i >> 4), c = (i & 15);
+    int ni = 0;
+    for (j = 0; j < 8; j++)
+      neighbors[i][j] = i;
+    for (j = 0; j < 256; j++) {
+      int nr = (j >> 4), nc = (j & 15);
+      // skip the center cell itself; a full 3x3 neighborhood has 9 cells,
+      // which would write one entry past the end of neighbors[i]
+      if (j != i && abs(nr - r) < 2 && abs(nc - c) < 2)
+        neighbors[i][ni++] = j;
+    }
+  }
+  return 0;
+}
+void grow_ymask(unsigned char *ym) {
+  unsigned char nym[256];
+  int i, j;
+
+  for (i = 0; i < 256; i++) {
+    nym[i] = ym[i];
+    for (j = 0; j < 8; j++) {
+      nym[i] |= ym[neighbors[i][j]];
+    }
+  }
+  for (i = 0; i < 256; i++)
+    ym[i] = nym[i];
+}
+void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v,
+                  unsigned char *ym, unsigned char *uvm,
+                  int yp, int uvp,
+                  COLOR_SEG_ELEMENT sgm[],
+                  int count) {
+  int r, c;
+  unsigned char *oym = ym;
+
+  memset(ym, 20, 256);
+  for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32)
+    for (c = 0; c < 8; c++) {
+      int y1 = y[c << 1];
+      int u1 = u[c];
+      int v1 = v[c];
+      int m = pixel_mask(y1, u1, v1, sgm, count);
+      uvm[c] = m;
+      ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count);
+      ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count);
+      ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count);
+      ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count);
+    }
+  grow_ymask(oym);
+}
+
+int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp,
+               unsigned char *ym) {
+  int i, j;
+  unsigned sad = 0;
+  for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16)
+    for (j = 0; j < 16; j++)
+      if (ym[j])
+        sad += abs(src[j] - dst[j]);
+
+  return sad;
+}
+
+int compare_masks(unsigned char *sym, unsigned char *ym) {
+  int i, j;
+  unsigned sad = 0;
+  for (i = 0; i < 16; i++, sym += 16, ym += 16)
+    for (j = 0; j < 16; j++)
+      sad += (sym[j] != ym[j] ?
1 : 0); + + return sad; +} +int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp, + unsigned char *ym) { + int i, j; + unsigned sad = 0; + for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16) + for (j = 0; j < 16; j++) + if (!ym[j]) + sad += abs(src[j] - dst[j]); + + return sad; +} +int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, + int yp, int uvp, + unsigned char *dy, unsigned char *du, unsigned char *dv, + int dyp, int duvp, + COLOR_SEG_ELEMENT sgm[], + int count, + int *mi, + int *mj, + int *ui, + int *uj, + int *wm) { + int i, j; + + unsigned char ym[256]; + unsigned char uvm[64]; + unsigned char dym[256]; + unsigned char duvm[64]; + unsigned int e = 0; + int beste = 256; + int bmi = -32, bmj = -32; + int bui = -32, buj = -32; + int beste1 = 256; + int bmi1 = -32, bmj1 = -32; + int bui1 = -32, buj1 = -32; + int obeste; + + // first try finding best mask and then unmasked + beste = 0xffffffff; + + // find best unmasked mv + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + unsigned char *duz = i / 2 * duvp + du; + unsigned char *dvz = i / 2 * duvp + dv; + for (j = -32; j < 32; j++) { + // 0,0 masked destination + make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); + + e = unmasked_sad(y, yp, dyz + j, dyp, dym); + + if (e < beste) { + bui = i; + buj = j; + beste = e; + } + } + } + // bui=0;buj=0; + // best mv masked destination + make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, + dym, duvm, dyp, duvp, sgm, count); + + obeste = beste; + beste = 0xffffffff; + + // find best masked + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + for (j = -32; j < 32; j++) { + e = masked_sad(y, yp, dyz + j, dyp, dym); + + if (e < beste) { + bmi = i; + bmj = j; + beste = e; + } + } + } + beste1 = beste + obeste; + bmi1 = bmi; + bmj1 = bmj; + bui1 = bui; + buj1 = buj; + + beste = 0xffffffff; + // source mask + make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count); + + // find best mask + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + unsigned char *duz = i / 2 * duvp + du; + unsigned char *dvz = i / 2 * duvp + dv; + for (j = -32; j < 32; j++) { + // 0,0 masked destination + make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); + + e = compare_masks(ym, dym); + + if (e < beste) { + bmi = i; + bmj = j; + beste = e; + } + } + } + + + // best mv masked destination + make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, + dym, duvm, dyp, duvp, sgm, count); + + obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym); + + beste = 0xffffffff; + + // find best unmasked mv + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + for (j = -32; j < 32; j++) { + e = unmasked_sad(y, yp, dyz + j, dyp, dym); + + if (e < beste) { + bui = i; + buj = j; + beste = e; + } + } + } + beste += obeste; + + + if (beste < beste1) { + *mi = bmi; + *mj = bmj; + *ui = bui; + *uj = buj; + *wm = 1; + } else { + *mi = bmi1; + *mj = bmj1; + *ui = bui1; + *uj = buj1; + *wm = 0; + + } + return 0; +} + +int predict(unsigned char *src, int p, unsigned char *dst, int dp, + unsigned char *ym, unsigned char *prd) { + int i, j; + for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16) + for (j = 0; j < 16; j++) + prd[j] = (ym[j] ? 
src[j] : dst[j]); + return 0; +} + +int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, + int yp, int uvp, + unsigned char *dy, unsigned char *du, unsigned char *dv, + int dyp, int duvp, + COLOR_SEG_ELEMENT sgm[], + int count, + int *mi, + int *mj, + int *ui, + int *uj, + int *wm) { + int i, j; + + unsigned char ym[256]; + unsigned char ym2[256]; + unsigned char uvm[64]; + unsigned char dym2[256]; + unsigned char dym[256]; + unsigned char duvm[64]; + unsigned int e = 0; + int beste = 256; + int bmi = -32, bmj = -32; + int bui = -32, buj = -32; + int beste1 = 256; + int bmi1 = -32, bmj1 = -32; + int bui1 = -32, buj1 = -32; + int obeste; + + // first try finding best mask and then unmasked + beste = 0xffffffff; + +#if 0 + for (i = 0; i < 16; i++) { + unsigned char *dy = i * yp + y; + for (j = 0; j < 16; j++) + printf("%2x", dy[j]); + printf("\n"); + } + printf("\n"); + + for (i = -32; i < 48; i++) { + unsigned char *dyz = i * dyp + dy; + for (j = -32; j < 48; j++) + printf("%2x", dyz[j]); + printf("\n"); + } +#endif + + // find best unmasked mv + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + unsigned char *duz = i / 2 * duvp + du; + unsigned char *dvz = i / 2 * duvp + dv; + for (j = -32; j < 32; j++) { + // 0,0 masked destination + vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + + e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); + + if (e < beste) { + bui = i; + buj = j; + beste = e; + } + } + } + // bui=0;buj=0; + // best mv masked destination + + vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, + dym, dyp, duvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + + obeste = beste; + beste = 0xffffffff; + + // find best masked + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + for (j = -32; j < 32; j++) { + e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2); + if (e < beste) { + bmi = i; + bmj = j; + beste = e; + } + } + } + beste1 = beste + obeste; + bmi1 = bmi; + bmj1 = bmj; + bui1 = bui; + buj1 = buj; + + // source mask + vp8_makemask_sse3(y, u, v, + ym, yp, uvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(ym, ym2); + + // find best mask + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + unsigned char *duz = i / 2 * duvp + du; + unsigned char *dvz = i / 2 * duvp + dv; + for (j = -32; j < 32; j++) { + // 0,0 masked destination + vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + + e = compare_masks(ym2, dym2); + + if (e < beste) { + bmi = i; + bmj = j; + beste = e; + } + } + } + + vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, + dym, dyp, duvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + + obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2); + + beste = 0xffffffff; + + // find best unmasked mv + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + for (j = -32; j < 32; j++) { + e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); + + if (e < beste) { + bui = i; + buj = j; + beste = e; + } + } + } + beste += obeste; + + if (beste < 
beste1) { + *mi = bmi; + *mj = bmj; + *ui = bui; + *uj = buj; + *wm = 1; + } else { + *mi = bmi1; + *mj = bmj1; + *ui = bui1; + *uj = buj1; + *wm = 0; + beste = beste1; + + } + return beste; +} + +int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm, + int ymp, int uvmp, + unsigned char *yp, unsigned char *up, unsigned char *vp, + int ypp, int uvpp, + COLOR_SEG_ELEMENT sgm[], + int count, + int mi, + int mj, + int ui, + int uj, + int wm) { + int i, j; + unsigned char dym[256]; + unsigned char dym2[256]; + unsigned char duvm[64]; + unsigned char *yu = ym, *uu = um, *vu = vm; + + unsigned char *dym3 = dym2; + + ym += mi * ymp + mj; + um += mi / 2 * uvmp + mj / 2; + vm += mi / 2 * uvmp + mj / 2; + + yu += ui * ymp + uj; + uu += ui / 2 * uvmp + uj / 2; + vu += ui / 2 * uvmp + uj / 2; + + // best mv masked destination + if (wm) + vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + else + vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3); + vp8_uv_from_y_mask(dym3, duvm); + vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm); + vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm); + + return 0; +} + +unsigned char f0p[1280 * 720 * 3 / 2]; +unsigned char f1p[1280 * 720 * 3 / 2]; +unsigned char prd[1280 * 720 * 3 / 2]; +unsigned char msk[1280 * 720 * 3 / 2]; + + +int mainz(int argc, char *argv[]) { + + FILE *f = fopen(argv[1], "rb"); + FILE *g = fopen(argv[2], "wb"); + int w = atoi(argv[3]), h = atoi(argv[4]); + int y_stride = w, uv_stride = w / 2; + int r, c; + unsigned char *f0 = f0p, *f1 = f1p, *t; + unsigned char ym[256], uvm[64]; + unsigned char ym2[256], uvm2[64]; + unsigned char ym3[256], uvm3[64]; + int a, b; + + COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best; +#if 0 + makeneighbors(); + COLOR_SEG_ELEMENT segmentation[] = { + { 60, 4, 80, 17, 80, 10, 1}, + { 40, 4, 15, 10, 80, 10, 1}, + }; + make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1); + + vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8, + (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v, + segmentation[0].yt, segmentation[0].ut, segmentation[0].vt); + + vp8_growmaskmb_sse3(ym, ym3); + + a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3); + b = vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3); + + vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3); + + vp8_uv_from_y_mask(ym3, uvm3); + + return 4; +#endif + makeneighbors(); + + + memset(prd, 128, w * h * 3 / 2); + + fread(f0, w * h * 3 / 2, 1, f); + + while (!feof(f)) { + unsigned char *ys = f1, *yd = f0, *yp = prd; + unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h; + unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4; + fread(f1, w * h * 3 / 2, 1, f); + + ys += 32 * y_stride; + yd += 32 * y_stride; + yp += 32 * y_stride; + us += 16 * uv_stride; + ud += 16 * uv_stride; + up += 16 * uv_stride; + vs += 16 * uv_stride; + vd += 16 * uv_stride; + vp += 16 * uv_stride; + for (r = 32; r < h - 32; r += 16, + ys += 16 * w, yd += 16 * w, yp += 16 * w, + us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride, + vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) { + for (c = 32; c < w - 32; c += 16) { + int mi, mj, ui, uj, wm; + int bmi, bmj, bui, buj, bwm; + unsigned char ym[256]; + + if (vp9_sad16x16_sse3(ys + c, y_stride, 
yd + c, y_stride, 0xffff) == 0)
+          bmi = bmj = bui = buj = bwm = 0;
+        else {
+          COLOR_SEG_ELEMENT cs[5];
+          int j;
+          unsigned int beste = 0xfffffff;
+          unsigned int bestj = 0;
+
+          // try color from last mb segmentation
+          cs[0] = last;
+
+          // try color segs from 4 pixels in mb recon as segmentation
+          cs[1].y = yd[c + y_stride + 1];
+          cs[1].u = ud[c / 2 + uv_stride];
+          cs[1].v = vd[c / 2 + uv_stride];
+          cs[1].yt = cs[1].ut = cs[1].vt = 20;
+          cs[2].y = yd[c + w + 14];
+          cs[2].u = ud[c / 2 + uv_stride + 7];
+          cs[2].v = vd[c / 2 + uv_stride + 7];
+          cs[2].yt = cs[2].ut = cs[2].vt = 20;
+          cs[3].y = yd[c + w * 14 + 1];
+          cs[3].u = ud[c / 2 + uv_stride * 7];
+          cs[3].v = vd[c / 2 + uv_stride * 7];
+          cs[3].yt = cs[3].ut = cs[3].vt = 20;
+          cs[4].y = yd[c + w * 14 + 14];
+          cs[4].u = ud[c / 2 + uv_stride * 7 + 7];
+          cs[4].v = vd[c / 2 + uv_stride * 7 + 7];
+          cs[4].yt = cs[4].ut = cs[4].vt = 20;
+
+          for (j = 0; j < 5; j++) {
+            int e;
+
+            e = fast_masked_motion_search(
+                  ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride,
+                  yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride,
+                  &cs[j], 1, &mi, &mj, &ui, &uj, &wm);
+
+            if (e < beste) {
+              bmi = mi;
+              bmj = mj;
+              bui = ui;
+              buj = uj;
+              bwm = wm;
+              bestj = j;
+              beste = e;
+            }
+          }
+          best = cs[bestj];
+          // best = segmentation[0];
+          last = best;
+        }
+        predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride,
+                    yp + c, up + c / 2, vp + c / 2, w, uv_stride,
+                    &best, 1, bmi, bmj, bui, buj, bwm);
+
+      }
+    }
+    fwrite(prd, w * h * 3 / 2, 1, g);
+    t = f0;
+    f0 = f1;
+    f1 = t;
+
+  }
+  fclose(f);
+  fclose(g);
+  return 0;
+}
diff --git a/vp9/common/vp9_mbpitch.c b/vp9/common/vp9_mbpitch.c
new file mode 100644
index 000000000..b42db987a
--- /dev/null
+++ b/vp9/common/vp9_mbpitch.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + + +#include "vp9_blockd.h" + +typedef enum { + PRED = 0, + DEST = 1 +} BLOCKSET; + +static void setup_block +( + BLOCKD *b, + int mv_stride, + unsigned char **base, + unsigned char **base2, + int Stride, + int offset, + BLOCKSET bs +) { + + if (bs == DEST) { + b->dst_stride = Stride; + b->dst = offset; + b->base_dst = base; + } else { + b->pre_stride = Stride; + b->pre = offset; + b->base_pre = base; + b->base_second_pre = base2; + } + +} + + +static void setup_macroblock(MACROBLOCKD *xd, BLOCKSET bs) { + int block; + + unsigned char **y, **u, **v; + unsigned char **y2, **u2, **v2; + BLOCKD *blockd = xd->block; + int stride; + + if (bs == DEST) { + y = &xd->dst.y_buffer; + u = &xd->dst.u_buffer; + v = &xd->dst.v_buffer; + } else { + y = &xd->pre.y_buffer; + u = &xd->pre.u_buffer; + v = &xd->pre.v_buffer; + + y2 = &xd->second_pre.y_buffer; + u2 = &xd->second_pre.u_buffer; + v2 = &xd->second_pre.v_buffer; + } + + stride = xd->dst.y_stride; + for (block = 0; block < 16; block++) { /* y blocks */ + setup_block(&blockd[block], stride, y, y2, stride, + (block >> 2) * 4 * stride + (block & 3) * 4, bs); + } + + stride = xd->dst.uv_stride; + for (block = 16; block < 20; block++) { /* U and V blocks */ + setup_block(&blockd[block], stride, u, u2, stride, + ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs); + + setup_block(&blockd[block + 4], stride, v, v2, stride, + ((block - 16) >> 1) * 4 * stride + (block & 1) * 4, bs); + } +} + +void vp9_setup_block_dptrs(MACROBLOCKD *xd) { + int r, c; + BLOCKD *blockd = xd->block; + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) { + blockd[r * 4 + c].diff = &xd->diff[r * 4 * 16 + c * 4]; + blockd[r * 4 + c].predictor = xd->predictor + r * 4 * 16 + c * 4; + } + } + + for (r = 0; r < 2; r++) { + for (c = 0; c < 2; c++) { + blockd[16 + r * 2 + c].diff = &xd->diff[256 + r * 4 * 8 + c * 4]; + blockd[16 + r * 2 + c].predictor = + xd->predictor + 256 + r * 4 * 8 + c * 4; + + } + } + + for (r = 0; r < 2; r++) { + for (c = 0; c < 2; c++) { + blockd[20 + r * 2 + c].diff = &xd->diff[320 + r * 4 * 8 + c * 4]; + blockd[20 + r * 2 + c].predictor = + xd->predictor + 320 + r * 4 * 8 + c * 4; + + } + } + + blockd[24].diff = &xd->diff[384]; + + for (r = 0; r < 25; r++) { + blockd[r].qcoeff = xd->qcoeff + r * 16; + blockd[r].dqcoeff = xd->dqcoeff + r * 16; + } +} + +void vp9_build_block_doffsets(MACROBLOCKD *xd) { + + /* handle the destination pitch features */ + setup_macroblock(xd, DEST); + setup_macroblock(xd, PRED); +} diff --git a/vp9/common/vp9_modecont.c b/vp9/common/vp9_modecont.c new file mode 100644 index 000000000..2f5bbd2ba --- /dev/null +++ b/vp9/common/vp9_modecont.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vp9_entropy.h" + +const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4] = { + {223, 1, 1, 237}, // 0,0 best: Only candidate + {87, 166, 26, 219}, // 0,0 best: non zero candidates + {89, 67, 18, 125}, // 0,0 best: non zero candidates, split + {16, 141, 69, 226}, // strong nz candidate(s), no split + {35, 122, 14, 227}, // weak nz candidate(s), no split + {14, 122, 22, 164}, // strong nz candidate(s), split + {16, 70, 9, 183}, // weak nz candidate(s), split +}; +const int vp9_default_mode_contexts_a[INTER_MODE_CONTEXTS][4] = { + {204, 1, 1, 213}, // 0,0 best: Only candidate + {106, 139, 22, 203}, // 0,0 best: non zero candidates + {75, 52, 15, 118}, // 0,0 best: non zero candidates, split + {12, 148, 61, 211}, // strong nz candidate(s), no split + {18, 98, 17, 199}, // weak nz candidate(s), no split + {11, 91, 25, 148}, // strong nz candidate(s), split + {10, 53, 9, 145}, // weak nz candidate(s), split +}; diff --git a/vp9/common/vp9_modecont.h b/vp9/common/vp9_modecont.h new file mode 100644 index 000000000..1fa4558e1 --- /dev/null +++ b/vp9/common/vp9_modecont.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_MODECONT_H +#define __INC_MODECONT_H + +extern const int vp9_default_mode_contexts[INTER_MODE_CONTEXTS][4]; +extern const int vp9_default_mode_contexts_a[INTER_MODE_CONTEXTS][4]; +#endif diff --git a/vp9/common/vp9_modecontext.c b/vp9/common/vp9_modecontext.c new file mode 100644 index 000000000..e18cf63c1 --- /dev/null +++ b/vp9/common/vp9_modecontext.c @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vp9_entropymode.h" + +const unsigned int vp9_kf_default_bmode_counts[VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES] = { + { + /*Above Mode : 0*/ + { 43438, 2195, 470, 316, 615, 171, 217, 412, 124, 160, }, /* left_mode 0 */ + { 5722, 2751, 296, 291, 81, 68, 80, 101, 100, 170, }, /* left_mode 1 */ + { 1629, 201, 307, 25, 47, 16, 34, 72, 19, 28, }, /* left_mode 2 */ + { 332, 266, 36, 500, 20, 65, 23, 14, 154, 106, }, /* left_mode 3 */ + { 450, 97, 10, 24, 117, 10, 2, 12, 8, 71, }, /* left_mode 4 */ + { 384, 49, 29, 44, 12, 162, 51, 5, 87, 42, }, /* left_mode 5 */ + { 495, 53, 157, 27, 14, 57, 180, 17, 17, 34, }, /* left_mode 6 */ + { 695, 64, 62, 9, 27, 5, 3, 147, 10, 26, }, /* left_mode 7 */ + { 230, 54, 20, 124, 16, 125, 29, 12, 283, 37, }, /* left_mode 8 */ + { 260, 87, 21, 120, 32, 16, 33, 16, 33, 203, }, /* left_mode 9 */ + }, + { + /*Above Mode : 1*/ + { 3934, 2573, 355, 137, 128, 87, 133, 117, 37, 27, }, /* left_mode 0 */ + { 1036, 1929, 278, 135, 27, 37, 48, 55, 41, 91, }, /* left_mode 1 */ + { 223, 256, 253, 15, 13, 9, 28, 64, 3, 3, }, /* left_mode 2 */ + { 120, 129, 17, 316, 15, 11, 9, 4, 53, 74, }, /* left_mode 3 */ + { 129, 58, 6, 11, 38, 2, 0, 5, 2, 67, }, /* left_mode 4 */ + { 53, 22, 11, 16, 8, 26, 14, 3, 19, 12, }, /* left_mode 5 */ + { 59, 26, 61, 11, 4, 9, 35, 13, 8, 8, }, /* left_mode 6 */ + { 101, 52, 40, 8, 5, 2, 8, 59, 2, 20, }, /* left_mode 7 */ + { 48, 34, 10, 52, 8, 15, 6, 6, 63, 20, }, /* left_mode 8 */ + { 96, 48, 22, 63, 11, 14, 5, 8, 9, 96, }, /* left_mode 9 */ + }, + { + /*Above Mode : 2*/ + { 709, 461, 506, 36, 27, 33, 151, 98, 24, 6, }, /* left_mode 0 */ + { 201, 375, 442, 27, 13, 8, 46, 58, 6, 19, }, /* left_mode 1 */ + { 122, 140, 417, 4, 13, 3, 33, 59, 4, 2, }, /* left_mode 2 */ + { 36, 17, 22, 16, 6, 8, 12, 17, 9, 21, }, /* left_mode 3 */ + { 51, 15, 7, 1, 14, 0, 4, 5, 3, 22, }, /* left_mode 4 */ + { 18, 11, 30, 9, 7, 20, 11, 5, 2, 6, }, /* left_mode 5 */ + { 38, 21, 103, 9, 4, 12, 79, 13, 2, 5, }, /* left_mode 6 */ + { 64, 17, 66, 2, 12, 4, 2, 65, 4, 5, }, /* left_mode 7 */ + { 14, 7, 7, 16, 3, 11, 4, 13, 15, 16, }, /* left_mode 8 */ + { 36, 8, 32, 9, 9, 4, 14, 7, 6, 24, }, /* left_mode 9 */ + }, + { + /*Above Mode : 3*/ + { 1340, 173, 36, 119, 30, 10, 13, 10, 20, 26, }, /* left_mode 0 */ + { 156, 293, 26, 108, 5, 16, 2, 4, 23, 30, }, /* left_mode 1 */ + { 60, 34, 13, 7, 3, 3, 0, 8, 4, 5, }, /* left_mode 2 */ + { 72, 64, 1, 235, 3, 9, 2, 7, 28, 38, }, /* left_mode 3 */ + { 29, 14, 1, 3, 5, 0, 2, 2, 5, 13, }, /* left_mode 4 */ + { 22, 7, 4, 11, 2, 5, 1, 2, 6, 4, }, /* left_mode 5 */ + { 18, 14, 5, 6, 4, 3, 14, 0, 9, 2, }, /* left_mode 6 */ + { 41, 10, 7, 1, 2, 0, 0, 10, 2, 1, }, /* left_mode 7 */ + { 23, 19, 2, 33, 1, 5, 2, 0, 51, 8, }, /* left_mode 8 */ + { 33, 26, 7, 53, 3, 9, 3, 3, 9, 19, }, /* left_mode 9 */ + }, + { + /*Above Mode : 4*/ + { 410, 165, 43, 31, 66, 15, 30, 54, 8, 17, }, /* left_mode 0 */ + { 115, 64, 27, 18, 30, 7, 11, 15, 4, 19, }, /* left_mode 1 */ + { 31, 23, 25, 1, 7, 2, 2, 10, 0, 5, }, /* left_mode 2 */ + { 17, 4, 1, 6, 8, 2, 7, 5, 5, 21, }, /* left_mode 3 */ + { 120, 12, 1, 2, 83, 3, 0, 4, 1, 40, }, /* left_mode 4 */ + { 4, 3, 1, 2, 1, 2, 5, 0, 3, 6, }, /* left_mode 5 */ + { 10, 2, 13, 6, 6, 6, 8, 2, 4, 5, }, /* left_mode 6 */ + { 58, 10, 5, 1, 28, 1, 1, 33, 1, 9, }, /* left_mode 7 */ + { 8, 2, 1, 4, 2, 5, 1, 1, 2, 10, }, /* left_mode 8 */ + { 76, 7, 5, 7, 18, 2, 2, 0, 5, 45, }, /* left_mode 9 */ + }, + { + /*Above Mode : 5*/ + { 444, 46, 47, 20, 14, 110, 60, 14, 60, 7, }, /* left_mode 0 */ + { 
59, 57, 25, 18, 3, 17, 21, 6, 14, 6, }, /* left_mode 1 */ + { 24, 17, 20, 6, 4, 13, 7, 2, 3, 2, }, /* left_mode 2 */ + { 13, 11, 5, 14, 4, 9, 2, 4, 15, 7, }, /* left_mode 3 */ + { 8, 5, 2, 1, 4, 0, 1, 1, 2, 12, }, /* left_mode 4 */ + { 19, 5, 5, 7, 4, 40, 6, 3, 10, 4, }, /* left_mode 5 */ + { 16, 5, 9, 1, 1, 16, 26, 2, 10, 4, }, /* left_mode 6 */ + { 11, 4, 8, 1, 1, 4, 4, 5, 4, 1, }, /* left_mode 7 */ + { 15, 1, 3, 7, 3, 21, 7, 1, 34, 5, }, /* left_mode 8 */ + { 18, 5, 1, 3, 4, 3, 7, 1, 2, 9, }, /* left_mode 9 */ + }, + { + /*Above Mode : 6*/ + { 476, 149, 94, 13, 14, 77, 291, 27, 23, 3, }, /* left_mode 0 */ + { 79, 83, 42, 14, 2, 12, 63, 2, 4, 14, }, /* left_mode 1 */ + { 43, 36, 55, 1, 3, 8, 42, 11, 5, 1, }, /* left_mode 2 */ + { 9, 9, 6, 16, 1, 5, 6, 3, 11, 10, }, /* left_mode 3 */ + { 10, 3, 1, 3, 10, 1, 0, 1, 1, 4, }, /* left_mode 4 */ + { 14, 6, 15, 5, 1, 20, 25, 2, 5, 0, }, /* left_mode 5 */ + { 28, 7, 51, 1, 0, 8, 127, 6, 2, 5, }, /* left_mode 6 */ + { 13, 3, 3, 2, 3, 1, 2, 8, 1, 2, }, /* left_mode 7 */ + { 10, 3, 3, 3, 3, 8, 2, 2, 9, 3, }, /* left_mode 8 */ + { 13, 7, 11, 4, 0, 4, 6, 2, 5, 8, }, /* left_mode 9 */ + }, + { + /*Above Mode : 7*/ + { 376, 135, 119, 6, 32, 8, 31, 224, 9, 3, }, /* left_mode 0 */ + { 93, 60, 54, 6, 13, 7, 8, 92, 2, 12, }, /* left_mode 1 */ + { 74, 36, 84, 0, 3, 2, 9, 67, 2, 1, }, /* left_mode 2 */ + { 19, 4, 4, 8, 8, 2, 4, 7, 6, 16, }, /* left_mode 3 */ + { 51, 7, 4, 1, 77, 3, 0, 14, 1, 15, }, /* left_mode 4 */ + { 7, 7, 5, 7, 4, 7, 4, 5, 0, 3, }, /* left_mode 5 */ + { 18, 2, 19, 2, 2, 4, 12, 11, 1, 2, }, /* left_mode 6 */ + { 129, 6, 27, 1, 21, 3, 0, 189, 0, 6, }, /* left_mode 7 */ + { 9, 1, 2, 8, 3, 7, 0, 5, 3, 3, }, /* left_mode 8 */ + { 20, 4, 5, 10, 4, 2, 7, 17, 3, 16, }, /* left_mode 9 */ + }, + { + /*Above Mode : 8*/ + { 617, 68, 34, 79, 11, 27, 25, 14, 75, 13, }, /* left_mode 0 */ + { 51, 82, 21, 26, 6, 12, 13, 1, 26, 16, }, /* left_mode 1 */ + { 29, 9, 12, 11, 3, 7, 1, 10, 2, 2, }, /* left_mode 2 */ + { 17, 19, 11, 74, 4, 3, 2, 0, 58, 13, }, /* left_mode 3 */ + { 10, 1, 1, 3, 4, 1, 0, 2, 1, 8, }, /* left_mode 4 */ + { 14, 4, 5, 5, 1, 13, 2, 0, 27, 8, }, /* left_mode 5 */ + { 10, 3, 5, 4, 1, 7, 6, 4, 5, 1, }, /* left_mode 6 */ + { 10, 2, 6, 2, 1, 1, 1, 4, 2, 1, }, /* left_mode 7 */ + { 14, 8, 5, 23, 2, 12, 6, 2, 117, 5, }, /* left_mode 8 */ + { 9, 6, 2, 19, 1, 6, 3, 2, 9, 9, }, /* left_mode 9 */ + }, + { + /*Above Mode : 9*/ + { 680, 73, 22, 38, 42, 5, 11, 9, 6, 28, }, /* left_mode 0 */ + { 113, 112, 21, 22, 10, 2, 8, 4, 6, 42, }, /* left_mode 1 */ + { 44, 20, 24, 6, 5, 4, 3, 3, 1, 2, }, /* left_mode 2 */ + { 40, 23, 7, 71, 5, 2, 4, 1, 7, 22, }, /* left_mode 3 */ + { 85, 9, 4, 4, 17, 2, 0, 3, 2, 23, }, /* left_mode 4 */ + { 13, 4, 2, 6, 1, 7, 0, 1, 7, 6, }, /* left_mode 5 */ + { 26, 6, 8, 3, 2, 3, 8, 1, 5, 4, }, /* left_mode 6 */ + { 54, 8, 9, 6, 7, 0, 1, 11, 1, 3, }, /* left_mode 7 */ + { 9, 10, 4, 13, 2, 5, 4, 2, 14, 8, }, /* left_mode 8 */ + { 92, 9, 5, 19, 15, 3, 3, 1, 6, 58, }, /* left_mode 9 */ + }, +}; diff --git a/vp9/common/vp9_mv.h b/vp9/common/vp9_mv.h new file mode 100644 index 000000000..bbe6d2c8b --- /dev/null +++ b/vp9/common/vp9_mv.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MV_H
+#define __INC_MV_H
+#include "vpx/vpx_integer.h"
+
+typedef struct {
+  short row;
+  short col;
+} MV;
+
+typedef union int_mv {
+  uint32_t as_int;
+  MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
+
+#endif
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
new file mode 100644
index 000000000..33dc3fa5b
--- /dev/null
+++ b/vp9/common/vp9_mvref_common.c
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9_mvref_common.h"
+
+#define MVREF_NEIGHBOURS 8
+static int mb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
+  {0, -1}, {-1, 0}, {-1, -1}, {0, -2},
+  {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}
+};
+static int mb_ref_distance_weight[MVREF_NEIGHBOURS] =
+  { 3, 3, 2, 1, 1, 1, 1, 1 };
+#if CONFIG_SUPERBLOCKS
+static int sb_mv_ref_search[MVREF_NEIGHBOURS][2] = {
+  {0, -1}, {-1, 0}, {1, -1}, {-1, 1},
+  {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}
+};
+static int sb_ref_distance_weight[MVREF_NEIGHBOURS] =
+  { 3, 3, 2, 2, 2, 1, 1, 1 };
+#endif
+// clamp_mv
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
+
+  if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
+    mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
+  else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
+    mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;
+
+  if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))
+    mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;
+  else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)
+    mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
+}
+
+
+// Gets a best matching candidate reference motion vector
+// from the given mode info structure (if available)
+static int get_candidate_mvref(
+  const MODE_INFO *candidate_mi,
+  MV_REFERENCE_FRAME ref_frame,
+  MV_REFERENCE_FRAME *c_ref_frame,
+  int_mv *c_mv,
+  MV_REFERENCE_FRAME *c2_ref_frame,
+  int_mv *c2_mv
+) {
+
+  int ret_val = FALSE;
+  c2_mv->as_int = 0;
+  *c2_ref_frame = INTRA_FRAME;
+
+  // Target ref frame matches candidate first ref frame
+  if (ref_frame == candidate_mi->mbmi.ref_frame) {
+    c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
+    *c_ref_frame = ref_frame;
+    ret_val = TRUE;
+
+    // Is there a second non zero vector we can use.
+    if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) &&
+        (candidate_mi->mbmi.mv[1].as_int != 0) &&
+        (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) {
+      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+      *c2_ref_frame = candidate_mi->mbmi.second_ref_frame;
+    }
+
+    // Target ref frame matches candidate second ref frame
+  } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
+    c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
+    *c_ref_frame = ref_frame;
+    ret_val = TRUE;
+
+    // Is there a second non zero vector we can use.
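+    // (A second vector is only worth returning when the candidate block
+    //  carries two reference frames and the other vector is non-zero and
+    //  distinct from the first choice -- the three conditions tested in
+    //  each branch below.)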
+ if ((candidate_mi->mbmi.ref_frame > INTRA_FRAME) && + (candidate_mi->mbmi.mv[0].as_int != 0) && + (candidate_mi->mbmi.mv[0].as_int != c_mv->as_int)) { + c2_mv->as_int = candidate_mi->mbmi.mv[0].as_int; + *c2_ref_frame = candidate_mi->mbmi.ref_frame; + } + + // No ref frame matches so use first ref mv as first choice + } else if (candidate_mi->mbmi.ref_frame > INTRA_FRAME) { + c_mv->as_int = candidate_mi->mbmi.mv[0].as_int; + *c_ref_frame = candidate_mi->mbmi.ref_frame; + ret_val = TRUE; + + // Is there a second non zero vector we can use. + if ((candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) && + (candidate_mi->mbmi.mv[1].as_int != 0) && + (candidate_mi->mbmi.mv[1].as_int != c_mv->as_int)) { + c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int; + *c2_ref_frame = candidate_mi->mbmi.second_ref_frame; + } + + // If only the second ref mv is valid:- (Should not trigger in current code + // base given current possible compound prediction options). + } else if (candidate_mi->mbmi.second_ref_frame > INTRA_FRAME) { + c_mv->as_int = candidate_mi->mbmi.mv[1].as_int; + *c_ref_frame = candidate_mi->mbmi.second_ref_frame; + ret_val = TRUE; + } + + return ret_val; +} + +// Performs mv adjustment based on reference frame and clamps the MV +// if it goes off the edge of the buffer. +static void scale_mv( + MACROBLOCKD *xd, + MV_REFERENCE_FRAME this_ref_frame, + MV_REFERENCE_FRAME candidate_ref_frame, + int_mv *candidate_mv, + int *ref_sign_bias +) { + + if (candidate_ref_frame != this_ref_frame) { + + //int frame_distances[MAX_REF_FRAMES]; + //int last_distance = 1; + //int gf_distance = xd->frames_since_golden; + //int arf_distance = xd->frames_till_alt_ref_frame; + + // Sign inversion where appropriate. + if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) { + candidate_mv->as_mv.row = -candidate_mv->as_mv.row; + candidate_mv->as_mv.col = -candidate_mv->as_mv.col; + } + + // Scale based on frame distance if the reference frames not the same. + /*frame_distances[INTRA_FRAME] = 1; // should never be used + frame_distances[LAST_FRAME] = 1; + frame_distances[GOLDEN_FRAME] = + (xd->frames_since_golden) ? xd->frames_since_golden : 1; + frame_distances[ALTREF_FRAME] = + (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1; + + if (frame_distances[this_ref_frame] && + frame_distances[candidate_ref_frame]) { + candidate_mv->as_mv.row = + (short)(((int)(candidate_mv->as_mv.row) * + frame_distances[this_ref_frame]) / + frame_distances[candidate_ref_frame]); + + candidate_mv->as_mv.col = + (short)(((int)(candidate_mv->as_mv.col) * + frame_distances[this_ref_frame]) / + frame_distances[candidate_ref_frame]); + } + */ + } + + // Clamp the MV so it does not point out of the frame buffer + clamp_mv(xd, candidate_mv); +} + +// Adds a new candidate reference vector to the list if indeed it is new. +// If it is not new then the score of the existing candidate that it matches +// is increased and the list is resorted. +static void addmv_and_shuffle( + int_mv *mv_list, + int *mv_scores, + int *index, + int_mv candidate_mv, + int weight +) { + + int i = *index; + int duplicate_found = FALSE; + + // Check for duplicates. If there is one increment its score. + // Duplicate defined as being the same full pel vector with rounding. 
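+  // Worked illustration (hypothetical values): with mv_list = {A, B} and
+  // mv_scores = {5, 3}, adding candidate B again with weight 4 bumps B's
+  // score to 7, and the reshuffle pass further down then swaps B ahead of
+  // A, leaving mv_list = {B, A} and mv_scores = {7, 5}.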
+ while (i > 0) { + i--; + + if (candidate_mv.as_int == mv_list[i].as_int) { + duplicate_found = TRUE; + mv_scores[i] += weight; + break; + } + } + + // If no duplicate was found add the new vector and give it a weight + if (!duplicate_found) { + mv_list[*index].as_int = candidate_mv.as_int; + mv_scores[*index] = weight; + i = *index; + (*index)++; + } + + // Reshuffle the list so that highest scoring mvs at the top. + while (i > 0) { + if (mv_scores[i] > mv_scores[i-1]) { + int tmp_score = mv_scores[i-1]; + int_mv tmp_mv = mv_list[i-1]; + + mv_scores[i-1] = mv_scores[i]; + mv_list[i-1] = mv_list[i]; + mv_scores[i] = tmp_score; + mv_list[i] = tmp_mv; + i--; + } else + break; + } +} + +// This function searches the neighbourhood of a given MB/SB and populates a +// list of candidate reference vectors. +// +void vp9_find_mv_refs( + MACROBLOCKD *xd, + MODE_INFO *here, + MODE_INFO *lf_here, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int *ref_sign_bias +) { + + int i; + MODE_INFO *candidate_mi; + MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi; + int_mv candidate_mvs[MAX_MV_REFS]; + int_mv c_refmv; + MV_REFERENCE_FRAME c_ref_frame; + int_mv c2_refmv; + MV_REFERENCE_FRAME c2_ref_frame; + int candidate_scores[MAX_MV_REFS]; + int index = 0; + int split_count = 0; + int ref_weight = 0; + int valid_mv_ref; + int (*mv_ref_search)[2]; + int *ref_distance_weight; + + // Blank the reference vector lists and other local structures. + vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS); + vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS); + vpx_memset(candidate_scores, 0, sizeof(candidate_scores)); + +#if CONFIG_SUPERBLOCKS + if (mbmi->encoded_as_sb) { + mv_ref_search = sb_mv_ref_search; + ref_distance_weight = sb_ref_distance_weight; + } else { + mv_ref_search = mb_mv_ref_search; + ref_distance_weight = mb_ref_distance_weight; + } +#else + mv_ref_search = mb_mv_ref_search; + ref_distance_weight = mb_ref_distance_weight; +#endif + // Populate a list with candidate reference vectors from the + // spatial neighbours. + for (i = 0; i < 2; ++i) { + if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) && + ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) { + + candidate_mi = here + mv_ref_search[i][0] + + (mv_ref_search[i][1] * xd->mode_info_stride); + + valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame, + &c_ref_frame, &c_refmv, + &c2_ref_frame, &c2_refmv); + + // If there is a valid MV candidate then add it to the list + if (valid_mv_ref) { + scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias ); + ref_weight = ref_distance_weight[i] + + ((c_ref_frame == ref_frame) << 4); + split_count += (candidate_mi->mbmi.mode == SPLITMV); + + addmv_and_shuffle(candidate_mvs, candidate_scores, + &index, c_refmv, ref_weight); + + // If there is a second valid mv then add it as well. 
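+        // (Scoring sketch: the '<< 4' term adds 16 whenever a candidate's
+        //  reference frame matches the target frame, so a matching adjacent
+        //  neighbour scores 3 + 16 = 19 and outweighs any single candidate
+        //  that needed sign inversion or scaling; the same weighting is
+        //  applied to the second vector below.)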
+      if (c2_ref_frame != INTRA_FRAME) {
+        scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+        ref_weight = ref_distance_weight[i] +
+                     ((c2_ref_frame == ref_frame) << 4);
+
+        addmv_and_shuffle(candidate_mvs, candidate_scores,
+                          &index, c2_refmv, ref_weight);
+      }
+    }
+  }
+
+  // Look at the corresponding vector in the last frame
+  candidate_mi = lf_here;
+  valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+                                     &c_ref_frame, &c_refmv,
+                                     &c2_ref_frame, &c2_refmv);
+
+  // If there is a valid MV candidate then add it to the list
+  if (valid_mv_ref) {
+    scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+    ref_weight = 2 + ((c_ref_frame == ref_frame) << 4);
+    addmv_and_shuffle(candidate_mvs, candidate_scores,
+                      &index, c_refmv, ref_weight);
+
+    // If there is a second valid mv then add it as well.
+    if (c2_ref_frame != INTRA_FRAME) {
+      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+      ref_weight = ref_distance_weight[i] +
+                   ((c2_ref_frame == ref_frame) << 4);
+
+      addmv_and_shuffle(candidate_mvs, candidate_scores,
+                        &index, c2_refmv, ref_weight);
+    }
+  }
+
+  // Populate a list with candidate reference vectors from the
+  // spatial neighbours.
+  for (i = 2; (i < MVREF_NEIGHBOURS) && (index < (MAX_MV_REFS - 2)); ++i) {
+    if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+        ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+
+      candidate_mi = here + mv_ref_search[i][0] +
+                     (mv_ref_search[i][1] * xd->mode_info_stride);
+
+      valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+                                         &c_ref_frame, &c_refmv,
+                                         &c2_ref_frame, &c2_refmv);
+
+      // If there is a valid MV candidate then add it to the list
+      if (valid_mv_ref) {
+        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
+        ref_weight = ref_distance_weight[i] +
+                     ((c_ref_frame == ref_frame) << 4);
+
+        addmv_and_shuffle(candidate_mvs, candidate_scores,
+                          &index, c_refmv, ref_weight);
+
+        // If there is a second valid mv then add it as well.
+        if (c2_ref_frame != INTRA_FRAME) {
+          scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
+          ref_weight = ref_distance_weight[i] +
+                       ((c2_ref_frame == ref_frame) << 4);
+
+          addmv_and_shuffle(candidate_mvs, candidate_scores,
+                            &index, c2_refmv, ref_weight);
+        }
+      }
+    }
+  }
+
+  // Make sure we are able to add 0,0
+  if (index > (MAX_MV_REFS - 1)) {
+    index = (MAX_MV_REFS - 1);
+  }
+
+  // Define inter mode coding context.
+  // 0,0 was best
+  if (candidate_mvs[0].as_int == 0) {
+    // 0,0 is only candidate
+    if (index <= 1) {
+      mbmi->mb_mode_context[ref_frame] = 0;
+      // non zero candidates available
+    } else if (split_count == 0) {
+      mbmi->mb_mode_context[ref_frame] = 1;
+    } else {
+      mbmi->mb_mode_context[ref_frame] = 2;
+    }
+    // Non zero best, No Split MV cases
+  } else if (split_count == 0) {
+    if (candidate_scores[0] >= 32) {
+      mbmi->mb_mode_context[ref_frame] = 3;
+    } else {
+      mbmi->mb_mode_context[ref_frame] = 4;
+    }
+    // Non zero best, some split mv
+  } else {
+    if (candidate_scores[0] >= 32) {
+      mbmi->mb_mode_context[ref_frame] = 5;
+    } else {
+      mbmi->mb_mode_context[ref_frame] = 6;
+    }
+  }
+
+  // 0,0 is always a valid reference.
+  for (i = 0; i < index; ++i) {
+    if (candidate_mvs[i].as_int == 0)
+      break;
+  }
+  if (i == index) {
+    c_refmv.as_int = 0;
+    addmv_and_shuffle(candidate_mvs, candidate_scores,
+                      &index, c_refmv, candidate_scores[3] + 1);
+  }
+
+  // Copy over the candidate list.
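+  // (Note: sizeof(candidate_mvs) covers all MAX_MV_REFS entries, so the
+  //  copy below also carries across the zero-filled slots beyond 'index'
+  //  that were initialised by the vpx_memset above.)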
+  vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
+}
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
new file mode 100644
index 000000000..9a1877919
--- /dev/null
+++ b/vp9/common/vp9_mvref_common.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9_onyxc_int.h"
+#include "vp9_blockd.h"
+
+
+#ifndef __INC_MVREF_COMMON_H
+#define __INC_MVREF_COMMON_H
+
+void vp9_find_mv_refs(
+  MACROBLOCKD *xd,
+  MODE_INFO *here,
+  MODE_INFO *lf_here,
+  MV_REFERENCE_FRAME ref_frame,
+  int_mv *mv_ref_list,
+  int *ref_sign_bias
+);
+
+#endif
+
diff --git a/vp9/common/vp9_onyx.h b/vp9/common/vp9_onyx.h
new file mode 100644
index 000000000..92ec667e7
--- /dev/null
+++ b/vp9/common/vp9_onyx.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ONYX_H
+#define __INC_ONYX_H
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include "vpx/internal/vpx_codec_internal.h"
+#include "vpx/vp8cx.h"
+#include "vpx_scale/yv12config.h"
+#include "vp9_type_aliases.h"
+#include "vp9_ppflags.h"
+  typedef int *VP9_PTR;
+
+  /* Create/destroy static data structures. */
+
+  typedef enum {
+    NORMAL = 0,
+    FOURFIVE = 1,
+    THREEFIVE = 2,
+    ONETWO = 3
+
+  } VPX_SCALING;
+
+  typedef enum {
+    VP9_LAST_FLAG = 1,
+    VP9_GOLD_FLAG = 2,
+    VP9_ALT_FLAG = 4
+  } VP9_REFFRAME;
+
+
+  typedef enum {
+    USAGE_STREAM_FROM_SERVER = 0x0,
+    USAGE_LOCAL_FILE_PLAYBACK = 0x1,
+    USAGE_CONSTRAINED_QUALITY = 0x2
+  } END_USAGE;
+
+
+  typedef enum {
+    MODE_GOODQUALITY = 0x1,
+    MODE_BESTQUALITY = 0x2,
+    MODE_FIRSTPASS = 0x3,
+    MODE_SECONDPASS = 0x4,
+    MODE_SECONDPASS_BEST = 0x5,
+  } MODE;
+
+  typedef enum {
+    FRAMEFLAGS_KEY = 1,
+    FRAMEFLAGS_GOLDEN = 2,
+    FRAMEFLAGS_ALTREF = 4,
+  } FRAMETYPE_FLAGS;
+
+
+#include <assert.h>
+  static __inline void Scale2Ratio(int mode, int *hr, int *hs) {
+    switch (mode) {
+      case NORMAL:
+        *hr = 1;
+        *hs = 1;
+        break;
+      case FOURFIVE:
+        *hr = 4;
+        *hs = 5;
+        break;
+      case THREEFIVE:
+        *hr = 3;
+        *hs = 5;
+        break;
+      case ONETWO:
+        *hr = 1;
+        *hs = 2;
+        break;
+      default:
+        *hr = 1;
+        *hs = 1;
+        assert(0);
+        break;
+    }
+  }
+
+  typedef struct {
+    int Version;  // 4 versions of bitstream defined: 0 best quality/slowest decode, 3 lowest quality/fastest decode
+    int Width;    // width of data passed to the compressor
+    int Height;   // height of data passed to the compressor
+    double frame_rate;     // set to passed in framerate
+    int target_bandwidth;  // bandwidth to be used in kilobits per second
+
+    int noise_sensitivity;  // parameter used for applying pre-processing blur: recommendation 0
+    int Sharpness;          // parameter used for sharpening output: recommendation 0
+    int cpu_used;
+    unsigned int rc_max_intra_bitrate_pct;
+
+    // mode ->
+    // (0)=Realtime/Live Encoding.
+    //      This mode is optimized for realtime encoding (for example, capturing
+    //      a television signal or feed from a live camera). ( speed setting controls how fast )
+    // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to
+    //      encode the output. ( speed setting controls how fast )
+    // (2)=One Pass - Best Quality. The encoder places priority on the quality of the output over encoding
+    //      speed. The output is compressed at the highest possible quality. This option takes the longest
+    //      amount of time to encode. ( speed setting ignored )
+    // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding
+    //      pass. ( speed setting controls how fast )
+    // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding
+    //      pass to create the compressed output. ( speed setting controls how fast )
+    // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first
+    //      encoding pass to create the compressed output using the highest possible quality, and taking a
+    //      longer amount of time to encode. ( speed setting ignored )
+    int Mode;
+
+    // Key Framing Operations
+    int auto_key;  // automatically detect cut scenes and set the keyframes
+    int key_freq;  // maximum distance to key frame.
+
+    int allow_lag;      // allow lagged compression (if 0, lag_in_frames is ignored)
+    int lag_in_frames;  // how many frames lag before we start encoding
+
+    // ----------------------------------------------------------------
+    // DATARATE CONTROL OPTIONS
+
+    int end_usage;  // vbr or cbr
+
+    // buffer targeting aggressiveness
+    int under_shoot_pct;
+    int over_shoot_pct;
+
+    // buffering parameters
+    int starting_buffer_level;  // in seconds
+    int optimal_buffer_level;
+    int maximum_buffer_size;
+
+    // controlling quality
+    int fixed_q;
+    int worst_allowed_q;
+    int best_allowed_q;
+    int cq_level;
+    int lossless;
+
+    // two pass datarate control
+    int two_pass_vbrbias;  // two pass datarate control tweaks
+    int two_pass_vbrmin_section;
+    int two_pass_vbrmax_section;
+    // END DATARATE CONTROL OPTIONS
+    // ----------------------------------------------------------------
+
+
+    // these parameters aren't to be used in the final build; don't use!
+    int play_alternate;
+    int alt_freq;
+
+    int encode_breakout;  // early breakout encode threshold: for video conferencing recommend 800
+
+    int arnr_max_frames;
+    int arnr_strength;
+    int arnr_type;
+
+    struct vpx_fixed_buf two_pass_stats_in;
+    struct vpx_codec_pkt_list *output_pkt_list;
+
+    vp8e_tuning tuning;
+  } VP9_CONFIG;
+
+
+  void vp9_initialize_enc();
+
+  VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf);
+  void vp9_remove_compressor(VP9_PTR *comp);
+
+  void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made and not just a copy of the pointer.
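+// A minimal calling sequence for the encoder interface declared below might
+// look like the following (sketch only: assumes a populated VP9_CONFIG
+// 'oxcf' and an input frame in a YV12_BUFFER_CONFIG 'raw'; error handling
+// omitted):
+//
+//   VP9_PTR enc = vp9_create_compressor(&oxcf);
+//   vp9_receive_raw_frame(enc, 0, &raw, ts_start, ts_end);
+//   vp9_get_compressed_data(enc, &flags, &size, buf, &ts0, &ts1, 0);
+//   vp9_remove_compressor(&enc);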
+ int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time_stamp); + + int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags, + unsigned long *size, unsigned char *dest, + int64_t *time_stamp, int64_t *time_end, + int flush); + + int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *flags); + + int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags); + + int vp9_update_reference(VP9_PTR comp, int ref_frame_flags); + + int vp9_get_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + + int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + + int vp9_update_entropy(VP9_PTR comp, int update); + + int vp9_set_roimap(VP9_PTR comp, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[4], int delta_lf[4], + unsigned int threshold[4]); + + int vp9_set_active_map(VP9_PTR comp, unsigned char *map, + unsigned int rows, unsigned int cols); + + int vp9_set_internal_size(VP9_PTR comp, + VPX_SCALING horiz_mode, VPX_SCALING vert_mode); + + int vp9_get_quantizer(VP9_PTR c); + +#ifdef __cplusplus +} +#endif + +#endif // __INC_ONYX_H diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h new file mode 100644 index 000000000..18750e767 --- /dev/null +++ b/vp9/common/vp9_onyxc_int.h @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_ONYXC_INT_H +#define __INC_ONYXC_INT_H + +#include "vpx_config.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vp9_rtcd.h" +#include "vp9_loopfilter.h" +#include "vp9_entropymv.h" +#include "vp9_entropy.h" +#include "vp9_entropymode.h" +#if CONFIG_POSTPROC +#include "vp9_postproc.h" +#endif + +/*#ifdef PACKET_TESTING*/ +#include "vp9_header.h" +/*#endif*/ + +/* Create/destroy static data structures. 
*/ + +void vp9_initialize_common(void); + +#define MINQ 0 + +#define MAXQ 255 +#define QINDEX_BITS 8 + +#define QINDEX_RANGE (MAXQ + 1) + +#define NUM_YV12_BUFFERS 4 + +#define COMP_PRED_CONTEXTS 2 + +typedef struct frame_contexts { + vp9_prob bmode_prob[VP9_NKF_BINTRAMODES - 1]; + vp9_prob ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */ +#if CONFIG_SUPERBLOCKS + vp9_prob sb_ymode_prob[VP9_I32X32_MODES - 1]; +#endif + vp9_prob uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1]; + vp9_prob i8x8_mode_prob[VP9_I8X8_MODES - 1]; + vp9_prob sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1]; + vp9_prob mbsplit_prob[VP9_NUMMBSPLITS - 1]; + vp9_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + vp9_prob hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + vp9_prob coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + vp9_prob hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + vp9_prob coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + vp9_prob hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + + nmv_context nmvc; + nmv_context pre_nmvc; + vp9_prob pre_bmode_prob[VP9_NKF_BINTRAMODES - 1]; + vp9_prob pre_ymode_prob[VP9_YMODES - 1]; /* interframe intra mode probs */ +#if CONFIG_SUPERBLOCKS + vp9_prob pre_sb_ymode_prob[VP9_I32X32_MODES - 1]; +#endif + vp9_prob pre_uv_mode_prob[VP9_YMODES][VP9_UV_MODES - 1]; + vp9_prob pre_i8x8_mode_prob[VP9_I8X8_MODES - 1]; + vp9_prob pre_sub_mv_ref_prob[SUBMVREF_COUNT][VP9_SUBMVREFS - 1]; + vp9_prob pre_mbsplit_prob[VP9_NUMMBSPLITS - 1]; + unsigned int bmode_counts[VP9_NKF_BINTRAMODES]; + unsigned int ymode_counts[VP9_YMODES]; /* interframe intra mode probs */ +#if CONFIG_SUPERBLOCKS + unsigned int sb_ymode_counts[VP9_I32X32_MODES]; +#endif + unsigned int uv_mode_counts[VP9_YMODES][VP9_UV_MODES]; + unsigned int i8x8_mode_counts[VP9_I8X8_MODES]; /* interframe intra probs */ + unsigned int sub_mv_ref_counts[SUBMVREF_COUNT][VP9_SUBMVREFS]; + unsigned int mbsplit_counts[VP9_NUMMBSPLITS]; + + vp9_prob pre_coef_probs [BLOCK_TYPES] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + vp9_prob pre_hybrid_coef_probs [BLOCK_TYPES] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + + vp9_prob pre_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + vp9_prob pre_hybrid_coef_probs_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + + vp9_prob pre_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + vp9_prob pre_hybrid_coef_probs_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [ENTROPY_NODES]; + + unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; + unsigned int hybrid_coef_counts [BLOCK_TYPES] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; + + unsigned int coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; + unsigned int hybrid_coef_counts_8x8 [BLOCK_TYPES_8X8] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; + + unsigned int coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; + unsigned int hybrid_coef_counts_16x16 [BLOCK_TYPES_16X16] [COEF_BANDS] + [PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; + + nmv_context_counts NMVcount; + vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] + 
[VP9_SWITCHABLE_FILTERS - 1]; +#if CONFIG_COMP_INTERINTRA_PRED + unsigned int interintra_counts[2]; + vp9_prob interintra_prob; + vp9_prob pre_interintra_prob; +#endif + + int mode_context[INTER_MODE_CONTEXTS][4]; + int mode_context_a[INTER_MODE_CONTEXTS][4]; + int vp9_mode_contexts[INTER_MODE_CONTEXTS][4]; + int mv_ref_ct[INTER_MODE_CONTEXTS][4][2]; +} FRAME_CONTEXT; + +typedef enum { + RECON_CLAMP_REQUIRED = 0, + RECON_CLAMP_NOTREQUIRED = 1 +} CLAMP_TYPE; + +typedef enum { + SINGLE_PREDICTION_ONLY = 0, + COMP_PREDICTION_ONLY = 1, + HYBRID_PREDICTION = 2, + NB_PREDICTION_TYPES = 3, +} COMPPREDMODE_TYPE; + +typedef enum { + ONLY_4X4 = 0, + ALLOW_8X8 = 1, + ALLOW_16X16 = 2, + TX_MODE_SELECT = 3, + NB_TXFM_MODES = 4, +} TXFM_MODE; + +typedef struct VP9_COMMON_RTCD { +#if CONFIG_RUNTIME_CPU_DETECT +#if CONFIG_POSTPROC + vp9_postproc_rtcd_vtable_t postproc; +#endif + int flags; +#else + int unused; +#endif +} VP9_COMMON_RTCD; + +typedef struct VP9Common { + struct vpx_internal_error_info error; + + DECLARE_ALIGNED(16, short, Y1dequant[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, Y2dequant[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, short, UVdequant[QINDEX_RANGE][16]); + + int Width; + int Height; + int horiz_scale; + int vert_scale; + + YUV_TYPE clr_type; + CLAMP_TYPE clamp_type; + + YV12_BUFFER_CONFIG *frame_to_show; + + YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; + int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; + int new_fb_idx, lst_fb_idx, gld_fb_idx, alt_fb_idx; + + YV12_BUFFER_CONFIG post_proc_buffer; + YV12_BUFFER_CONFIG temp_scale_frame; + + + FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ + FRAME_TYPE frame_type; + + int show_frame; + + int frame_flags; + int MBs; + int mb_rows; + int mb_cols; + int mode_info_stride; + + /* profile settings */ + int experimental; + int mb_no_coeff_skip; + TXFM_MODE txfm_mode; + COMPPREDMODE_TYPE comp_pred_mode; + int no_lpf; + int use_bilinear_mc_filter; + int full_pixel; + + int base_qindex; + int last_kf_gf_q; /* Q used on the last GF or KF */ + + int y1dc_delta_q; + int y2dc_delta_q; + int y2ac_delta_q; + int uvdc_delta_q; + int uvac_delta_q; + + unsigned int frames_since_golden; + unsigned int frames_till_alt_ref_frame; + + /* We allocate a MODE_INFO struct for each macroblock, together with + an extra row on top and column on the left to simplify prediction. */ + + MODE_INFO *mip; /* Base of allocated array */ + MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ + MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ + MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ + + + // Persistent mb segment id map used in prediction. 
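+  // (Persists across frames so that segment ids can be predicted from the
+  //  previous frame's map rather than coded afresh; see segment_pred_probs
+  //  and temporal_update below.)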
+ unsigned char *last_frame_seg_map; + + INTERPOLATIONFILTERTYPE mcomp_filter_type; + LOOPFILTERTYPE filter_type; + + loop_filter_info_n lf_info; + + int filter_level; + int last_sharpness_level; + int sharpness_level; + + int refresh_last_frame; /* Two state 0 = NO, 1 = YES */ + int refresh_golden_frame; /* Two state 0 = NO, 1 = YES */ + int refresh_alt_ref_frame; /* Two state 0 = NO, 1 = YES */ + + int copy_buffer_to_gf; /* 0 none, 1 Last to GF, 2 ARF to GF */ + int copy_buffer_to_arf; /* 0 none, 1 Last to ARF, 2 GF to ARF */ + + int refresh_entropy_probs; /* Two state 0 = NO, 1 = YES */ + + int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ + + /* Y,U,V,Y2 */ + ENTROPY_CONTEXT_PLANES *above_context; /* row of context for each plane */ + ENTROPY_CONTEXT_PLANES left_context[2]; /* (up to) 4 contexts "" */ + + /* keyframe block modes are predicted by their above, left neighbors */ + + vp9_prob kf_bmode_prob[VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES] + [VP9_KF_BINTRAMODES - 1]; + vp9_prob kf_ymode_prob[8][VP9_YMODES - 1]; /* keyframe "" */ +#if CONFIG_SUPERBLOCKS + vp9_prob sb_kf_ymode_prob[8][VP9_I32X32_MODES - 1]; +#endif + int kf_ymode_probs_index; + int kf_ymode_probs_update; + vp9_prob kf_uv_mode_prob[VP9_YMODES] [VP9_UV_MODES - 1]; + + vp9_prob prob_intra_coded; + vp9_prob prob_last_coded; + vp9_prob prob_gf_coded; +#if CONFIG_SUPERBLOCKS + vp9_prob sb_coded; +#endif + + // Context probabilities when using predictive coding of segment id + vp9_prob segment_pred_probs[PREDICTION_PROBS]; + unsigned char temporal_update; + + // Context probabilities for reference frame prediction + unsigned char ref_scores[MAX_REF_FRAMES]; + vp9_prob ref_pred_probs[PREDICTION_PROBS]; + vp9_prob mod_refprobs[MAX_REF_FRAMES][PREDICTION_PROBS]; + + vp9_prob prob_comppred[COMP_PRED_CONTEXTS]; + + // FIXME contextualize + vp9_prob prob_tx[TX_SIZE_MAX - 1]; + + vp9_prob mbskip_pred_probs[MBSKIP_CONTEXTS]; + + FRAME_CONTEXT lfc_a; /* last alt ref entropy */ + FRAME_CONTEXT lfc; /* last frame entropy */ + FRAME_CONTEXT fc; /* this frame entropy */ + + unsigned int current_video_frame; + int near_boffset[3]; + int version; + +#ifdef PACKET_TESTING + VP9_HEADER oh; +#endif + double bitrate; + double framerate; + +#if CONFIG_RUNTIME_CPU_DETECT + VP9_COMMON_RTCD rtcd; +#endif + +#if CONFIG_POSTPROC + struct postproc_state postproc_state; +#endif + +#if CONFIG_PRED_FILTER + /* Prediction filter variables */ + int pred_filter_mode; // 0=disabled at the frame level (no MB filtered) + // 1=enabled at the frame level (all MB filtered) + // 2=specified per MB (1=filtered, 0=non-filtered) + vp9_prob prob_pred_filter_off; +#endif +#if CONFIG_COMP_INTERINTRA_PRED + int use_interintra; +#endif + +} VP9_COMMON; + +#endif // __INC_ONYX_INT_H diff --git a/vp9/common/vp9_onyxd.h b/vp9/common/vp9_onyxd.h new file mode 100644 index 000000000..2fc51db96 --- /dev/null +++ b/vp9/common/vp9_onyxd.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_ONYXD_H +#define __INC_ONYXD_H + + +/* Create/destroy static data structures. 
*/
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+#include "vp9_type_aliases.h"
+#include "vpx_scale/yv12config.h"
+#include "vp9_ppflags.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_codec.h"
+
+  typedef void *VP9D_PTR;
+  typedef struct {
+    int Width;
+    int Height;
+    int Version;
+    int postprocess;
+    int max_threads;
+    int input_partition;
+  } VP9D_CONFIG;
+  typedef enum {
+    VP9_LAST_FLAG = 1,
+    VP9_GOLD_FLAG = 2,
+    VP9_ALT_FLAG = 4
+  } VP9_REFFRAME;
+
+  void vp9_initialize_dec(void);
+
+  int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
+                                  const unsigned char **dest,
+                                  int64_t time_stamp);
+
+  int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
+                        int64_t *time_stamp, int64_t *time_end_stamp,
+                        vp9_ppflags_t *flags);
+
+  vpx_codec_err_t vp9_get_reference_dec(VP9D_PTR comp,
+                                        VP9_REFFRAME ref_frame_flag,
+                                        YV12_BUFFER_CONFIG *sd);
+
+  vpx_codec_err_t vp9_set_reference_dec(VP9D_PTR comp,
+                                        VP9_REFFRAME ref_frame_flag,
+                                        YV12_BUFFER_CONFIG *sd);
+
+  VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf);
+
+  void vp9_remove_decompressor(VP9D_PTR comp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // __INC_ONYXD_H
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
new file mode 100644
index 000000000..5752fbd82
--- /dev/null
+++ b/vp9/common/vp9_postproc.c
@@ -0,0 +1,1031 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_scale/yv12config.h"
+#include "vp9_postproc.h"
+#include "vp9/common/vp9_textblit.h"
+#include "vpx_scale/vpxscale.h"
+#include "vp9_systemdependent.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define RGB_TO_YUV(t) \
+  ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \
+    (0.098*(float)(t & 0xff)) + 16), \
+  (-(0.148*(float)(t >> 16)) - (0.291*(float)(t >> 8 & 0xff)) + \
+    (0.439*(float)(t & 0xff)) + 128), \
+  ( (0.439*(float)(t >> 16)) - (0.368*(float)(t >> 8 & 0xff)) - \
+    (0.071*(float)(t & 0xff)) + 128)
+
+/* global constants */
+#if CONFIG_POSTPROC_VISUALIZER
+static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = {
+  { RGB_TO_YUV(0x98FB98) },   /* PaleGreen */
+  { RGB_TO_YUV(0x00FF00) },   /* Green */
+  { RGB_TO_YUV(0xADFF2F) },   /* GreenYellow */
+  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
+  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
+  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
+  { RGB_TO_YUV(0x008F8F) },   /* Dark Cyan */
+  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
+  { RGB_TO_YUV(0x8F0000) },   /* Dark Red */
+  { RGB_TO_YUV(0x228B22) },   /* ForestGreen */
+  { RGB_TO_YUV(0x006400) },   /* DarkGreen */
+  { RGB_TO_YUV(0x98F5FF) },   /* Cadet Blue */
+  { RGB_TO_YUV(0x6CA6CD) },   /* Sky Blue */
+  { RGB_TO_YUV(0x00008B) },   /* Dark blue */
+  { RGB_TO_YUV(0x551A8B) },   /* Purple */
+  { RGB_TO_YUV(0xFF0000) },   /* Red */
+  { RGB_TO_YUV(0xCC33FF) },   /* Magenta */
+};
+
+static const unsigned char B_PREDICTION_MODE_colors[B_MODE_COUNT][3] = {
+  { RGB_TO_YUV(0x6633ff) },   /* Purple */
+  { RGB_TO_YUV(0xcc33ff) },   /* Magenta */
+  { RGB_TO_YUV(0xff33cc) },   /* Pink */
+  { RGB_TO_YUV(0xff3366) },   /* Coral */
+  { RGB_TO_YUV(0x3366ff) },   /* Blue */
+  { RGB_TO_YUV(0xed00f5) },   /* Dark Blue */
+  { RGB_TO_YUV(0x2e00b8) },   /* Dark Purple */
+  {
RGB_TO_YUV(0xff6633) }, /* Orange */ + { RGB_TO_YUV(0x33ccff) }, /* Light Blue */ + { RGB_TO_YUV(0x8ab800) }, /* Green */ + { RGB_TO_YUV(0xffcc33) }, /* Light Orange */ + { RGB_TO_YUV(0x33ffcc) }, /* Aqua */ + { RGB_TO_YUV(0x66ff33) }, /* Light Green */ + { RGB_TO_YUV(0xccff33) }, /* Yellow */ +}; + +static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = { + { RGB_TO_YUV(0x00ff00) }, /* Blue */ + { RGB_TO_YUV(0x0000ff) }, /* Green */ + { RGB_TO_YUV(0xffff00) }, /* Yellow */ + { RGB_TO_YUV(0xff0000) }, /* Red */ +}; +#endif + +static const short kernel5[] = { + 1, 1, 4, 1, 1 +}; + +const short vp9_rv[] = { + 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, + 0, 3, 9, 0, 0, 0, 8, 3, 14, 4, + 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, + 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, + 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, + 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, + 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, + 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, + 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, + 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, + 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, + 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, + 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, + 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, + 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, + 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, + 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, + 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, + 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, + 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, + 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, + 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, + 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, + 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, + 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, + 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, + 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, + 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, + 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, + 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, + 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, + 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, + 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, + 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, + 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, + 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, + 11, 9, 14, 8, 14, 13, 4, 3, 1, 2, + 14, 6, 5, 4, 4, 11, 4, 6, 2, 1, + 5, 8, 8, 12, 13, 5, 14, 10, 12, 13, + 0, 9, 5, 5, 11, 10, 13, 9, 10, 13, +}; + + +/**************************************************************************** + */ +void vp9_post_proc_down_and_across_c(unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, + int rows, + int cols, + int flimit) { + unsigned char *p_src, *p_dst; + int row; + int col; + int i; + int v; + int pitch = src_pixels_per_line; + unsigned char d[8]; + (void)dst_pixels_per_line; + + for (row = 0; row < rows; row++) { + /* post_proc_down for one row */ + p_src = src_ptr; + p_dst = dst_ptr; + + for (col = 0; col < cols; col++) { + + int kernel = 4; + int v = p_src[col]; + + for (i = -2; i <= 2; i++) { + if (abs(v - p_src[col + i * pitch]) > flimit) + goto down_skip_convolve; + + kernel += kernel5[2 + i] * p_src[col + i * pitch]; + } + + v = (kernel >> 3); + down_skip_convolve: + p_dst[col] = v; + } + + /* now post_proc_across */ + p_src = dst_ptr; + p_dst = dst_ptr; + + for (i = 0; i < 8; i++) + d[i] = p_src[i]; + + for (col = 0; col < cols; col++) { + int kernel = 4; + v = p_src[col]; + + d[col & 7] = v; + + for (i = -2; i <= 2; i++) { + if (abs(v - p_src[col + i]) > flimit) + goto across_skip_convolve; + + kernel += kernel5[2 + i] * p_src[col + i]; + } + + d[col & 7] = (kernel >> 3); + across_skip_convolve: + + if (col >= 2) + p_dst[col - 2] = d[(col - 2) & 7]; + } + + /* handle the last two pixels */ + 
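+    /* (The across pass streams through the 8-entry delay buffer d[], so
+     * each output pixel lands two columns behind the input; the two
+     * stores below flush the final entries once the column loop ends.) */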
p_dst[col - 2] = d[(col - 2) & 7]; + p_dst[col - 1] = d[(col - 1) & 7]; + + + /* next row */ + src_ptr += pitch; + dst_ptr += pitch; + } +} + +static int q2mbl(int x) { + if (x < 20) x = 20; + + x = 50 + (x - 50) * 10 / 8; + return x * x / 3; +} + +void vp9_mbpost_proc_across_ip_c(unsigned char *src, int pitch, + int rows, int cols, int flimit) { + int r, c, i; + + unsigned char *s = src; + unsigned char d[16]; + + + for (r = 0; r < rows; r++) { + int sumsq = 0; + int sum = 0; + + for (i = -8; i <= 6; i++) { + sumsq += s[i] * s[i]; + sum += s[i]; + d[i + 8] = 0; + } + + for (c = 0; c < cols + 8; c++) { + int x = s[c + 7] - s[c - 8]; + int y = s[c + 7] + s[c - 8]; + + sum += x; + sumsq += x * y; + + d[c & 15] = s[c]; + + if (sumsq * 15 - sum * sum < flimit) { + d[c & 15] = (8 + sum + s[c]) >> 4; + } + + s[c - 8] = d[(c - 8) & 15]; + } + + s += pitch; + } +} + +void vp9_mbpost_proc_down_c(unsigned char *dst, int pitch, + int rows, int cols, int flimit) { + int r, c, i; + const short *rv3 = &vp9_rv[63 & rand()]; + + for (c = 0; c < cols; c++) { + unsigned char *s = &dst[c]; + int sumsq = 0; + int sum = 0; + unsigned char d[16]; + const short *rv2 = rv3 + ((c * 17) & 127); + + for (i = -8; i <= 6; i++) { + sumsq += s[i * pitch] * s[i * pitch]; + sum += s[i * pitch]; + } + + for (r = 0; r < rows + 8; r++) { + sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch]; + sum += s[7 * pitch] - s[-8 * pitch]; + d[r & 15] = s[0]; + + if (sumsq * 15 - sum * sum < flimit) { + d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; + } + + s[-8 * pitch] = d[(r - 8) & 15]; + s += pitch; + } + } +} + +static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + vp9_postproc_rtcd_vtable_t *rtcd) { + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + (void) low_var_thresh; + (void) flag; + + POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, + source->y_stride, post->y_stride, + source->y_height, source->y_width, ppl); + POSTPROC_INVOKE(rtcd, across)(post->y_buffer, post->y_stride, + post->y_height, post->y_width, q2mbl(q)); + POSTPROC_INVOKE(rtcd, down)(post->y_buffer, post->y_stride, + post->y_height, post->y_width, q2mbl(q)); + + POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, + source->uv_stride, post->uv_stride, + source->uv_height, source->uv_width, ppl); + POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, + source->uv_stride, post->uv_stride, + source->uv_height, source->uv_width, ppl); +} + +void vp9_deblock(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + vp9_postproc_rtcd_vtable_t *rtcd) { + double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; + int ppl = (int)(level + .5); + (void) low_var_thresh; + (void) flag; + + POSTPROC_INVOKE(rtcd, downacross)(source->y_buffer, post->y_buffer, + source->y_stride, post->y_stride, + source->y_height, source->y_width, ppl); + POSTPROC_INVOKE(rtcd, downacross)(source->u_buffer, post->u_buffer, + source->uv_stride, post->uv_stride, + source->uv_height, source->uv_width, ppl); + POSTPROC_INVOKE(rtcd, downacross)(source->v_buffer, post->v_buffer, + source->uv_stride, post->uv_stride, + source->uv_height, source->uv_width, ppl); +} + +void vp9_de_noise(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + vp9_postproc_rtcd_vtable_t *rtcd) { + double level = 6.0e-05 * 
+void vp9_de_noise(YV12_BUFFER_CONFIG *src,
+                  YV12_BUFFER_CONFIG *post,
+                  int q,
+                  int low_var_thresh,
+                  int flag,
+                  vp9_postproc_rtcd_vtable_t *rtcd) {
+  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
+  int ppl = (int)(level + .5);
+  (void) post;
+  (void) low_var_thresh;
+  (void) flag;
+
+  POSTPROC_INVOKE(rtcd, downacross)(src->y_buffer + 2 * src->y_stride + 2,
+                                    src->y_buffer + 2 * src->y_stride + 2,
+                                    src->y_stride,
+                                    src->y_stride,
+                                    src->y_height - 4,
+                                    src->y_width - 4,
+                                    ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(src->u_buffer + 2 * src->uv_stride + 2,
+                                    src->u_buffer + 2 * src->uv_stride + 2,
+                                    src->uv_stride,
+                                    src->uv_stride,
+                                    src->uv_height - 4,
+                                    src->uv_width - 4, ppl);
+  POSTPROC_INVOKE(rtcd, downacross)(src->v_buffer + 2 * src->uv_stride + 2,
+                                    src->v_buffer + 2 * src->uv_stride + 2,
+                                    src->uv_stride,
+                                    src->uv_stride,
+                                    src->uv_height - 4,
+                                    src->uv_width - 4, ppl);
+}
+
+double vp9_gaussian(double sigma, double mu, double x) {
+  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
+}
+
+static void fillrd(struct postproc_state *state, int q, int a) {
+  char char_dist[300];
+
+  double sigma;
+  int ai = a, qi = q, i;
+
+  vp9_clear_system_state();
+
+  sigma = ai + .5 + .6 * (63 - qi) / 63.0;
+
+  /* set up a lookup table of 256 entries that matches
+   * a Gaussian distribution with sigma determined by q.
+   */
+  {
+    double i;
+    int next, j;
+
+    next = 0;
+
+    for (i = -32; i < 32; i++) {
+      int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i));
+
+      if (a) {
+        for (j = 0; j < a; j++) {
+          char_dist[next + j] = (char) i;
+        }
+
+        next = next + j;
+      }
+    }
+
+    for (; next < 256; next++)
+      char_dist[next] = 0;
+  }
+
+  for (i = 0; i < 3072; i++) {
+    state->noise[i] = char_dist[rand() & 0xff];
+  }
+
+  for (i = 0; i < 16; i++) {
+    state->blackclamp[i] = -char_dist[0];
+    state->whiteclamp[i] = -char_dist[0];
+    state->bothclamp[i] = -2 * char_dist[0];
+  }
+
+  state->last_q = q;
+  state->last_noise = a;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : plane_add_noise_c
+ *
+ *  INPUTS        : unsigned char *Start  starting address of buffer to
+ *                                        add Gaussian noise to
+ *                  unsigned int Width    width of plane
+ *                  unsigned int Height   height of plane
+ *                  int Pitch             distance between subsequent lines
+ *                                        of frame
+ *                  int q                 quantizer used to determine amount
+ *                                        of noise to add
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void.
+ *
+ *  FUNCTION      : adds Gaussian noise to a plane of pixels
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void vp9_plane_add_noise_c(unsigned char *Start, char *noise,
+                           char blackclamp[16],
+                           char whiteclamp[16],
+                           char bothclamp[16],
+                           unsigned int Width, unsigned int Height, int Pitch) {
+  unsigned int i, j;
+
+  for (i = 0; i < Height; i++) {
+    unsigned char *Pos = Start + i * Pitch;
+    char *Ref = (char *)(noise + (rand() & 0xff));
+
+    for (j = 0; j < Width; j++) {
+      if (Pos[j] < blackclamp[0])
+        Pos[j] = blackclamp[0];
+
+      if (Pos[j] > 255 + whiteclamp[0])
+        Pos[j] = 255 + whiteclamp[0];
+
+      Pos[j] += Ref[j];
+    }
+  }
+}
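[Editor's aside, illustration only.] `fillrd()` fills `char_dist[]` so that each signed noise value occupies a number of table slots proportional to a normal density; indexing with `rand() & 0xff` then draws noise that approximately follows that distribution. A standalone check that the rounded weights fill roughly the 256 slots used (the `a = 1, q = 20` inputs are arbitrary; the 300-entry buffer in the patch absorbs rounding overshoot):

    #include <math.h>
    #include <stdio.h>

    /* Same density as vp9_gaussian() above. */
    static double gaussian(double sigma, double mu, double x) {
      return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
             exp(-(x - mu) * (x - mu) / (2 * sigma * sigma));
    }

    int main(void) {
      /* sigma as fillrd() would derive it for a = 1, q = 20. */
      double sigma = 1 + .5 + .6 * (63 - 20) / 63.0;
      int total = 0, i;

      for (i = -32; i < 32; i++)
        total += (int)(.5 + 256 * gaussian(sigma, 0, i));

      /* total should land near 256: each unit is one slot in the
       * 256-entry lookup, so slot frequency mirrors the density. */
      printf("sigma=%.3f  slots=%d\n", sigma, total);
      return 0;
    }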
+/* Blend the macro block with a solid colored square. Leave the
+ * edges unblended to give distinction to macro blocks in areas
+ * filled with the same color block.
+ */
+void vp9_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v,
+                          int y1, int u1, int v1, int alpha, int stride) {
+  int i, j;
+  int y1_const = y1 * ((1 << 16) - alpha);
+  int u1_const = u1 * ((1 << 16) - alpha);
+  int v1_const = v1 * ((1 << 16) - alpha);
+
+  y += 2 * stride + 2;
+  for (i = 0; i < 12; i++) {
+    for (j = 0; j < 12; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  stride >>= 1;
+
+  u += stride + 1;
+  v += stride + 1;
+
+  for (i = 0; i < 6; i++) {
+    for (j = 0; j < 6; j++) {
+      u[j] = (u[j] * alpha + u1_const) >> 16;
+      v[j] = (v[j] * alpha + v1_const) >> 16;
+    }
+    u += stride;
+    v += stride;
+  }
+}
+
+/* Blend only the edge of the macro block. Leave center
+ * unblended to allow for other visualizations to be layered.
+ */
+void vp9_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v,
+                          int y1, int u1, int v1, int alpha, int stride) {
+  int i, j;
+  int y1_const = y1 * ((1 << 16) - alpha);
+  int u1_const = u1 * ((1 << 16) - alpha);
+  int v1_const = v1 * ((1 << 16) - alpha);
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 16; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  for (i = 0; i < 12; i++) {
+    y[0]  = (y[0] * alpha  + y1_const) >> 16;
+    y[1]  = (y[1] * alpha  + y1_const) >> 16;
+    y[14] = (y[14] * alpha + y1_const) >> 16;
+    y[15] = (y[15] * alpha + y1_const) >> 16;
+    y += stride;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 16; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  stride >>= 1;
+
+  for (j = 0; j < 8; j++) {
+    u[j] = (u[j] * alpha + u1_const) >> 16;
+    v[j] = (v[j] * alpha + v1_const) >> 16;
+  }
+  u += stride;
+  v += stride;
+
+  for (i = 0; i < 6; i++) {
+    u[0] = (u[0] * alpha + u1_const) >> 16;
+    v[0] = (v[0] * alpha + v1_const) >> 16;
+
+    u[7] = (u[7] * alpha + u1_const) >> 16;
+    v[7] = (v[7] * alpha + v1_const) >> 16;
+
+    u += stride;
+    v += stride;
+  }
+
+  for (j = 0; j < 8; j++) {
+    u[j] = (u[j] * alpha + u1_const) >> 16;
+    v[j] = (v[j] * alpha + v1_const) >> 16;
+  }
+}
+
+void vp9_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v,
+                   int y1, int u1, int v1, int alpha, int stride) {
+  int i, j;
+  int y1_const = y1 * ((1 << 16) - alpha);
+  int u1_const = u1 * ((1 << 16) - alpha);
+  int v1_const = v1 * ((1 << 16) - alpha);
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      y[j] = (y[j] * alpha + y1_const) >> 16;
+    }
+    y += stride;
+  }
+
+  stride >>= 1;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      u[j] = (u[j] * alpha + u1_const) >> 16;
+      v[j] = (v[j] * alpha + v1_const) >> 16;
+    }
+    u += stride;
+    v += stride;
+  }
+}
+
+static void constrain_line(int x0, int *x1, int y0, int *y1,
+                           int width, int height) {
+  int dx;
+  int dy;
+
+  if (*x1 > width) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *x1 = width;
+    if (dx)
+      *y1 = ((width - x0) * dy) / dx + y0;
+  }
+  if (*x1 < 0) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *x1 = 0;
+    if (dx)
+      *y1 = ((0 - x0) * dy) / dx + y0;
+  }
+  if (*y1 > height) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *y1 = height;
+    if (dy)
+      *x1 = ((height - y0) * dx) / dy + x0;
+  }
+  if (*y1 < 0) {
+    dx = *x1 - x0;
+    dy = *y1 - y0;
+
+    *y1 = 0;
+    if (dy)
+      *x1 = ((0 - y0) * dx) / dy + x0;
+  }
+}
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(oci) (&(oci)->rtcd.postproc)
+#else
+#define RTCD_VTABLE(oci) NULL
+#endif
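[Editor's aside, illustration only.] `constrain_line()` clips the far endpoint of a motion-vector ray against the frame edges while preserving the ray's direction. A small usage sketch, assuming it is compiled in the same translation unit as the static function above; the frame size and vector are made-up values:

    #include <stdio.h>

    int main(void) {
      /* A 16x16 block at (304, 224) in a 320x240 frame, with a motion
       * vector pointing 40px right and 10px down from the block centre. */
      int x0 = 304 + 8, y0 = 224 + 8;
      int x1 = x0 + 40, y1 = y0 + 10;

      constrain_line(x0, &x1, y0, &y1, 320, 240);
      /* x1 is pulled back to the right frame edge and y1 is moved onto
       * the clipped line, giving (320, 234) here. */
      printf("clipped endpoint: (%d, %d)\n", x1, y1);
      return 0;
    }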
+int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest,
+                        vp9_ppflags_t *ppflags) {
+  int q = oci->filter_level * 10 / 6;
+  int flags = ppflags->post_proc_flag;
+  int deblock_level = ppflags->deblocking_level;
+  int noise_level = ppflags->noise_level;
+
+  if (!oci->frame_to_show)
+    return -1;
+
+  if (q > 63)
+    q = 63;
+
+  if (!flags) {
+    *dest = *oci->frame_to_show;
+
+    /* handle problem with extending borders */
+    dest->y_width = oci->Width;
+    dest->y_height = oci->Height;
+    dest->uv_height = dest->y_height / 2;
+    return 0;
+  }
+
+#if ARCH_X86 || ARCH_X86_64
+  vpx_reset_mmx_state();
+#endif
+
+  if (flags & VP9D_DEMACROBLOCK) {
+    deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer,
+                               q + (deblock_level - 5) * 10, 1, 0,
+                               RTCD_VTABLE(oci));
+  } else if (flags & VP9D_DEBLOCK) {
+    vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer,
+                q, 1, 0, RTCD_VTABLE(oci));
+  } else {
+    vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer);
+  }
+
+  if (flags & VP9D_ADDNOISE) {
+    if (oci->postproc_state.last_q != q
+        || oci->postproc_state.last_noise != noise_level) {
+      fillrd(&oci->postproc_state, 63 - q, noise_level);
+    }
+
+    POSTPROC_INVOKE(RTCD_VTABLE(oci), addnoise)(oci->post_proc_buffer.y_buffer,
+                                                oci->postproc_state.noise,
+                                                oci->postproc_state.blackclamp,
+                                                oci->postproc_state.whiteclamp,
+                                                oci->postproc_state.bothclamp,
+                                                oci->post_proc_buffer.y_width,
+                                                oci->post_proc_buffer.y_height,
+                                                oci->post_proc_buffer.y_stride);
+  }
+
+#if CONFIG_POSTPROC_VISUALIZER
+  if (flags & VP9D_DEBUG_TXT_FRAME_INFO) {
+    char message[512];
+    sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
+            (oci->frame_type == KEY_FRAME),
+            oci->refresh_golden_frame,
+            oci->base_qindex,
+            oci->filter_level,
+            flags,
+            oci->mb_cols, oci->mb_rows);
+    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
+                  oci->post_proc_buffer.y_stride);
+  }
+
+  if (flags & VP9D_DEBUG_TXT_MBLK_MODES) {
+    int i, j;
+    unsigned char *y_ptr;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int mb_rows = post->y_height >> 4;
+    int mb_cols = post->y_width >> 4;
+    int mb_index = 0;
+    MODE_INFO *mi = oci->mi;
+
+    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+    /* filter each macro block */
+    for (i = 0; i < mb_rows; i++) {
+      for (j = 0; j < mb_cols; j++) {
+        char zz[4];
+
+        sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a');
+
+        vp9_blit_text(zz, y_ptr, post->y_stride);
+        mb_index++;
+        y_ptr += 16;
+      }
+
+      mb_index++; /* border */
+      y_ptr += post->y_stride * 16 - post->y_width;
+    }
+  }
+
+  if (flags & VP9D_DEBUG_TXT_DC_DIFF) {
+    int i, j;
+    unsigned char *y_ptr;
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int mb_rows = post->y_height >> 4;
+    int mb_cols = post->y_width >> 4;
+    int mb_index = 0;
+    MODE_INFO *mi = oci->mi;
+
+    y_ptr = post->y_buffer + 4 * post->y_stride + 4;
+
+    /* filter each macro block */
+    for (i = 0; i < mb_rows; i++) {
+      for (j = 0; j < mb_cols; j++) {
+        char zz[4];
+        int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
+                        mi[mb_index].mbmi.mode != SPLITMV &&
+                        mi[mb_index].mbmi.mb_skip_coeff);
+
+        if (oci->frame_type == KEY_FRAME)
+          sprintf(zz, "a");
+        else
+          sprintf(zz, "%c", dc_diff + '0');
+
+        vp9_blit_text(zz, y_ptr, post->y_stride);
+        mb_index++;
+        y_ptr += 16;
+      }
+
+      mb_index++; /* border */
+      y_ptr += post->y_stride * 16 - post->y_width;
+    }
+  }
+
+  if (flags & VP9D_DEBUG_TXT_RATE_INFO) {
+    char message[512];
+    snprintf(message, sizeof(message),
+             "Bitrate: %10.2f frame_rate: %10.2f ",
+             oci->bitrate, oci->framerate);
+    vp9_blit_text(message, oci->post_proc_buffer.y_buffer,
+                  oci->post_proc_buffer.y_stride);
+  }
+
+  /* Draw motion vectors */
+  if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) {
+    YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
+    int width = post->y_width;
+    int height = post->y_height;
+    unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
+    int y_stride = oci->post_proc_buffer.y_stride;
+    MODE_INFO *mi = oci->mi;
+    int x0, y0;
+
+    for (y0 = 0; y0 < height; y0 += 16) {
+      for (x0 = 0; x0 < width; x0 += 16) {
+        int x1, y1;
+
+        if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) {
+          mi++;
+          continue;
+        }
+
+        if (mi->mbmi.mode == SPLITMV) {
+          switch (mi->mbmi.partitioning) {
+            case PARTITIONING_16X8 : {  /* mv_top_bottom */
+              union b_mode_info *bmi = &mi->bmi[0];
+              MV *mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 8 + (mv->col >> 3);
+              y1 = y0 + 4 + (mv->row >> 3);
+
+              constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height);
+              vp9_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[8];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 8 + (mv->col >> 3);
+              y1 = y0 + 12 + (mv->row >> 3);
+
+              constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height);
+              vp9_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride);
+
+              break;
+            }
+            case PARTITIONING_8X16 : {  /* mv_left_right */
+              union b_mode_info *bmi = &mi->bmi[0];
+              MV *mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 4 + (mv->col >> 3);
+              y1 = y0 + 8 + (mv->row >> 3);
+
+              constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height);
+              vp9_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[2];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 12 + (mv->col >> 3);
+              y1 = y0 + 8 + (mv->row >> 3);
+
+              constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height);
+              vp9_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride);
+
+              break;
+            }
+            case PARTITIONING_8X8 : {  /* mv_quarters */
+              union b_mode_info *bmi = &mi->bmi[0];
+              MV *mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 4 + (mv->col >> 3);
+              y1 = y0 + 4 + (mv->row >> 3);
+
+              constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height);
+              vp9_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[2];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 12 + (mv->col >> 3);
+              y1 = y0 + 4 + (mv->row >> 3);
+
+              constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height);
+              vp9_blit_line(x0 + 12, x1, y0 + 4, y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[8];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 4 + (mv->col >> 3);
+              y1 = y0 + 12 + (mv->row >> 3);
+
+              constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height);
+              vp9_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride);
+
+              bmi = &mi->bmi[10];
+              mv = &bmi->mv.as_mv;
+
+              x1 = x0 + 12 + (mv->col >> 3);
+              y1 = y0 + 12 + (mv->row >> 3);
+
+              constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height);
+              vp9_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride);
+              break;
+            }
+            case PARTITIONING_4X4:
+            default : {
+              union b_mode_info *bmi = mi->bmi;
+              int bx0, by0;
+
+              for (by0 = y0; by0 < (y0 + 16); by0 += 4) {
+                for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) {
+                  MV *mv = &bmi->mv.as_mv;
+
+                  x1 = bx0 + 2 + (mv->col >> 3);
+                  y1 = by0 + 2 + (mv->row >> 3);
+
+                  constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height);
+                  vp9_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride);
+
+                  bmi++;
+                }
+              }
+            }
+          }
+        } else if (mi->mbmi.mode >= NEARESTMV) {
+          MV *mv = &mi->mbmi.mv.as_mv;
+          const int lx0 = x0 + 8;
+          const int ly0 = y0 + 8;
+
+          x1 = lx0 + (mv->col >> 3);
+          y1 = ly0 + (mv->row >> 3);
+
+          if (x1 != lx0 && y1 != ly0) {
+            constrain_line(lx0, &x1, ly0 - 1, &y1, width, height);
+            vp9_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride);
+
+            constrain_line(lx0, &x1, ly0 + 1, &y1, width, height);
+            vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride);
+          } else
+            vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride);
+        }
+
+        mi++;
+      }
+      mi++;
+    }
+  }
+
+  /* Color in block modes */
+  if ((flags &
VP9D_DEBUG_CLR_BLK_MODES) + && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) { + int y, x; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int width = post->y_width; + int height = post->y_height; + unsigned char *y_ptr = oci->post_proc_buffer.y_buffer; + unsigned char *u_ptr = oci->post_proc_buffer.u_buffer; + unsigned char *v_ptr = oci->post_proc_buffer.v_buffer; + int y_stride = oci->post_proc_buffer.y_stride; + MODE_INFO *mi = oci->mi; + + for (y = 0; y < height; y += 16) { + for (x = 0; x < width; x += 16) { + int Y = 0, U = 0, V = 0; + + if (mi->mbmi.mode == B_PRED && + ((ppflags->display_mb_modes_flag & B_PRED) || + ppflags->display_b_modes_flag)) { + int by, bx; + unsigned char *yl, *ul, *vl; + union b_mode_info *bmi = mi->bmi; + + yl = y_ptr + x; + ul = u_ptr + (x >> 1); + vl = v_ptr + (x >> 1); + + for (by = 0; by < 16; by += 4) { + for (bx = 0; bx < 16; bx += 4) { + if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) + || (ppflags->display_mb_modes_flag & B_PRED)) { + Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0]; + U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1]; + V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2]; + + POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)(yl + bx, + ul + (bx >> 1), + vl + (bx >> 1), + Y, U, V, + 0xc000, y_stride); + } + bmi++; + } + + yl += y_stride * 4; + ul += y_stride * 1; + vl += y_stride * 1; + } + } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) { + Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0]; + U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1]; + V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2]; + + POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_inner)(y_ptr + x, + u_ptr + (x >> 1), + v_ptr + (x >> 1), + Y, U, V, + 0xc000, y_stride); + } + + mi++; + } + y_ptr += y_stride * 16; + u_ptr += y_stride * 4; + v_ptr += y_stride * 4; + + mi++; + } + } + + /* Color in frame reference blocks */ + if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) && + ppflags->display_ref_frame_flag) { + int y, x; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int width = post->y_width; + int height = post->y_height; + unsigned char *y_ptr = oci->post_proc_buffer.y_buffer; + unsigned char *u_ptr = oci->post_proc_buffer.u_buffer; + unsigned char *v_ptr = oci->post_proc_buffer.v_buffer; + int y_stride = oci->post_proc_buffer.y_stride; + MODE_INFO *mi = oci->mi; + + for (y = 0; y < height; y += 16) { + for (x = 0; x < width; x += 16) { + int Y = 0, U = 0, V = 0; + + if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) { + Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; + U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; + V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; + + POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_mb_outer)(y_ptr + x, + u_ptr + (x >> 1), + v_ptr + (x >> 1), + Y, U, V, + 0xc000, y_stride); + } + + mi++; + } + y_ptr += y_stride * 16; + u_ptr += y_stride * 4; + v_ptr += y_stride * 4; + + mi++; + } + } +#endif + + *dest = oci->post_proc_buffer; + + /* handle problem with extending borders */ + dest->y_width = oci->Width; + dest->y_height = oci->Height; + dest->uv_height = dest->y_height / 2; + + return 0; +} diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h new file mode 100644 index 000000000..bf56b5692 --- /dev/null +++ b/vp9/common/vp9_postproc.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef POSTPROC_H +#define POSTPROC_H + +#define prototype_postproc_inplace(sym)\ + void sym(unsigned char *dst, int pitch, int rows, int cols, int flimit) + +#define prototype_postproc(sym)\ + void sym(unsigned char *src, unsigned char *dst, int src_pitch, \ + int dst_pitch, int rows, int cols, int flimit) + +#define prototype_postproc_addnoise(sym) \ + void sym(unsigned char *s, char *noise, char blackclamp[16], \ + char whiteclamp[16], char bothclamp[16], \ + unsigned int w, unsigned int h, int pitch) + +#define prototype_postproc_blend_mb_inner(sym)\ + void sym(unsigned char *y, unsigned char *u, unsigned char *v, \ + int y1, int u1, int v1, int alpha, int stride) + +#define prototype_postproc_blend_mb_outer(sym)\ + void sym(unsigned char *y, unsigned char *u, unsigned char *v, \ + int y1, int u1, int v1, int alpha, int stride) + +#define prototype_postproc_blend_b(sym)\ + void sym(unsigned char *y, unsigned char *u, unsigned char *v, \ + int y1, int u1, int v1, int alpha, int stride) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/vp9_postproc_x86.h" +#endif + +#ifndef vp9_postproc_down +#define vp9_postproc_down vp9_mbpost_proc_down_c +#endif +extern prototype_postproc_inplace(vp9_postproc_down); + +#ifndef vp9_postproc_across +#define vp9_postproc_across vp9_mbpost_proc_across_ip_c +#endif +extern prototype_postproc_inplace(vp9_postproc_across); + +#ifndef vp9_postproc_downacross +#define vp9_postproc_downacross vp9_post_proc_down_and_across_c +#endif +extern prototype_postproc(vp9_postproc_downacross); + +#ifndef vp9_postproc_addnoise +#define vp9_postproc_addnoise vp9_plane_add_noise_c +#endif +extern prototype_postproc_addnoise(vp9_postproc_addnoise); + +#ifndef vp9_postproc_blend_mb_inner +#define vp9_postproc_blend_mb_inner vp9_blend_mb_inner_c +#endif +extern prototype_postproc_blend_mb_inner(vp9_postproc_blend_mb_inner); + +#ifndef vp9_postproc_blend_mb_outer +#define vp9_postproc_blend_mb_outer vp9_blend_mb_outer_c +#endif +extern prototype_postproc_blend_mb_outer(vp9_postproc_blend_mb_outer); + +#ifndef vp9_postproc_blend_b +#define vp9_postproc_blend_b vp9_blend_b_c +#endif +extern prototype_postproc_blend_b(vp9_postproc_blend_b); + +typedef prototype_postproc((*vp9_postproc_fn_t)); +typedef prototype_postproc_inplace((*vp9_postproc_inplace_fn_t)); +typedef prototype_postproc_addnoise((*vp9_postproc_addnoise_fn_t)); +typedef prototype_postproc_blend_mb_inner((*vp9_postproc_blend_mb_inner_fn_t)); +typedef prototype_postproc_blend_mb_outer((*vp9_postproc_blend_mb_outer_fn_t)); +typedef prototype_postproc_blend_b((*vp9_postproc_blend_b_fn_t)); +typedef struct { + vp9_postproc_inplace_fn_t down; + vp9_postproc_inplace_fn_t across; + vp9_postproc_fn_t downacross; + vp9_postproc_addnoise_fn_t addnoise; + vp9_postproc_blend_mb_inner_fn_t blend_mb_inner; + vp9_postproc_blend_mb_outer_fn_t blend_mb_outer; + vp9_postproc_blend_b_fn_t blend_b; +} vp9_postproc_rtcd_vtable_t; + +#if CONFIG_RUNTIME_CPU_DETECT +#define POSTPROC_INVOKE(ctx,fn) (ctx)->fn +#else +#define POSTPROC_INVOKE(ctx,fn) vp9_postproc_##fn +#endif + +#include "vpx_ports/mem.h" +struct postproc_state { + int last_q; + int last_noise; + char noise[3072]; + DECLARE_ALIGNED(16, char, 
blackclamp[16]); + DECLARE_ALIGNED(16, char, whiteclamp[16]); + DECLARE_ALIGNED(16, char, bothclamp[16]); +}; +#include "vp9_onyxc_int.h" +#include "vp9_ppflags.h" +int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *flags); + + +void vp9_de_noise(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + vp9_postproc_rtcd_vtable_t *rtcd); + +void vp9_deblock(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *post, + int q, + int low_var_thresh, + int flag, + vp9_postproc_rtcd_vtable_t *rtcd); +#endif diff --git a/vp9/common/vp9_ppflags.h b/vp9/common/vp9_ppflags.h new file mode 100644 index 000000000..fd8371180 --- /dev/null +++ b/vp9/common/vp9_ppflags.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_PPFLAGS_H +#define __INC_PPFLAGS_H +enum { + VP9D_NOFILTERING = 0, + VP9D_DEBLOCK = 1 << 0, + VP9D_DEMACROBLOCK = 1 << 1, + VP9D_ADDNOISE = 1 << 2, + VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3, + VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4, + VP9D_DEBUG_TXT_DC_DIFF = 1 << 5, + VP9D_DEBUG_TXT_RATE_INFO = 1 << 6, + VP9D_DEBUG_DRAW_MV = 1 << 7, + VP9D_DEBUG_CLR_BLK_MODES = 1 << 8, + VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9 +}; + +typedef struct { + int post_proc_flag; + int deblocking_level; + int noise_level; + int display_ref_frame_flag; + int display_mb_modes_flag; + int display_b_modes_flag; + int display_mv_flag; +} vp9_ppflags_t; + +#endif diff --git a/vp9/common/vp9_pragmas.h b/vp9/common/vp9_pragmas.h new file mode 100644 index 000000000..99fee5ae2 --- /dev/null +++ b/vp9/common/vp9_pragmas.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + + + +#ifdef __INTEL_COMPILER +#pragma warning(disable:997 1011 170) +#endif +#ifdef _MSC_VER +#pragma warning(disable:4799) +#endif diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c new file mode 100644 index 000000000..cb65d7282 --- /dev/null +++ b/vp9/common/vp9_pred_common.c @@ -0,0 +1,463 @@ + +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+// TBD prediction functions for various bitstream signals
+
+// Returns a context number for the given MB prediction signal
+unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
+                                   const MACROBLOCKD *const xd,
+                                   PRED_ID pred_id) {
+  int pred_context;
+  MODE_INFO *m = xd->mode_info_context;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_context = (m - 1)->mbmi.seg_id_predicted +
+                     (m - cm->mode_info_stride)->mbmi.seg_id_predicted;
+      break;
+
+    case PRED_REF:
+      pred_context = (m - 1)->mbmi.ref_predicted +
+                     (m - cm->mode_info_stride)->mbmi.ref_predicted;
+      break;
+
+    case PRED_COMP:
+      // Context based on use of comp pred flag by neighbours
+      // pred_context =
+      //   ((m - 1)->mbmi.second_ref_frame > INTRA_FRAME) +
+      //   ((m - cm->mode_info_stride)->mbmi.second_ref_frame > INTRA_FRAME);
+
+      // Context based on mode and reference frame
+      // if ( m->mbmi.ref_frame == LAST_FRAME )
+      //   pred_context = 0 + (m->mbmi.mode != ZEROMV);
+      // else if ( m->mbmi.ref_frame == GOLDEN_FRAME )
+      //   pred_context = 2 + (m->mbmi.mode != ZEROMV);
+      // else
+      //   pred_context = 4 + (m->mbmi.mode != ZEROMV);
+
+      if (m->mbmi.ref_frame == LAST_FRAME)
+        pred_context = 0;
+      else
+        pred_context = 1;
+
+      break;
+
+    case PRED_MBSKIP:
+      pred_context = (m - 1)->mbmi.mb_skip_coeff +
+                     (m - cm->mode_info_stride)->mbmi.mb_skip_coeff;
+      break;
+
+    case PRED_SWITCHABLE_INTERP:
+      {
+        int left_in_image = (m - 1)->mbmi.mb_in_image;
+        int above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
+        int left_mode = (m - 1)->mbmi.mode;
+        int above_mode = (m - cm->mode_info_stride)->mbmi.mode;
+        int left_interp, above_interp;
+        if (left_in_image && left_mode >= NEARESTMV && left_mode <= SPLITMV)
+          left_interp = vp9_switchable_interp_map[(m - 1)->mbmi.interp_filter];
+        else
+          left_interp = VP9_SWITCHABLE_FILTERS;
+        if (above_in_image && above_mode >= NEARESTMV && above_mode <= SPLITMV)
+          above_interp = vp9_switchable_interp_map[
+              (m - cm->mode_info_stride)->mbmi.interp_filter];
+        else
+          above_interp = VP9_SWITCHABLE_FILTERS;
+
+        if (left_interp == above_interp)
+          pred_context = left_interp;
+        else if (left_interp == VP9_SWITCHABLE_FILTERS &&
+                 above_interp != VP9_SWITCHABLE_FILTERS)
+          pred_context = above_interp;
+        else if (left_interp != VP9_SWITCHABLE_FILTERS &&
+                 above_interp == VP9_SWITCHABLE_FILTERS)
+          pred_context = left_interp;
+        else
+          pred_context = VP9_SWITCHABLE_FILTERS;
+      }
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_context = 0;
+      break;
+  }
+
+  return pred_context;
+}
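[Editor's aside, illustration only.] For the two-flag cases above (PRED_SEG_ID, PRED_REF, PRED_MBSKIP) the context is simply the number of left/above neighbours with the flag set, so exactly three contexts exist. A trivial standalone illustration:

    #include <stdio.h>

    /* Mirrors the neighbour-sum pattern used by vp9_get_pred_context()
     * for PRED_SEG_ID, PRED_REF and PRED_MBSKIP. */
    static int two_neighbour_context(int left_flag, int above_flag) {
      return left_flag + above_flag;  /* 0, 1 or 2 */
    }

    int main(void) {
      int l, a;
      for (l = 0; l <= 1; l++)
        for (a = 0; a <= 1; a++)
          printf("left=%d above=%d -> context %d\n",
                 l, a, two_neighbour_context(l, a));
      return 0;
    }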
+// This function returns a context probability for coding a given
+// prediction signal
+vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
+                           const MACROBLOCKD *const xd,
+                           PRED_ID pred_id) {
+  vp9_prob pred_probability;
+  int pred_context;
+
+  // Get the appropriate prediction context
+  pred_context = vp9_get_pred_context(cm, xd, pred_id);
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_probability = cm->segment_pred_probs[pred_context];
+      break;
+
+    case PRED_REF:
+      pred_probability = cm->ref_pred_probs[pred_context];
+      break;
+
+    case PRED_COMP:
+      // In keeping with convention elsewhere the probability returned is
+      // the probability of a "0" outcome which in this case means the
+      // probability of comp pred off.
+      pred_probability = cm->prob_comppred[pred_context];
+      break;
+
+    case PRED_MBSKIP:
+      pred_probability = cm->mbskip_pred_probs[pred_context];
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_probability = 128;
+      break;
+  }
+
+  return pred_probability;
+}
+
+// This function returns a context probability ptr for coding a given
+// prediction signal
+const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
+                                   const MACROBLOCKD *const xd,
+                                   PRED_ID pred_id) {
+  const vp9_prob *pred_probability;
+  int pred_context;
+
+  // Get the appropriate prediction context
+  pred_context = vp9_get_pred_context(cm, xd, pred_id);
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_probability = &cm->segment_pred_probs[pred_context];
+      break;
+
+    case PRED_REF:
+      pred_probability = &cm->ref_pred_probs[pred_context];
+      break;
+
+    case PRED_COMP:
+      // In keeping with convention elsewhere the probability returned is
+      // the probability of a "0" outcome which in this case means the
+      // probability of comp pred off.
+      pred_probability = &cm->prob_comppred[pred_context];
+      break;
+
+    case PRED_MBSKIP:
+      pred_probability = &cm->mbskip_pred_probs[pred_context];
+      break;
+
+    case PRED_SWITCHABLE_INTERP:
+      pred_probability = &cm->fc.switchable_interp_prob[pred_context][0];
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_probability = NULL;
+      break;
+  }
+
+  return pred_probability;
+}
+
+// This function returns the status of the given prediction signal.
+// I.e. is the predicted value for the given signal correct.
+unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
+                                PRED_ID pred_id) {
+  unsigned char pred_flag = 0;
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      pred_flag = xd->mode_info_context->mbmi.seg_id_predicted;
+      break;
+
+    case PRED_REF:
+      pred_flag = xd->mode_info_context->mbmi.ref_predicted;
+      break;
+
+    case PRED_MBSKIP:
+      pred_flag = xd->mode_info_context->mbmi.mb_skip_coeff;
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      pred_flag = 0;
+      break;
+  }
+
+  return pred_flag;
+}
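[Editor's aside, illustration only.] The `vp9_prob` values returned above are 8-bit probabilities of the "0" branch scaled by 256, which is why 128 serves as the uninformative fallback. A standalone reference for the mapping:

    #include <stdio.h>

    typedef unsigned char vp9_prob;

    int main(void) {
      /* A vp9_prob p encodes P(bit == 0) ~= p / 256, so the fallback
       * value 128 corresponds to an even 50/50 prior. */
      vp9_prob probs[] = { 1, 64, 128, 192, 255 };
      int i;
      for (i = 0; i < 5; i++)
        printf("p=%3d  P(0)=%5.1f%%  P(1)=%5.1f%%\n", probs[i],
               probs[i] * 100.0 / 256, (256 - probs[i]) * 100.0 / 256);
      return 0;
    }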
+// This function sets the status of the given prediction signal.
+// I.e. is the predicted value for the given signal correct.
+void vp9_set_pred_flag(MACROBLOCKD *const xd,
+                       PRED_ID pred_id,
+                       unsigned char pred_flag) {
+#if CONFIG_SUPERBLOCKS
+  const int mis = xd->mode_info_stride;
+#endif
+
+  switch (pred_id) {
+    case PRED_SEG_ID:
+      xd->mode_info_context->mbmi.seg_id_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (xd->mb_to_right_edge >= 0)
+          xd->mode_info_context[1].mbmi.seg_id_predicted = pred_flag;
+        if (xd->mb_to_bottom_edge >= 0) {
+          xd->mode_info_context[mis].mbmi.seg_id_predicted = pred_flag;
+          if (xd->mb_to_right_edge >= 0)
+            xd->mode_info_context[mis + 1].mbmi.seg_id_predicted = pred_flag;
+        }
+      }
+#endif
+      break;
+
+    case PRED_REF:
+      xd->mode_info_context->mbmi.ref_predicted = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (xd->mb_to_right_edge >= 0)
+          xd->mode_info_context[1].mbmi.ref_predicted = pred_flag;
+        if (xd->mb_to_bottom_edge >= 0) {
+          xd->mode_info_context[mis].mbmi.ref_predicted = pred_flag;
+          if (xd->mb_to_right_edge >= 0)
+            xd->mode_info_context[mis + 1].mbmi.ref_predicted = pred_flag;
+        }
+      }
+#endif
+      break;
+
+    case PRED_MBSKIP:
+      xd->mode_info_context->mbmi.mb_skip_coeff = pred_flag;
+#if CONFIG_SUPERBLOCKS
+      if (xd->mode_info_context->mbmi.encoded_as_sb) {
+        if (xd->mb_to_right_edge >= 0)
+          xd->mode_info_context[1].mbmi.mb_skip_coeff = pred_flag;
+        if (xd->mb_to_bottom_edge >= 0) {
+          xd->mode_info_context[mis].mbmi.mb_skip_coeff = pred_flag;
+          if (xd->mb_to_right_edge >= 0)
+            xd->mode_info_context[mis + 1].mbmi.mb_skip_coeff = pred_flag;
+        }
+      }
+#endif
+      break;
+
+    default:
+      // TODO *** add error trap code.
+      break;
+  }
+}
+
+
+// The following contain the guts of the prediction code used to
+// predict various bitstream signals.
+
+// Macroblock segment id prediction function
+unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm,
+                                    const MACROBLOCKD *const xd, int MbIndex) {
+  // Currently the prediction for the macroblock segment ID is
+  // the value stored for this macroblock in the previous frame.
+#if CONFIG_SUPERBLOCKS
+  if (!xd->mode_info_context->mbmi.encoded_as_sb) {
+#endif
+    return cm->last_frame_seg_map[MbIndex];
+#if CONFIG_SUPERBLOCKS
+  } else {
+    int seg_id = cm->last_frame_seg_map[MbIndex];
+    int mb_col = MbIndex % cm->mb_cols;
+    int mb_row = MbIndex / cm->mb_cols;
+    if (mb_col + 1 < cm->mb_cols)
+      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + 1];
+    if (mb_row + 1 < cm->mb_rows) {
+      seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols];
+      if (mb_col + 1 < cm->mb_cols)
+        seg_id = seg_id && cm->last_frame_seg_map[MbIndex + cm->mb_cols + 1];
+    }
+    return seg_id;
+  }
+#endif
+}
+
+MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm,
+                                    const MACROBLOCKD *const xd) {
+  MODE_INFO *m = xd->mode_info_context;
+
+  MV_REFERENCE_FRAME left;
+  MV_REFERENCE_FRAME above;
+  MV_REFERENCE_FRAME above_left;
+  MV_REFERENCE_FRAME pred_ref = LAST_FRAME;
+
+  int segment_id = xd->mode_info_context->mbmi.segment_id;
+  int seg_ref_active;
+  int i;
+
+  unsigned char frame_allowed[MAX_REF_FRAMES] = {1, 1, 1, 1};
+  unsigned char ref_score[MAX_REF_FRAMES];
+  unsigned char best_score = 0;
+  unsigned char left_in_image;
+  unsigned char above_in_image;
+  unsigned char above_left_in_image;
+
+  // Is segment coding enabled
+  seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
+
+  // Special case treatment if segment coding is enabled.
+  // Don't allow prediction of a reference frame that the segment
+  // does not allow
+  if (seg_ref_active) {
+    for (i = 0; i < MAX_REF_FRAMES; i++) {
+      frame_allowed[i] =
+        vp9_check_segref(xd, segment_id, i);
+
+      // Score set to 0 if ref frame not allowed
+      ref_score[i] = cm->ref_scores[i] * frame_allowed[i];
+    }
+  } else
+    vpx_memcpy(ref_score, cm->ref_scores, sizeof(ref_score));
+
+  // Reference frames used by neighbours
+  left = (m - 1)->mbmi.ref_frame;
+  above = (m - cm->mode_info_stride)->mbmi.ref_frame;
+  above_left = (m - 1 - cm->mode_info_stride)->mbmi.ref_frame;
+
+  // Are neighbours in image
+  left_in_image = (m - 1)->mbmi.mb_in_image;
+  above_in_image = (m - cm->mode_info_stride)->mbmi.mb_in_image;
+  above_left_in_image = (m - 1 - cm->mode_info_stride)->mbmi.mb_in_image;
+
+  // Adjust scores for candidate reference frames based on neighbours
+  if (frame_allowed[left] && left_in_image) {
+    ref_score[left] += 16;
+    if (above_left_in_image && (left == above_left))
+      ref_score[left] += 4;
+  }
+  if (frame_allowed[above] && above_in_image) {
+    ref_score[above] += 16;
+    if (above_left_in_image && (above == above_left))
+      ref_score[above] += 4;
+  }
+
+  // Now choose the candidate with the highest score
+  for (i = 0; i < MAX_REF_FRAMES; i++) {
+    if (ref_score[i] > best_score) {
+      pred_ref = i;
+      best_score = ref_score[i];
+    }
+  }
+
+  return pred_ref;
+}
+
+// Computes a set of modified reference frame probabilities
+// to use when the prediction of the reference frame value fails
+void vp9_calc_ref_probs(int *count, vp9_prob *probs) {
+  int tot_count;
+
+  tot_count = count[0] + count[1] + count[2] + count[3];
+  if (tot_count) {
+    probs[0] = (vp9_prob)((count[0] * 255 + (tot_count >> 1)) / tot_count);
+    probs[0] += !probs[0];
+  } else
+    probs[0] = 128;
+
+  tot_count -= count[0];
+  if (tot_count) {
+    probs[1] = (vp9_prob)((count[1] * 255 + (tot_count >> 1)) / tot_count);
+    probs[1] += !probs[1];
+  } else
+    probs[1] = 128;
+
+  tot_count -= count[1];
+  if (tot_count) {
+    probs[2] = (vp9_prob)((count[2] * 255 + (tot_count >> 1)) / tot_count);
+    probs[2] += !probs[2];
+  } else
+    probs[2] = 128;
+}
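[Editor's aside, illustration only.] `vp9_calc_ref_probs()` above converts four counts into three branch probabilities, renormalising at each level of the implied reference-frame tree. A worked standalone example with made-up counts:

    #include <stdio.h>

    typedef unsigned char vp9_prob;

    /* Same arithmetic as vp9_calc_ref_probs() above. */
    static void calc_ref_probs(const int *count, vp9_prob *probs) {
      int tot = count[0] + count[1] + count[2] + count[3];
      probs[0] = tot ? (vp9_prob)((count[0] * 255 + (tot >> 1)) / tot) : 128;
      if (tot && !probs[0]) probs[0] = 1;
      tot -= count[0];
      probs[1] = tot ? (vp9_prob)((count[1] * 255 + (tot >> 1)) / tot) : 128;
      if (tot && !probs[1]) probs[1] = 1;
      tot -= count[1];
      probs[2] = tot ? (vp9_prob)((count[2] * 255 + (tot >> 1)) / tot) : 128;
      if (tot && !probs[2]) probs[2] = 1;
    }

    int main(void) {
      /* Made-up frequencies: intra, last, golden, altref. */
      int count[4] = { 40, 90, 60, 65 };
      vp9_prob probs[3];

      calc_ref_probs(count, probs);
      /* Prints 40 107 122: P(intra), then P(last | inter), then
       * P(golden | golden-or-altref), each rounded onto 0..255. */
      printf("%d %d %d\n", probs[0], probs[1], probs[2]);
      return 0;
    }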
+// Computes a set of modified conditional probabilities for the reference
+// frame. Values will be set to 0 for reference frame options that are not
+// possible because either they were predicted and prediction has failed or
+// because they are not allowed for a given segment.
+void vp9_compute_mod_refprobs(VP9_COMMON *const cm) {
+  int norm_cnt[MAX_REF_FRAMES];
+  int intra_count;
+  int inter_count;
+  int last_count;
+  int gfarf_count;
+  int gf_count;
+  int arf_count;
+
+  intra_count = cm->prob_intra_coded;
+  inter_count = (255 - intra_count);
+  last_count = (inter_count * cm->prob_last_coded) / 255;
+  gfarf_count = inter_count - last_count;
+  gf_count = (gfarf_count * cm->prob_gf_coded) / 255;
+  arf_count = gfarf_count - gf_count;
+
+  // Work out modified reference frame probabilities to use where prediction
+  // of the reference frame fails
+  norm_cnt[0] = 0;
+  norm_cnt[1] = last_count;
+  norm_cnt[2] = gf_count;
+  norm_cnt[3] = arf_count;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[INTRA_FRAME]);
+  cm->mod_refprobs[INTRA_FRAME][0] = 0;    // This branch implicit
+
+  norm_cnt[0] = intra_count;
+  norm_cnt[1] = 0;
+  norm_cnt[2] = gf_count;
+  norm_cnt[3] = arf_count;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[LAST_FRAME]);
+  cm->mod_refprobs[LAST_FRAME][1] = 0;     // This branch implicit
+
+  norm_cnt[0] = intra_count;
+  norm_cnt[1] = last_count;
+  norm_cnt[2] = 0;
+  norm_cnt[3] = arf_count;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[GOLDEN_FRAME]);
+  cm->mod_refprobs[GOLDEN_FRAME][2] = 0;   // This branch implicit
+
+  norm_cnt[0] = intra_count;
+  norm_cnt[1] = last_count;
+  norm_cnt[2] = gf_count;
+  norm_cnt[3] = 0;
+  vp9_calc_ref_probs(norm_cnt, cm->mod_refprobs[ALTREF_FRAME]);
+  cm->mod_refprobs[ALTREF_FRAME][2] = 0;   // This branch implicit
+
+  // Score the reference frames based on overall frequency.
+  // These scores contribute to the prediction choices.
+  // Max score 17 min 1
+  cm->ref_scores[INTRA_FRAME] = 1 + (intra_count * 16 / 255);
+  cm->ref_scores[LAST_FRAME] = 1 + (last_count * 16 / 255);
+  cm->ref_scores[GOLDEN_FRAME] = 1 + (gf_count * 16 / 255);
+  cm->ref_scores[ALTREF_FRAME] = 1 + (arf_count * 16 / 255);
+}
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
new file mode 100644
index 000000000..2628bb3ca
--- /dev/null
+++ b/vp9/common/vp9_pred_common.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "vp9_type_aliases.h" +#include "vp9_onyxc_int.h" +#include "vp9/common/vp9_blockd.h" + +#ifndef __INC_PRED_COMMON_H__ +#define __INC_PRED_COMMON_H__ 1 + + +// Predicted items +typedef enum { + PRED_SEG_ID = 0, // Segment identifier + PRED_REF = 1, + PRED_COMP = 2, + PRED_MBSKIP = 3, + PRED_SWITCHABLE_INTERP = 4 +} PRED_ID; + +extern unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + PRED_ID pred_id); + +extern vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + PRED_ID pred_id); + +extern const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + PRED_ID pred_id); + +extern unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, + PRED_ID pred_id); + +extern void vp9_set_pred_flag(MACROBLOCKD *const xd, + PRED_ID pred_id, + unsigned char pred_flag); + + +extern unsigned char vp9_get_pred_mb_segid(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + int MbIndex); + +extern MV_REFERENCE_FRAME vp9_get_pred_ref(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd); +extern void vp9_compute_mod_refprobs(VP9_COMMON *const cm); + +#endif /* __INC_PRED_COMMON_H__ */ diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c new file mode 100644 index 000000000..5d1f2594a --- /dev/null +++ b/vp9/common/vp9_quant_common.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include "vp9_quant_common.h"
+
+static int dc_qlookup[QINDEX_RANGE];
+static int ac_qlookup[QINDEX_RANGE];
+
+#define ACDC_MIN 4
+
+void vp9_init_quant_tables(void) {
+  int i;
+  int current_val = 4;
+  int last_val = 4;
+  int ac_val;
+
+  for (i = 0; i < QINDEX_RANGE; i++) {
+    ac_qlookup[i] = current_val;
+    current_val = (int)((double)current_val * 1.02);
+    if (current_val == last_val)
+      current_val++;
+    last_val = current_val;
+
+    ac_val = ac_qlookup[i];
+    dc_qlookup[i] = (int)((0.000000305 * ac_val * ac_val * ac_val) +
+                          (-0.00065 * ac_val * ac_val) +
+                          (0.9 * ac_val) + 0.5);
+    if (dc_qlookup[i] < ACDC_MIN)
+      dc_qlookup[i] = ACDC_MIN;
+  }
+}
+
+int vp9_dc_quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = dc_qlookup[ QIndex ];
+  return retval;
+}
+
+int vp9_dc2quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = dc_qlookup[ QIndex ];
+  return retval;
+}
+
+int vp9_dc_uv_quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = dc_qlookup[ QIndex ];
+  return retval;
+}
+
+int vp9_ac_yquant(int QIndex) {
+  int retval;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = ac_qlookup[ QIndex ];
+  return retval;
+}
+
+int vp9_ac2quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = (ac_qlookup[ QIndex ] * 775) / 1000;
+  if (retval < 4)
+    retval = 4;
+
+  return retval;
+}
+
+int vp9_ac_uv_quant(int QIndex, int Delta) {
+  int retval;
+
+  QIndex = QIndex + Delta;
+
+  if (QIndex > MAXQ)
+    QIndex = MAXQ;
+  else if (QIndex < 0)
+    QIndex = 0;
+
+  retval = ac_qlookup[ QIndex ];
+  return retval;
+}
diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h
new file mode 100644
index 000000000..88588d5d2
--- /dev/null
+++ b/vp9/common/vp9_quant_common.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "string.h"
+#include "vp9_blockd.h"
+#include "vp9_onyxc_int.h"
+
+extern void vp9_init_quant_tables(void);
+extern int vp9_ac_yquant(int QIndex);
+extern int vp9_dc_quant(int QIndex, int Delta);
+extern int vp9_dc2quant(int QIndex, int Delta);
+extern int vp9_ac2quant(int QIndex, int Delta);
+extern int vp9_dc_uv_quant(int QIndex, int Delta);
+extern int vp9_ac_uv_quant(int QIndex, int Delta);
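[Editor's aside, illustration only.] The AC table above grows by roughly 2% per quantizer step, with a +1 bump wherever integer truncation would stall the sequence, and the DC table is a clamped cubic fit of the AC value. A standalone sketch of the AC recurrence; `QINDEX_RANGE = 256` is an assumption here, standing in for the codec's `MAXQ + 1`:

    #include <stdio.h>

    #define QINDEX_RANGE 256  /* assumption: matches MAXQ + 1 elsewhere */

    int main(void) {
      int ac[QINDEX_RANGE];
      int current_val = 4, last_val = 4, i;

      /* Same recurrence as vp9_init_quant_tables(): ~2% growth per
       * step, bumped by one whenever truncation would repeat a value. */
      for (i = 0; i < QINDEX_RANGE; i++) {
        ac[i] = current_val;
        current_val = (int)((double)current_val * 1.02);
        if (current_val == last_val)
          current_val++;
        last_val = current_val;
      }
      printf("ac[0]=%d ac[63]=%d ac[127]=%d ac[255]=%d\n",
             ac[0], ac[63], ac[127], ac[255]);
      return 0;
    }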
diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
new file mode 100644
index 000000000..9c9029f2a
--- /dev/null
+++ b/vp9/common/vp9_recon.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp9_rtcd.h"
+#include "vp9_blockd.h"
+
+void vp9_recon_b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 16;
+    pred_ptr += 16;
+  }
+}
+
+void vp9_recon_uv_b_c(unsigned char *pred_ptr,
+                      short *diff_ptr,
+                      unsigned char *dst_ptr,
+                      int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 8;
+    pred_ptr += 8;
+  }
+}
+
+void vp9_recon4b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 16; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 16;
+    pred_ptr += 16;
+  }
+}
+
+void vp9_recon2b_c(unsigned char *pred_ptr,
+                   short *diff_ptr,
+                   unsigned char *dst_ptr,
+                   int stride) {
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 8; c++) {
+      int a = diff_ptr[c] + pred_ptr[c];
+
+      if (a < 0)
+        a = 0;
+
+      if (a > 255)
+        a = 255;
+
+      dst_ptr[c] = (unsigned char) a;
+    }
+
+    dst_ptr += stride;
+    diff_ptr += 8;
+    pred_ptr += 8;
+  }
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_recon_mby_s_c(MACROBLOCKD *xd, uint8_t *dst) {
+  int x, y;
+  BLOCKD *b = &xd->block[0];
+  int stride = b->dst_stride;
+  short *diff = b->diff;
+
+  for (y = 0; y < 16; y++) {
+    for (x = 0; x < 16; x++) {
+      int a = dst[x] + diff[x];
+      if (a < 0)
+        a = 0;
+      else if (a > 255)
+        a = 255;
+      dst[x] = a;
+    }
+    dst += stride;
+    diff += 16;
+  }
+}
+
+void vp9_recon_mbuv_s_c(MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst) {
+  int x, y, i;
+  uint8_t *dst = udst;
+
+  for (i = 0; i < 2; i++, dst = vdst) {
+    BLOCKD *b = &xd->block[16 + 4 * i];
+    int stride = b->dst_stride;
+    short *diff = b->diff;
+
+    for (y = 0; y < 8; y++) {
+      for (x = 0; x < 8; x++) {
+        int a = dst[x] + diff[x];
+        if (a < 0)
+          a = 0;
+        else if (a > 255)
+          a = 255;
+        dst[x] = a;
+      }
+      dst += stride;
+      diff += 8;
+    }
+  }
+}
+#endif
+
+void vp9_recon_mby_c(MACROBLOCKD *xd) {
+  int i;
+
+  for (i = 0; i < 16; i += 4) {
+    BLOCKD *b = &xd->block[i];
+
+    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+}
+
+void vp9_recon_mb_c(MACROBLOCKD *xd) {
+  int i;
+
+  for (i = 0; i < 16; i += 4) {
+    BLOCKD *b = &xd->block[i];
+
+    vp9_recon4b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+
+  for (i = 16; i < 24; i += 2) {
+    BLOCKD *b = &xd->block[i];
+
+    vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+  }
+}
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
new file mode 100644
index 000000000..eac07ae19
--- /dev/null
+++ b/vp9/common/vp9_reconinter.c
@@ -0,0 +1,1140 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9_blockd.h"
+#include "vp9_reconinter.h"
+#include "vp9/common/vp9_reconintra.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+#include "vp9_onyxc_int.h"
+#endif
+
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                              INTERPOLATIONFILTERTYPE mcomp_filter_type,
+                              VP9_COMMON *cm) {
+  if (mcomp_filter_type == SIXTAP) {
+    xd->subpixel_predict        = vp9_sixtap_predict;
+    xd->subpixel_predict8x4     = vp9_sixtap_predict8x4;
+    xd->subpixel_predict8x8     = vp9_sixtap_predict8x8;
+    xd->subpixel_predict16x16   = vp9_sixtap_predict16x16;
+    xd->subpixel_predict_avg    = vp9_sixtap_predict_avg;
+    xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;
+    xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;
+  } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
+    xd->subpixel_predict        = vp9_eighttap_predict;
+    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4;
+    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8;
+    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16;
+    xd->subpixel_predict_avg    = vp9_eighttap_predict_avg4x4;
+    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;
+    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;
+  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
+    xd->subpixel_predict        = vp9_eighttap_predict_sharp;
+    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_sharp;
+    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_sharp;
+    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_sharp;
+    xd->subpixel_predict_avg    = vp9_eighttap_predict_avg4x4_sharp;
+    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;
+    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c;
+  } else {
+    xd->subpixel_predict        = vp9_bilinear_predict4x4;
+    xd->subpixel_predict8x4     = vp9_bilinear_predict8x4;
+    xd->subpixel_predict8x8     = vp9_bilinear_predict8x8;
+    xd->subpixel_predict16x16   = vp9_bilinear_predict16x16;
+    xd->subpixel_predict_avg    = vp9_bilinear_predict_avg4x4;
+    xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;
+    xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;
+  }
+}
+
+void vp9_copy_mem16x16_c(unsigned char *src,
+                         int src_stride,
+                         unsigned char *dst,
+                         int dst_stride) {
+  int r;
+
+  for (r = 0; r < 16; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+    dst[4] = src[4];
+    dst[5] = src[5];
+    dst[6] = src[6];
+    dst[7] = src[7];
+    dst[8] = src[8];
+    dst[9] = src[9];
+    dst[10] = src[10];
+    dst[11] = src[11];
+    dst[12] = src[12];
+    dst[13] = src[13];
+    dst[14] = src[14];
+    dst[15] = src[15];
+#else
+    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
+    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
+#endif
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_avg_mem16x16_c(unsigned char *src,
+                        int src_stride,
+                        unsigned char *dst,
+                        int dst_stride) {
+  int r;
+
+  for (r = 0; r < 16; r++) {
+    int n;
+
+    for (n = 0; n < 16; n++) {
+      dst[n] = (dst[n] + src[n] + 1) >> 1;
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
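[Editor's aside, illustration only.] The `CONFIG_FAST_UNALIGNED` path above reads and writes four 32-bit words per row through casted pointers, which assumes the platform tolerates unaligned 32-bit accesses. A standalone stand-in that gets the same wide copies without the aliasing and alignment caveats:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Safe equivalent of the fast path: memcpy lets the compiler emit
     * wide loads without casting unsigned char* to uint32_t*. */
    static void copy_row_16(const unsigned char *src, unsigned char *dst) {
      uint32_t w[4];
      memcpy(w, src, 16);   /* four 32-bit words, as in the fast path */
      memcpy(dst, w, 16);
    }

    int main(void) {
      unsigned char src[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
      unsigned char dst[16] = {0};
      copy_row_16(src, dst);
      printf("dst[15]=%d\n", dst[15]);
      return 0;
    }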
+void vp9_copy_mem8x8_c(unsigned char *src,
+                       int src_stride,
+                       unsigned char *dst,
+                       int dst_stride) {
+  int r;
+
+  for (r = 0; r < 8; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+    dst[4] = src[4];
+    dst[5] = src[5];
+    dst[6] = src[6];
+    dst[7] = src[7];
+#else
+    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+#endif
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_avg_mem8x8_c(unsigned char *src,
+                      int src_stride,
+                      unsigned char *dst,
+                      int dst_stride) {
+  int r;
+
+  for (r = 0; r < 8; r++) {
+    int n;
+
+    for (n = 0; n < 8; n++) {
+      dst[n] = (dst[n] + src[n] + 1) >> 1;
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_copy_mem8x4_c(unsigned char *src,
+                       int src_stride,
+                       unsigned char *dst,
+                       int dst_stride) {
+  int r;
+
+  for (r = 0; r < 4; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+    dst[3] = src[3];
+    dst[4] = src[4];
+    dst[5] = src[5];
+    dst[6] = src[6];
+    dst[7] = src[7];
+#else
+    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
+    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+#endif
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
+  int r;
+  unsigned char *ptr_base;
+  unsigned char *ptr;
+  unsigned char *pred_ptr = d->predictor;
+  int_mv mv;
+
+  ptr_base = *(d->base_pre);
+  mv.as_int = d->bmi.as_mv.first.as_int;
+
+  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
+    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+          (mv.as_mv.col >> 3);
+    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
+         pred_ptr, pitch);
+  } else {
+    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+                (mv.as_mv.col >> 3);
+    ptr = ptr_base;
+
+    for (r = 0; r < 4; r++) {
+#if !(CONFIG_FAST_UNALIGNED)
+      pred_ptr[0] = ptr[0];
+      pred_ptr[1] = ptr[1];
+      pred_ptr[2] = ptr[2];
+      pred_ptr[3] = ptr[3];
+#else
+      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
+#endif
+      pred_ptr += pitch;
+      ptr += d->pre_stride;
+    }
+  }
+}
+
+/*
+ * Similar to vp9_build_inter_predictors_b(), but instead of storing the
+ * results in d->predictor, we average the contents of d->predictor (which
+ * come from an earlier call to vp9_build_inter_predictors_b()) with the
+ * predictor of the second reference frame / motion vector.
+ */ +void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, + vp9_subpix_fn_t sppf) { + int r; + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d->predictor; + int_mv mv; + + ptr_base = *(d->base_second_pre); + mv.as_int = d->bmi.as_mv.second.as_int; + + if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { + ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + + (mv.as_mv.col >> 3); + sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1, + pred_ptr, pitch); + } else { + ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride + + (mv.as_mv.col >> 3); + ptr = ptr_base; + + for (r = 0; r < 4; r++) { + pred_ptr[0] = (pred_ptr[0] + ptr[0] + 1) >> 1; + pred_ptr[1] = (pred_ptr[1] + ptr[1] + 1) >> 1; + pred_ptr[2] = (pred_ptr[2] + ptr[2] + 1) >> 1; + pred_ptr[3] = (pred_ptr[3] + ptr[3] + 1) >> 1; + pred_ptr += pitch; + ptr += d->pre_stride; + } + } +} + +void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d->predictor; + int_mv mv; + + ptr_base = *(d->base_pre); + mv.as_int = d->bmi.as_mv.first.as_int; + ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + + (mv.as_mv.col >> 3); + + if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { + xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, + (mv.as_mv.row & 7) << 1, pred_ptr, pitch); + } else { + vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); + } +} + +/* + * Similar to build_inter_predictors_4b(), but instead of storing the + * results in d->predictor, we average the contents of d->predictor (which + * come from an earlier call to build_inter_predictors_4b()) with the + * predictor of the second reference frame / motion vector. 
+ */ +void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd, + BLOCKD *d, int pitch) { + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d->predictor; + int_mv mv; + + ptr_base = *(d->base_second_pre); + mv.as_int = d->bmi.as_mv.second.as_int; + ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + + (mv.as_mv.col >> 3); + + if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { + xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, + (mv.as_mv.row & 7) << 1, pred_ptr, pitch); + } else { + vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); + } +} + +static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = d->predictor; + int_mv mv; + + ptr_base = *(d->base_pre); + mv.as_int = d->bmi.as_mv.first.as_int; + ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + + (mv.as_mv.col >> 3); + + if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { + xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, + (mv.as_mv.row & 7) << 1, pred_ptr, pitch); + } else { + vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch); + } +} + + +/*encoder only*/ +#if CONFIG_PRED_FILTER + +// Select the thresholded or non-thresholded filter +#define USE_THRESH_FILTER 0 + +#define PRED_FILT_LEN 5 + +static const int filt_shift = 4; +static const int pred_filter[PRED_FILT_LEN] = {1, 2, 10, 2, 1}; +// Alternative filter {1, 1, 4, 1, 1} + +#if !USE_THRESH_FILTER +void filter_mb(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, + int width, int height) { + int i, j, k; + unsigned int Temp[32 * 32]; + unsigned int *pTmp = Temp; + unsigned char *pSrc = src - (1 + src_stride) * (PRED_FILT_LEN / 2); + + // Horizontal + for (i = 0; i < height + PRED_FILT_LEN - 1; i++) { + for (j = 0; j < width; j++) { + int sum = 0; + for (k = 0; k < PRED_FILT_LEN; k++) + sum += pSrc[j + k] * pred_filter[k]; + pTmp[j] = sum; + } + + pSrc += src_stride; + pTmp += width; + } + + // Vertical + pTmp = Temp; + for (i = 0; i < width; i++) { + unsigned char *pDst = dst + i; + for (j = 0; j < height; j++) { + int sum = 0; + for (k = 0; k < PRED_FILT_LEN; k++) + sum += pTmp[(j + k) * width] * pred_filter[k]; + // Round + sum = (sum + ((1 << (filt_shift << 1)) >> 1)) >> (filt_shift << 1); + pDst[j * dst_stride] = (sum < 0 ? 0 : sum > 255 ? 
255 : sum); + } + ++pTmp; + } +} +#else +// Based on vp9_post_proc_down_and_across_c (vp9_postproc.c) +void filter_mb(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride, + int width, int height) { + unsigned char *pSrc, *pDst; + int row; + int col; + int i; + int v; + unsigned char d[8]; + + /* TODO flimit should be linked to the quantizer value */ + int flimit = 7; + + for (row = 0; row < height; row++) { + /* post_proc_down for one row */ + pSrc = src; + pDst = dst; + + for (col = 0; col < width; col++) { + int kernel = (1 << (filt_shift - 1)); + int v = pSrc[col]; + + for (i = -2; i <= 2; i++) { + if (abs(v - pSrc[col + i * src_stride]) > flimit) + goto down_skip_convolve; + + kernel += pred_filter[2 + i] * pSrc[col + i * src_stride]; + } + + v = (kernel >> filt_shift); + down_skip_convolve: + pDst[col] = v; + } + + /* now post_proc_across */ + pSrc = dst; + pDst = dst; + + for (i = 0; i < 8; i++) + d[i] = pSrc[i]; + + for (col = 0; col < width; col++) { + int kernel = (1 << (filt_shift - 1)); + v = pSrc[col]; + + d[col & 7] = v; + + for (i = -2; i <= 2; i++) { + if (abs(v - pSrc[col + i]) > flimit) + goto across_skip_convolve; + + kernel += pred_filter[2 + i] * pSrc[col + i]; + } + + d[col & 7] = (kernel >> filt_shift); + across_skip_convolve: + + if (col >= 2) + pDst[col - 2] = d[(col - 2) & 7]; + } + + /* handle the last two pixels */ + pDst[col - 2] = d[(col - 2) & 7]; + pDst[col - 1] = d[(col - 1) & 7]; + + /* next row */ + src += src_stride; + dst += dst_stride; + } +} +#endif // !USE_THRESH_FILTER + +#endif // CONFIG_PRED_FILTER + +/*encoder only*/ +void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) { + int i, j; + BLOCKD *blockd = xd->block; + + /* build uv mvs */ + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + int yoffset = i * 8 + j * 2; + int uoffset = 16 + i * 2 + j; + int voffset = 20 + i * 2 + j; + int temp; + + temp = blockd[yoffset ].bmi.as_mv.first.as_mv.row + + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row + + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row + + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row; + + if (temp < 0) temp -= 4; + else temp += 4; + + xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) & + xd->fullpixel_mask; + + temp = blockd[yoffset ].bmi.as_mv.first.as_mv.col + + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col + + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col + + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col; + + if (temp < 0) temp -= 4; + else temp += 4; + + blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) & + xd->fullpixel_mask; + + blockd[voffset].bmi.as_mv.first.as_mv.row = + blockd[uoffset].bmi.as_mv.first.as_mv.row; + blockd[voffset].bmi.as_mv.first.as_mv.col = + blockd[uoffset].bmi.as_mv.first.as_mv.col; + + if (xd->mode_info_context->mbmi.second_ref_frame > 0) { + temp = blockd[yoffset ].bmi.as_mv.second.as_mv.row + + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row + + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row + + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row; + + if (temp < 0) { + temp -= 4; + } else { + temp += 4; + } + + blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) & + xd->fullpixel_mask; + + temp = blockd[yoffset ].bmi.as_mv.second.as_mv.col + + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col + + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col + + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col; + + if (temp < 0) { + temp -= 4; + } else { + temp += 4; + } + + blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) & + xd->fullpixel_mask; + + 
blockd[voffset].bmi.as_mv.second.as_mv.row = + blockd[uoffset].bmi.as_mv.second.as_mv.row; + blockd[voffset].bmi.as_mv.second.as_mv.col = + blockd[uoffset].bmi.as_mv.second.as_mv.col; + } + } + } + + for (i = 16; i < 24; i += 2) { + BLOCKD *d0 = &blockd[i]; + BLOCKD *d1 = &blockd[i + 1]; + + if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) + build_inter_predictors2b(xd, d0, 8); + else { + vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict); + vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict); + } + + if (xd->mode_info_context->mbmi.second_ref_frame > 0) { + vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg); + vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg); + } + } +} + +static void clamp_mv_to_umv_border(MV *mv, const MACROBLOCKD *xd) { + /* If the MV points so far into the UMV border that no visible pixels + * are used for reconstruction, the subpel part of the MV can be + * discarded and the MV limited to 16 pixels with equivalent results. + * + * This limit kicks in at 19 pixels for the top and left edges, for + * the 16 pixels plus 3 taps right of the central pixel when subpel + * filtering. The bottom and right edges use 16 pixels plus 2 pixels + * left of the central pixel when filtering. + */ + if (mv->col < (xd->mb_to_left_edge - ((16 + VP9_INTERP_EXTEND) << 3))) + mv->col = xd->mb_to_left_edge - (16 << 3); + else if (mv->col > xd->mb_to_right_edge + ((15 + VP9_INTERP_EXTEND) << 3)) + mv->col = xd->mb_to_right_edge + (16 << 3); + + if (mv->row < (xd->mb_to_top_edge - ((16 + VP9_INTERP_EXTEND) << 3))) + mv->row = xd->mb_to_top_edge - (16 << 3); + else if (mv->row > xd->mb_to_bottom_edge + ((15 + VP9_INTERP_EXTEND) << 3)) + mv->row = xd->mb_to_bottom_edge + (16 << 3); +} + +/* A version of the above function for chroma block MVs.*/ +static void clamp_uvmv_to_umv_border(MV *mv, const MACROBLOCKD *xd) { + const int extend = VP9_INTERP_EXTEND; + + mv->col = (2 * mv->col < (xd->mb_to_left_edge - ((16 + extend) << 3))) ? + (xd->mb_to_left_edge - (16 << 3)) >> 1 : mv->col; + mv->col = (2 * mv->col > xd->mb_to_right_edge + ((15 + extend) << 3)) ? + (xd->mb_to_right_edge + (16 << 3)) >> 1 : mv->col; + + mv->row = (2 * mv->row < (xd->mb_to_top_edge - ((16 + extend) << 3))) ? + (xd->mb_to_top_edge - (16 << 3)) >> 1 : mv->row; + mv->row = (2 * mv->row > xd->mb_to_bottom_edge + ((15 + extend) << 3)) ? 
+ (xd->mb_to_bottom_edge + (16 << 3)) >> 1 : mv->row; +} + +/*encoder only*/ +void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, + unsigned char *dst_y, + int dst_ystride, + int clamp_mvs) { + unsigned char *ptr_base = xd->pre.y_buffer; + unsigned char *ptr; + int pre_stride = xd->block[0].pre_stride; + int_mv ymv; + + ymv.as_int = xd->mode_info_context->mbmi.mv[0].as_int; + + if (clamp_mvs) + clamp_mv_to_umv_border(&ymv.as_mv, xd); + + ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3); + +#if CONFIG_PRED_FILTER + if (xd->mode_info_context->mbmi.pred_filter_enabled) { + if ((ymv.as_mv.row | ymv.as_mv.col) & 7) { + // Sub-pel filter needs extended input + int len = 15 + (VP9_INTERP_EXTEND << 1); + unsigned char Temp[32 * 32]; // Data required by sub-pel filter + unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1); + + // Copy extended MB into Temp array, applying the spatial filter + filter_mb(ptr - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride, + Temp, len, len, len); + + // Sub-pel interpolation + xd->subpixel_predict16x16(pTemp, len, + (ymv.as_mv.col & 7) << 1, + (ymv.as_mv.row & 7) << 1, + dst_y, dst_ystride); + } else { + // Apply spatial filter to create the prediction directly + filter_mb(ptr, pre_stride, dst_y, dst_ystride, 16, 16); + } + } else +#endif + if ((ymv.as_mv.row | ymv.as_mv.col) & 7) { + xd->subpixel_predict16x16(ptr, pre_stride, + (ymv.as_mv.col & 7) << 1, + (ymv.as_mv.row & 7) << 1, + dst_y, dst_ystride); + } else { + vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride); + } +} + +void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_uvstride) { + int offset; + unsigned char *uptr, *vptr; + int pre_stride = xd->block[0].pre_stride; + int_mv _o16x16mv; + int_mv _16x16mv; + + _16x16mv.as_int = xd->mode_info_context->mbmi.mv[0].as_int; + + if (xd->mode_info_context->mbmi.need_to_clamp_mvs) + clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); + + _o16x16mv = _16x16mv; + /* calc uv motion vectors */ + if (_16x16mv.as_mv.row < 0) + _16x16mv.as_mv.row -= 1; + else + _16x16mv.as_mv.row += 1; + + if (_16x16mv.as_mv.col < 0) + _16x16mv.as_mv.col -= 1; + else + _16x16mv.as_mv.col += 1; + + _16x16mv.as_mv.row /= 2; + _16x16mv.as_mv.col /= 2; + + _16x16mv.as_mv.row &= xd->fullpixel_mask; + _16x16mv.as_mv.col &= xd->fullpixel_mask; + + pre_stride >>= 1; + offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); + uptr = xd->pre.u_buffer + offset; + vptr = xd->pre.v_buffer + offset; + +#if CONFIG_PRED_FILTER + if (xd->mode_info_context->mbmi.pred_filter_enabled) { + int i; + unsigned char *pSrc = uptr; + unsigned char *pDst = dst_u; + int len = 7 + (VP9_INTERP_EXTEND << 1); + unsigned char Temp[32 * 32]; // Data required by the sub-pel filter + unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1); + + // U & V + for (i = 0; i < 2; i++) { + if (_o16x16mv.as_int & 0x000f000f) { + // Copy extended MB into Temp array, applying the spatial filter + filter_mb(pSrc - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride, + Temp, len, len, len); + + // Sub-pel filter + xd->subpixel_predict8x8(pTemp, len, + _o16x16mv.as_mv.col & 15, + _o16x16mv.as_mv.row & 15, + pDst, dst_uvstride); + } else { + filter_mb(pSrc, pre_stride, pDst, dst_uvstride, 8, 8); + } + + // V + pSrc = vptr; + pDst = dst_v; + } + } else +#endif + if (_o16x16mv.as_int & 0x000f000f) { + xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15, + 
_o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
+    xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
+                            _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
+  } else {
+    vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
+    vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
+  }
+}
+
+
+void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd,
+                                            unsigned char *dst_y,
+                                            unsigned char *dst_u,
+                                            unsigned char *dst_v,
+                                            int dst_ystride, int dst_uvstride) {
+  vp9_build_1st_inter16x16_predictors_mby(xd, dst_y, dst_ystride,
+      xd->mode_info_context->mbmi.need_to_clamp_mvs);
+  vp9_build_1st_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride);
+}
+
+#if CONFIG_SUPERBLOCKS
+void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+                                        unsigned char *dst_y,
+                                        unsigned char *dst_u,
+                                        unsigned char *dst_v,
+                                        int dst_ystride,
+                                        int dst_uvstride) {
+  uint8_t *y1 = x->pre.y_buffer, *u1 = x->pre.u_buffer, *v1 = x->pre.v_buffer;
+  uint8_t *y2 = x->second_pre.y_buffer, *u2 = x->second_pre.u_buffer,
+          *v2 = x->second_pre.v_buffer;
+  int edge[4], n;
+
+  edge[0] = x->mb_to_top_edge;
+  edge[1] = x->mb_to_bottom_edge;
+  edge[2] = x->mb_to_left_edge;
+  edge[3] = x->mb_to_right_edge;
+
+  for (n = 0; n < 4; n++) {
+    const int x_idx = n & 1, y_idx = n >> 1;
+
+    x->mb_to_top_edge = edge[0] - ((y_idx * 16) << 3);
+    x->mb_to_bottom_edge = edge[1] + (((1 - y_idx) * 16) << 3);
+    x->mb_to_left_edge = edge[2] - ((x_idx * 16) << 3);
+    x->mb_to_right_edge = edge[3] + (((1 - x_idx) * 16) << 3);
+
+    x->pre.y_buffer = y1 + y_idx * 16 * x->pre.y_stride + x_idx * 16;
+    x->pre.u_buffer = u1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
+    x->pre.v_buffer = v1 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
+
+    vp9_build_1st_inter16x16_predictors_mb(x,
+        dst_y + y_idx * 16 * dst_ystride + x_idx * 16,
+        dst_u + y_idx * 8 * dst_uvstride + x_idx * 8,
+        dst_v + y_idx * 8 * dst_uvstride + x_idx * 8,
+        dst_ystride, dst_uvstride);
+    if (x->mode_info_context->mbmi.second_ref_frame > 0) {
+      x->second_pre.y_buffer = y2 + y_idx * 16 * x->pre.y_stride + x_idx * 16;
+      x->second_pre.u_buffer = u2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
+      x->second_pre.v_buffer = v2 + y_idx * 8 * x->pre.uv_stride + x_idx * 8;
+
+      vp9_build_2nd_inter16x16_predictors_mb(x,
+          dst_y + y_idx * 16 * dst_ystride + x_idx * 16,
+          dst_u + y_idx * 8 * dst_uvstride + x_idx * 8,
+          dst_v + y_idx * 8 * dst_uvstride + x_idx * 8,
+          dst_ystride, dst_uvstride);
+    }
+  }
+
+  x->mb_to_top_edge = edge[0];
+  x->mb_to_bottom_edge = edge[1];
+  x->mb_to_left_edge = edge[2];
+  x->mb_to_right_edge = edge[3];
+
+  x->pre.y_buffer = y1;
+  x->pre.u_buffer = u1;
+  x->pre.v_buffer = v1;
+
+  if (x->mode_info_context->mbmi.second_ref_frame > 0) {
+    x->second_pre.y_buffer = y2;
+    x->second_pre.u_buffer = u2;
+    x->second_pre.v_buffer = v2;
+  }
+}
+#endif
+
+/*
+ * The following functions should be called after an initial
+ * call to vp9_build_1st_inter16x16_predictors_mb() or _mby()/_mbuv().
+ * They run a second six-tap filter on a (different) reference
+ * frame and average the result with the output of the
+ * first six-tap filter. The second reference frame is stored
+ * in x->second_pre (the reference frame index is in
+ * x->mode_info_context->mbmi.second_ref_frame). The second
+ * motion vector is x->mode_info_context->mbmi.second_mv.
+ *
+ * This allows blending prediction from two reference frames,
+ * which sometimes leads to better prediction than from a
+ * single reference frame.
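+ *
+ * As a sketch, this is the call sequence used by
+ * vp9_build_inter_predictors_mb() below for a compound-predicted MB:
+ *
+ *   vp9_build_1st_inter16x16_predictors_mb(xd, dst_y, dst_u, dst_v,
+ *                                          dst_ystride, dst_uvstride);
+ *   if (xd->mode_info_context->mbmi.second_ref_frame > 0)
+ *     vp9_build_2nd_inter16x16_predictors_mb(xd, dst_y, dst_u, dst_v,
+ *                                            dst_ystride, dst_uvstride);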
+ */ +void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd, + unsigned char *dst_y, + int dst_ystride) { + unsigned char *ptr; + + int_mv _16x16mv; + int mv_row; + int mv_col; + + unsigned char *ptr_base = xd->second_pre.y_buffer; + int pre_stride = xd->block[0].pre_stride; + + _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int; + + if (xd->mode_info_context->mbmi.need_to_clamp_secondmv) + clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); + + mv_row = _16x16mv.as_mv.row; + mv_col = _16x16mv.as_mv.col; + + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + +#if CONFIG_PRED_FILTER + if (xd->mode_info_context->mbmi.pred_filter_enabled) { + if ((mv_row | mv_col) & 7) { + // Sub-pel filter needs extended input + int len = 15 + (VP9_INTERP_EXTEND << 1); + unsigned char Temp[32 * 32]; // Data required by sub-pel filter + unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1); + + // Copy extended MB into Temp array, applying the spatial filter + filter_mb(ptr - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride, + Temp, len, len, len); + + // Sub-pel filter + xd->subpixel_predict_avg16x16(pTemp, len, (mv_col & 7) << 1, + (mv_row & 7) << 1, dst_y, dst_ystride); + } else { + // TODO Needs to AVERAGE with the dst_y + // For now, do not apply the prediction filter in these cases! + vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride); + } + } else +#endif // CONFIG_PRED_FILTER + { + if ((mv_row | mv_col) & 7) { + xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1, + (mv_row & 7) << 1, dst_y, dst_ystride); + } else { + vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride); + } + } +} + +void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_uvstride) { + int offset; + unsigned char *uptr, *vptr; + + int_mv _16x16mv; + int mv_row; + int mv_col; + int omv_row, omv_col; + + int pre_stride = xd->block[0].pre_stride; + + _16x16mv.as_int = xd->mode_info_context->mbmi.mv[1].as_int; + + if (xd->mode_info_context->mbmi.need_to_clamp_secondmv) + clamp_mv_to_umv_border(&_16x16mv.as_mv, xd); + + mv_row = _16x16mv.as_mv.row; + mv_col = _16x16mv.as_mv.col; + + /* calc uv motion vectors */ + omv_row = mv_row; + omv_col = mv_col; + mv_row = (mv_row + (mv_row > 0)) >> 1; + mv_col = (mv_col + (mv_col > 0)) >> 1; + + mv_row &= xd->fullpixel_mask; + mv_col &= xd->fullpixel_mask; + + pre_stride >>= 1; + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = xd->second_pre.u_buffer + offset; + vptr = xd->second_pre.v_buffer + offset; + +#if CONFIG_PRED_FILTER + if (xd->mode_info_context->mbmi.pred_filter_enabled) { + int i; + int len = 7 + (VP9_INTERP_EXTEND << 1); + unsigned char Temp[32 * 32]; // Data required by sub-pel filter + unsigned char *pTemp = Temp + (VP9_INTERP_EXTEND - 1) * (len + 1); + unsigned char *pSrc = uptr; + unsigned char *pDst = dst_u; + + // U & V + for (i = 0; i < 2; i++) { + if ((omv_row | omv_col) & 15) { + // Copy extended MB into Temp array, applying the spatial filter + filter_mb(pSrc - (VP9_INTERP_EXTEND - 1) * (pre_stride + 1), pre_stride, + Temp, len, len, len); + + // Sub-pel filter + xd->subpixel_predict_avg8x8(pTemp, len, omv_col & 15, + omv_row & 15, pDst, dst_uvstride); + } else { + // TODO Needs to AVERAGE with the dst_[u|v] + // For now, do not apply the prediction filter here! 
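+        // vp9_avg_mem8x8() below still blends the full-pel reference block
+        // into the existing prediction with a rounding average (the C
+        // reference computes dst[i] = (dst[i] + src[i] + 1) >> 1), so only
+        // the spatial prediction filter step is skipped.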
+ vp9_avg_mem8x8(pSrc, pre_stride, pDst, dst_uvstride); + } + + // V + pSrc = vptr; + pDst = dst_v; + } + } else +#endif // CONFIG_PRED_FILTER + if ((omv_row | omv_col) & 15) { + xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15, + omv_row & 15, dst_u, dst_uvstride); + xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15, + omv_row & 15, dst_v, dst_uvstride); + } else { + vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); + vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); + } +} + +void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride) { + vp9_build_2nd_inter16x16_predictors_mby(xd, dst_y, dst_ystride); + vp9_build_2nd_inter16x16_predictors_mbuv(xd, dst_u, dst_v, dst_uvstride); +} + +static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { + int i; + MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi; + BLOCKD *blockd = xd->block; + + if (xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4) { + blockd[ 0].bmi = xd->mode_info_context->bmi[ 0]; + blockd[ 2].bmi = xd->mode_info_context->bmi[ 2]; + blockd[ 8].bmi = xd->mode_info_context->bmi[ 8]; + blockd[10].bmi = xd->mode_info_context->bmi[10]; + + if (mbmi->need_to_clamp_mvs) { + clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd); + clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd); + if (mbmi->second_ref_frame > 0) { + clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd); + clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd); + } + } + + + vp9_build_inter_predictors4b(xd, &blockd[ 0], 16); + vp9_build_inter_predictors4b(xd, &blockd[ 2], 16); + vp9_build_inter_predictors4b(xd, &blockd[ 8], 16); + vp9_build_inter_predictors4b(xd, &blockd[10], 16); + + if (mbmi->second_ref_frame > 0) { + vp9_build_2nd_inter_predictors4b(xd, &blockd[ 0], 16); + vp9_build_2nd_inter_predictors4b(xd, &blockd[ 2], 16); + vp9_build_2nd_inter_predictors4b(xd, &blockd[ 8], 16); + vp9_build_2nd_inter_predictors4b(xd, &blockd[10], 16); + } + } else { + for (i = 0; i < 16; i += 2) { + BLOCKD *d0 = &blockd[i]; + BLOCKD *d1 = &blockd[i + 1]; + + blockd[i + 0].bmi = xd->mode_info_context->bmi[i + 0]; + blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1]; + + if (mbmi->need_to_clamp_mvs) { + clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd); + clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd); + if (mbmi->second_ref_frame > 0) { + clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd); + clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd); + } + } + + if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) + build_inter_predictors2b(xd, d0, 16); + else { + vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict); + vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict); + } + + if (mbmi->second_ref_frame > 0) { + vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg); + vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg); + } + } + } + + for (i = 16; i < 24; i += 2) { + BLOCKD *d0 = &blockd[i]; + BLOCKD *d1 = &blockd[i + 1]; + + if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) + build_inter_predictors2b(xd, d0, 8); 
+ else { + vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict); + vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict); + } + + if (mbmi->second_ref_frame > 0) { + vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg); + vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg); + } + } +} + +static +void build_4x4uvmvs(MACROBLOCKD *xd) { + int i, j; + BLOCKD *blockd = xd->block; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + int yoffset = i * 8 + j * 2; + int uoffset = 16 + i * 2 + j; + int voffset = 20 + i * 2 + j; + + int temp; + + temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row + + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row + + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row + + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row; + + if (temp < 0) temp -= 4; + else temp += 4; + + blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) & + xd->fullpixel_mask; + + temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col + + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col + + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col + + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col; + + if (temp < 0) temp -= 4; + else temp += 4; + + blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) & + xd->fullpixel_mask; + + // if (x->mode_info_context->mbmi.need_to_clamp_mvs) + clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd); + + // if (x->mode_info_context->mbmi.need_to_clamp_mvs) + clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd); + + blockd[voffset].bmi.as_mv.first.as_mv.row = + blockd[uoffset].bmi.as_mv.first.as_mv.row; + blockd[voffset].bmi.as_mv.first.as_mv.col = + blockd[uoffset].bmi.as_mv.first.as_mv.col; + + if (xd->mode_info_context->mbmi.second_ref_frame > 0) { + temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row + + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row + + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row + + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row; + + if (temp < 0) { + temp -= 4; + } else { + temp += 4; + } + + blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) & + xd->fullpixel_mask; + + temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col + + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col + + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col + + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col; + + if (temp < 0) { + temp -= 4; + } else { + temp += 4; + } + + blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) & + xd->fullpixel_mask; + + // if (mbmi->need_to_clamp_mvs) + clamp_uvmv_to_umv_border( + &blockd[uoffset].bmi.as_mv.second.as_mv, xd); + + // if (mbmi->need_to_clamp_mvs) + clamp_uvmv_to_umv_border( + &blockd[uoffset].bmi.as_mv.second.as_mv, xd); + + blockd[voffset].bmi.as_mv.second.as_mv.row = + blockd[uoffset].bmi.as_mv.second.as_mv.row; + blockd[voffset].bmi.as_mv.second.as_mv.col = + blockd[uoffset].bmi.as_mv.second.as_mv.col; + } + } + } +} + +void vp9_build_inter_predictors_mb(MACROBLOCKD *xd) { + if (xd->mode_info_context->mbmi.mode != SPLITMV) { + vp9_build_1st_inter16x16_predictors_mb(xd, xd->predictor, + &xd->predictor[256], + &xd->predictor[320], 16, 8); + + if (xd->mode_info_context->mbmi.second_ref_frame > 0) { + /* 256 = offset of U plane in Y+U+V buffer; + * 320 = offset of V plane in Y+U+V buffer. 
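+       * In other words, the per-MB predictor buffer is laid out as
+       *   xd->predictor[  0..255]  Y, 16x16 with stride 16,
+       *   xd->predictor[256..319]  U, 8x8 with stride 8,
+       *   xd->predictor[320..383]  V, 8x8 with stride 8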
+ * (256=16x16, 320=16x16+8x8). */ + vp9_build_2nd_inter16x16_predictors_mb(xd, xd->predictor, + &xd->predictor[256], + &xd->predictor[320], 16, 8); + } +#if CONFIG_COMP_INTERINTRA_PRED + else if (xd->mode_info_context->mbmi.second_ref_frame == INTRA_FRAME) { + vp9_build_interintra_16x16_predictors_mb(xd, xd->predictor, + &xd->predictor[256], + &xd->predictor[320], 16, 8); + } +#endif + } else { + build_4x4uvmvs(xd); + build_inter4x4_predictors_mb(xd); + } +} diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h new file mode 100644 index 000000000..25a8adc3b --- /dev/null +++ b/vp9/common/vp9_reconinter.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef __INC_RECONINTER_H +#define __INC_RECONINTER_H + +#include "vp9_onyxc_int.h" + +extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, + unsigned char *dst_y, + int dst_ystride, + int clamp_mvs); + +extern void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_uvstride); + +extern void vp9_build_1st_inter16x16_predictors_mb(MACROBLOCKD *xd, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); + +extern void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd, + unsigned char *dst_y, + int dst_ystride); + +extern void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_uvstride); + +extern void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); + +#if CONFIG_SUPERBLOCKS +extern void vp9_build_inter32x32_predictors_sb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); +#endif + +extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd); + +extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, + vp9_subpix_fn_t sppf); + +extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, + vp9_subpix_fn_t sppf); + +extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, + int pitch); + +extern void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd, + BLOCKD *d, int pitch); + +extern void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd); + +extern void vp9_setup_interp_filters(MACROBLOCKD *xd, + INTERPOLATIONFILTERTYPE filter, + VP9_COMMON *cm); + +#endif // __INC_RECONINTER_H diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c new file mode 100644 index 000000000..8199b94ae --- /dev/null +++ b/vp9/common/vp9_reconintra.c @@ -0,0 +1,819 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+#include "vpx_ports/config.h"
+#include "vp9_rtcd.h"
+#include "vp9_reconintra.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* For skip_recon_mb(), add vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd)
+ * and vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd).
+ */
+
+static void d27_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                          uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c, h, w, v;
+  int a, b;
+  r = 0;
+  for (c = 0; c < n - 2; c++) {
+    if (c & 1)
+      a = yleft_col[r + 1];
+    else
+      a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
+    b = yabove_row[c + 2];
+    ypred_ptr[c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+  }
+  for (r = 1; r < n / 2 - 1; r++) {
+    for (c = 0; c < n - 2 - 2 * r; c++) {
+      if (c & 1)
+        a = yleft_col[r + 1];
+      else
+        a = (yleft_col[r] + yleft_col[r + 1] + 1) >> 1;
+      b = ypred_ptr[(r - 1) * y_stride + c + 2];
+      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+    }
+  }
+  for (; r < n - 1; ++r) {
+    for (c = 0; c < n; c++) {
+      v = (c & 1 ? yleft_col[r + 1] : (yleft_col[r] + yleft_col[r + 1] + 1) >> 1);
+      h = r - c / 2;
+      ypred_ptr[h * y_stride + c] = v;
+    }
+  }
+  c = 0;
+  r = n - 1;
+  ypred_ptr[r * y_stride] = (ypred_ptr[(r - 1) * y_stride] +
+                             yleft_col[r] + 1) >> 1;
+  for (r = n - 2; r >= n / 2; --r) {
+    w = c + (n - 1 - r) * 2;
+    ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
+                                   ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+  }
+  for (c = 1; c < n; c++) {
+    for (r = n - 1; r >= n / 2 + c / 2; --r) {
+      w = c + (n - 1 - r) * 2;
+      ypred_ptr[r * y_stride + w] = (ypred_ptr[(r - 1) * y_stride + w] +
+                                     ypred_ptr[r * y_stride + w - 1] + 1) >> 1;
+    }
+  }
+}
+
+static void d63_predictor(uint8_t *ypred_ptr, int y_stride, int n,
+                          uint8_t *yabove_row, uint8_t *yleft_col) {
+  int r, c, h, w, v;
+  int a, b;
+  c = 0;
+  for (r = 0; r < n - 2; r++) {
+    if (r & 1)
+      a = yabove_row[c + 1];
+    else
+      a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
+    b = yleft_col[r + 2];
+    ypred_ptr[r * y_stride] = (2 * a + (r + 1) * b + (r + 3) / 2) / (r + 3);
+  }
+  for (c = 1; c < n / 2 - 1; c++) {
+    for (r = 0; r < n - 2 - 2 * c; r++) {
+      if (r & 1)
+        a = yabove_row[c + 1];
+      else
+        a = (yabove_row[c] + yabove_row[c + 1] + 1) >> 1;
+      b = ypred_ptr[(r + 2) * y_stride + c - 1];
+      ypred_ptr[r * y_stride + c] = (2 * a + (c + 1) * b + (c + 3) / 2) / (c + 3);
+    }
+  }
+  for (; c < n - 1; ++c) {
+    for (r = 0; r < n; r++) {
+      v = (r & 1 ?
yabove_row[c + 1] : (yabove_row[c] + yabove_row[c + 1] + 1) >> 1); + w = c - r / 2; + ypred_ptr[r * y_stride + w] = v; + } + } + r = 0; + c = n - 1; + ypred_ptr[c] = (ypred_ptr[(c - 1)] + yabove_row[c] + 1) >> 1; + for (c = n - 2; c >= n / 2; --c) { + h = r + (n - 1 - c) * 2; + ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] + + ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1; + } + for (r = 1; r < n; r++) { + for (c = n - 1; c >= n / 2 + r / 2; --c) { + h = r + (n - 1 - c) * 2; + ypred_ptr[h * y_stride + c] = (ypred_ptr[h * y_stride + c - 1] + + ypred_ptr[(h - 1) * y_stride + c] + 1) >> 1; + } + } +} + +static void d45_predictor(uint8_t *ypred_ptr, int y_stride, int n, + uint8_t *yabove_row, uint8_t *yleft_col) { + int r, c; + for (r = 0; r < n - 1; ++r) { + for (c = 0; c <= r; ++c) { + ypred_ptr[(r - c) * y_stride + c] = + (yabove_row[r + 1] * (c + 1) + + yleft_col[r + 1] * (r - c + 1) + r / 2 + 1) / (r + 2); + } + } + for (c = 0; c <= r; ++c) { + int yabove_ext = yabove_row[r]; // 2*yabove_row[r] - yabove_row[r-1]; + int yleft_ext = yleft_col[r]; // 2*yleft_col[r] - yleft_col[r-1]; + yabove_ext = (yabove_ext > 255 ? 255 : (yabove_ext < 0 ? 0 : yabove_ext)); + yleft_ext = (yleft_ext > 255 ? 255 : (yleft_ext < 0 ? 0 : yleft_ext)); + ypred_ptr[(r - c) * y_stride + c] = + (yabove_ext * (c + 1) + + yleft_ext * (r - c + 1) + r / 2 + 1) / (r + 2); + } + for (r = 1; r < n; ++r) { + for (c = n - r; c < n; ++c) + ypred_ptr[r * y_stride + c] = (ypred_ptr[(r - 1) * y_stride + c] + + ypred_ptr[r * y_stride + c - 1] + 1) >> 1; + } +} + +static void d117_predictor(uint8_t *ypred_ptr, int y_stride, int n, + uint8_t *yabove_row, uint8_t *yleft_col) { + int r, c; + for (c = 0; c < n; c++) + ypred_ptr[c] = (yabove_row[c - 1] + yabove_row[c] + 1) >> 1; + ypred_ptr += y_stride; + for (c = 0; c < n; c++) + ypred_ptr[c] = yabove_row[c - 1]; + ypred_ptr += y_stride; + for (r = 2; r < n; ++r) { + ypred_ptr[0] = yleft_col[r - 2]; + for (c = 1; c < n; c++) + ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1]; + ypred_ptr += y_stride; + } +} + +static void d135_predictor(uint8_t *ypred_ptr, int y_stride, int n, + uint8_t *yabove_row, uint8_t *yleft_col) { + int r, c; + ypred_ptr[0] = yabove_row[-1]; + for (c = 1; c < n; c++) + ypred_ptr[c] = yabove_row[c - 1]; + for (r = 1; r < n; ++r) + ypred_ptr[r * y_stride] = yleft_col[r - 1]; + + ypred_ptr += y_stride; + for (r = 1; r < n; ++r) { + for (c = 1; c < n; c++) { + ypred_ptr[c] = ypred_ptr[-y_stride + c - 1]; + } + ypred_ptr += y_stride; + } +} + +static void d153_predictor(uint8_t *ypred_ptr, int y_stride, int n, + uint8_t *yabove_row, uint8_t *yleft_col) { + int r, c; + ypred_ptr[0] = (yabove_row[-1] + yleft_col[0] + 1) >> 1; + for (r = 1; r < n; r++) + ypred_ptr[r * y_stride] = (yleft_col[r - 1] + yleft_col[r] + 1) >> 1; + ypred_ptr++; + ypred_ptr[0] = yabove_row[-1]; + for (r = 1; r < n; r++) + ypred_ptr[r * y_stride] = yleft_col[r - 1]; + ypred_ptr++; + + for (c = 0; c < n - 2; c++) + ypred_ptr[c] = yabove_row[c]; + ypred_ptr += y_stride; + for (r = 1; r < n; ++r) { + for (c = 0; c < n - 2; c++) + ypred_ptr[c] = ypred_ptr[-y_stride + c - 2]; + ypred_ptr += y_stride; + } +} + +static void corner_predictor(unsigned char *ypred_ptr, int y_stride, int n, + unsigned char *yabove_row, + unsigned char *yleft_col) { + int mh, mv, maxgradh, maxgradv, x, y, nx, ny; + int i, j; + int top_left = yabove_row[-1]; + mh = mv = 0; + maxgradh = yabove_row[1] - top_left; + maxgradv = yleft_col[1] - top_left; + for (i = 2; i < n; ++i) { + int gh = yabove_row[i] - 
yabove_row[i - 2]; + int gv = yleft_col[i] - yleft_col[i - 2]; + if (gh > maxgradh) { + maxgradh = gh; + mh = i - 1; + } + if (gv > maxgradv) { + maxgradv = gv; + mv = i - 1; + } + } + nx = mh + mv + 3; + ny = 2 * n + 1 - nx; + + x = top_left; + for (i = 0; i <= mh; ++i) x += yabove_row[i]; + for (i = 0; i <= mv; ++i) x += yleft_col[i]; + x += (nx >> 1); + x /= nx; + y = 0; + for (i = mh + 1; i < n; ++i) y += yabove_row[i]; + for (i = mv + 1; i < n; ++i) y += yleft_col[i]; + y += (ny >> 1); + y /= ny; + + for (i = 0; i < n; ++i) { + for (j = 0; j < n; ++j) + ypred_ptr[j] = (i <= mh && j <= mv ? x : y); + ypred_ptr += y_stride; + } +} + +void vp9_recon_intra_mbuv(MACROBLOCKD *xd) { + int i; + for (i = 16; i < 24; i += 2) { + BLOCKD *b = &xd->block[i]; + vp9_recon2b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + } +} + +void vp9_build_intra_predictors_internal(unsigned char *src, int src_stride, + unsigned char *ypred_ptr, + int y_stride, int mode, int bsize, + int up_available, int left_available) { + + unsigned char *yabove_row = src - src_stride; + unsigned char yleft_col[32]; + unsigned char ytop_left = yabove_row[-1]; + int r, c, i; + + for (i = 0; i < bsize; i++) { + yleft_col[i] = src[i * src_stride - 1]; + } + + /* for Y */ + switch (mode) { + case DC_PRED: { + int expected_dc; + int i; + int shift; + int average = 0; + int log2_bsize_minus_1; + + assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32); + if (bsize == 4) { + log2_bsize_minus_1 = 1; + } else if (bsize == 8) { + log2_bsize_minus_1 = 2; + } else if (bsize == 16) { + log2_bsize_minus_1 = 3; + } else /* bsize == 32 */ { + log2_bsize_minus_1 = 4; + } + + if (up_available || left_available) { + if (up_available) { + for (i = 0; i < bsize; i++) { + average += yabove_row[i]; + } + } + + if (left_available) { + for (i = 0; i < bsize; i++) { + average += yleft_col[i]; + } + } + shift = log2_bsize_minus_1 + up_available + left_available; + expected_dc = (average + (1 << (shift - 1))) >> shift; + } else { + expected_dc = 128; + } + + for (r = 0; r < bsize; r++) { + vpx_memset(ypred_ptr, expected_dc, bsize); + ypred_ptr += y_stride; + } + } + break; + case V_PRED: { + for (r = 0; r < bsize; r++) { + memcpy(ypred_ptr, yabove_row, bsize); + ypred_ptr += y_stride; + } + } + break; + case H_PRED: { + for (r = 0; r < bsize; r++) { + vpx_memset(ypred_ptr, yleft_col[r], bsize); + ypred_ptr += y_stride; + } + } + break; + case TM_PRED: { + for (r = 0; r < bsize; r++) { + for (c = 0; c < bsize; c++) { + int pred = yleft_col[r] + yabove_row[ c] - ytop_left; + + if (pred < 0) + pred = 0; + + if (pred > 255) + pred = 255; + + ypred_ptr[c] = pred; + } + + ypred_ptr += y_stride; + } + } + break; + case D45_PRED: { + d45_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); + } + break; + case D135_PRED: { + d135_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); + } + break; + case D117_PRED: { + d117_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); + } + break; + case D153_PRED: { + d153_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); + } + break; + case D27_PRED: { + d27_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); + } + break; + case D63_PRED: { + d63_predictor(ypred_ptr, y_stride, bsize, yabove_row, yleft_col); + } + break; + case I8X8_PRED: + case B_PRED: + case NEARESTMV: + case NEARMV: + case ZEROMV: + case NEWMV: + case SPLITMV: + case MB_MODE_COUNT: + break; + } +} + +#if CONFIG_COMP_INTERINTRA_PRED +static void 
combine_interintra(MB_PREDICTION_MODE mode, + unsigned char *interpred, + int interstride, + unsigned char *intrapred, + int intrastride, + int size) { + // TODO(debargha): Explore different ways of combining predictors + // or designing the tables below + static const int scale_bits = 8; + static const int scale_max = 1 << scale_bits; + static const int scale_round = (1 << scale_bits) - 1; + // This table is a function A + B*exp(-kx), where x is hor. index + static const int weights1d[32] = { + 128, 122, 116, 111, 107, 103, 99, 96, + 93, 90, 88, 85, 83, 81, 80, 78, + 77, 76, 75, 74, 73, 72, 71, 70, + 70, 69, 69, 68, 68, 68, 67, 67, + }; + // This table is a function A + B*exp(-k.sqrt(xy)), where x, y are + // hor. and vert. indices + static const int weights2d[1024] = { + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 122, 120, 118, 116, 115, 114, 113, + 112, 111, 111, 110, 109, 109, 108, 107, + 107, 106, 106, 105, 105, 104, 104, 104, + 103, 103, 102, 102, 102, 101, 101, 101, + 128, 120, 116, 114, 112, 111, 109, 108, + 107, 106, 105, 104, 103, 102, 102, 101, + 100, 100, 99, 99, 98, 97, 97, 96, + 96, 96, 95, 95, 94, 94, 93, 93, + 128, 118, 114, 111, 109, 107, 106, 104, + 103, 102, 101, 100, 99, 98, 97, 97, + 96, 95, 95, 94, 93, 93, 92, 92, + 91, 91, 90, 90, 90, 89, 89, 88, + 128, 116, 112, 109, 107, 105, 103, 102, + 100, 99, 98, 97, 96, 95, 94, 93, + 93, 92, 91, 91, 90, 90, 89, 89, + 88, 88, 87, 87, 86, 86, 85, 85, + 128, 115, 111, 107, 105, 103, 101, 99, + 98, 97, 96, 94, 93, 93, 92, 91, + 90, 89, 89, 88, 88, 87, 86, 86, + 85, 85, 84, 84, 84, 83, 83, 82, + 128, 114, 109, 106, 103, 101, 99, 97, + 96, 95, 93, 92, 91, 90, 90, 89, + 88, 87, 87, 86, 85, 85, 84, 84, + 83, 83, 82, 82, 82, 81, 81, 80, + 128, 113, 108, 104, 102, 99, 97, 96, + 94, 93, 92, 91, 90, 89, 88, 87, + 86, 85, 85, 84, 84, 83, 83, 82, + 82, 81, 81, 80, 80, 79, 79, 79, + 128, 112, 107, 103, 100, 98, 96, 94, + 93, 91, 90, 89, 88, 87, 86, 85, + 85, 84, 83, 83, 82, 82, 81, 80, + 80, 80, 79, 79, 78, 78, 78, 77, + 128, 111, 106, 102, 99, 97, 95, 93, + 91, 90, 89, 88, 87, 86, 85, 84, + 83, 83, 82, 81, 81, 80, 80, 79, + 79, 78, 78, 77, 77, 77, 76, 76, + 128, 111, 105, 101, 98, 96, 93, 92, + 90, 89, 88, 86, 85, 84, 84, 83, + 82, 81, 81, 80, 80, 79, 79, 78, + 78, 77, 77, 76, 76, 76, 75, 75, + 128, 110, 104, 100, 97, 94, 92, 91, + 89, 88, 86, 85, 84, 83, 83, 82, + 81, 80, 80, 79, 79, 78, 78, 77, + 77, 76, 76, 75, 75, 75, 74, 74, + 128, 109, 103, 99, 96, 93, 91, 90, + 88, 87, 85, 84, 83, 82, 82, 81, + 80, 79, 79, 78, 78, 77, 77, 76, + 76, 75, 75, 75, 74, 74, 74, 73, + 128, 109, 102, 98, 95, 93, 90, 89, + 87, 86, 84, 83, 82, 81, 81, 80, + 79, 78, 78, 77, 77, 76, 76, 75, + 75, 75, 74, 74, 73, 73, 73, 73, + 128, 108, 102, 97, 94, 92, 90, 88, + 86, 85, 84, 83, 82, 81, 80, 79, + 78, 78, 77, 77, 76, 76, 75, 75, + 74, 74, 73, 73, 73, 73, 72, 72, + 128, 107, 101, 97, 93, 91, 89, 87, + 85, 84, 83, 82, 81, 80, 79, 78, + 78, 77, 76, 76, 75, 75, 74, 74, + 74, 73, 73, 73, 72, 72, 72, 71, + 128, 107, 100, 96, 93, 90, 88, 86, + 85, 83, 82, 81, 80, 79, 78, 78, + 77, 76, 76, 75, 75, 74, 74, 73, + 73, 73, 72, 72, 72, 71, 71, 71, + 128, 106, 100, 95, 92, 89, 87, 85, + 84, 83, 81, 80, 79, 78, 78, 77, + 76, 76, 75, 75, 74, 74, 73, 73, + 72, 72, 72, 72, 71, 71, 71, 70, + 128, 106, 99, 95, 91, 89, 87, 85, + 83, 82, 81, 80, 79, 78, 77, 76, + 76, 75, 75, 74, 74, 73, 73, 72, + 72, 72, 71, 71, 71, 71, 70, 70, + 128, 105, 99, 94, 91, 88, 
86, 84, + 83, 81, 80, 79, 78, 77, 77, 76, + 75, 75, 74, 74, 73, 73, 72, 72, + 72, 71, 71, 71, 70, 70, 70, 70, + 128, 105, 98, 93, 90, 88, 85, 84, + 82, 81, 80, 79, 78, 77, 76, 75, + 75, 74, 74, 73, 73, 72, 72, 71, + 71, 71, 71, 70, 70, 70, 70, 69, + 128, 104, 97, 93, 90, 87, 85, 83, + 82, 80, 79, 78, 77, 76, 76, 75, + 74, 74, 73, 73, 72, 72, 71, 71, + 71, 70, 70, 70, 70, 69, 69, 69, + 128, 104, 97, 92, 89, 86, 84, 83, + 81, 80, 79, 78, 77, 76, 75, 74, + 74, 73, 73, 72, 72, 71, 71, 71, + 70, 70, 70, 70, 69, 69, 69, 69, + 128, 104, 96, 92, 89, 86, 84, 82, + 80, 79, 78, 77, 76, 75, 75, 74, + 73, 73, 72, 72, 71, 71, 71, 70, + 70, 70, 70, 69, 69, 69, 69, 68, + 128, 103, 96, 91, 88, 85, 83, 82, + 80, 79, 78, 77, 76, 75, 74, 74, + 73, 72, 72, 72, 71, 71, 70, 70, + 70, 70, 69, 69, 69, 69, 68, 68, + 128, 103, 96, 91, 88, 85, 83, 81, + 80, 78, 77, 76, 75, 75, 74, 73, + 73, 72, 72, 71, 71, 70, 70, 70, + 70, 69, 69, 69, 69, 68, 68, 68, + 128, 102, 95, 90, 87, 84, 82, 81, + 79, 78, 77, 76, 75, 74, 73, 73, + 72, 72, 71, 71, 71, 70, 70, 70, + 69, 69, 69, 69, 68, 68, 68, 68, + 128, 102, 95, 90, 87, 84, 82, 80, + 79, 77, 76, 75, 75, 74, 73, 73, + 72, 72, 71, 71, 70, 70, 70, 69, + 69, 69, 69, 68, 68, 68, 68, 68, + 128, 102, 94, 90, 86, 84, 82, 80, + 78, 77, 76, 75, 74, 73, 73, 72, + 72, 71, 71, 70, 70, 70, 69, 69, + 69, 69, 68, 68, 68, 68, 68, 67, + 128, 101, 94, 89, 86, 83, 81, 79, + 78, 77, 76, 75, 74, 73, 73, 72, + 71, 71, 71, 70, 70, 69, 69, 69, + 69, 68, 68, 68, 68, 68, 67, 67, + 128, 101, 93, 89, 85, 83, 81, 79, + 78, 76, 75, 74, 74, 73, 72, 72, + 71, 71, 70, 70, 70, 69, 69, 69, + 68, 68, 68, 68, 68, 67, 67, 67, + 128, 101, 93, 88, 85, 82, 80, 79, + 77, 76, 75, 74, 73, 73, 72, 71, + 71, 70, 70, 70, 69, 69, 69, 68, + 68, 68, 68, 68, 67, 67, 67, 67, + }; + int size_scale = (size == 32 ? 1 : + size == 16 ? 2 : + size == 8 ? 
4 : 8); + int i, j; + switch (mode) { + case V_PRED: + for (i = 0; i < size; ++i) { + for (j = 0; j < size; ++j) { + int k = i * interstride + j; + int scale = weights1d[i * size_scale]; + interpred[k] = + ((scale_max - scale) * interpred[k] + + scale * intrapred[i * intrastride + j] + scale_round) + >> scale_bits; + } + } + break; + + case H_PRED: + for (i = 0; i < size; ++i) { + for (j = 0; j < size; ++j) { + int k = i * interstride + j; + int scale = weights1d[j * size_scale]; + interpred[k] = + ((scale_max - scale) * interpred[k] + + scale * intrapred[i * intrastride + j] + scale_round) + >> scale_bits; + } + } + break; + + case D63_PRED: + case D117_PRED: + for (i = 0; i < size; ++i) { + for (j = 0; j < size; ++j) { + int k = i * interstride + j; + int scale = (weights2d[i * size_scale * 32 + j * size_scale] + + weights1d[i * size_scale]) >> 1; + interpred[k] = + ((scale_max - scale) * interpred[k] + + scale * intrapred[i * intrastride + j] + scale_round) + >> scale_bits; + } + } + break; + + case D27_PRED: + case D153_PRED: + for (i = 0; i < size; ++i) { + for (j = 0; j < size; ++j) { + int k = i * interstride + j; + int scale = (weights2d[i * size_scale * 32 + j * size_scale] + + weights1d[j * size_scale]) >> 1; + interpred[k] = + ((scale_max - scale) * interpred[k] + + scale * intrapred[i * intrastride + j] + scale_round) + >> scale_bits; + } + } + break; + + case D135_PRED: + for (i = 0; i < size; ++i) { + for (j = 0; j < size; ++j) { + int k = i * interstride + j; + int scale = weights2d[i * size_scale * 32 + j * size_scale]; + interpred[k] = + ((scale_max - scale) * interpred[k] + + scale * intrapred[i * intrastride + j] + scale_round) + >> scale_bits; + } + } + break; + + case D45_PRED: + case DC_PRED: + case TM_PRED: + default: + // simple average + for (i = 0; i < size; ++i) { + for (j = 0; j < size; ++j) { + int k = i * interstride + j; + interpred[k] = (interpred[k] + intrapred[i * intrastride + j]) >> 1; + } + } + break; + } +} + +void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd, + unsigned char *ypred, + unsigned char *upred, + unsigned char *vpred, + int ystride, int uvstride) { + vp9_build_interintra_16x16_predictors_mby(xd, ypred, ystride); + vp9_build_interintra_16x16_predictors_mbuv(xd, upred, vpred, uvstride); +} + +void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd, + unsigned char *ypred, + int ystride) { + static const int scale_bits = 6; + unsigned char intrapredictor[256]; + int i, j; + vp9_build_intra_predictors_internal( + xd->dst.y_buffer, xd->dst.y_stride, + intrapredictor, 16, + xd->mode_info_context->mbmi.interintra_mode, 16, + xd->up_available, xd->left_available); + combine_interintra(xd->mode_info_context->mbmi.interintra_mode, + ypred, ystride, intrapredictor, 16, 16); +} + +void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd, + unsigned char *upred, + unsigned char *vpred, + int uvstride) { + int i, j; + unsigned char uintrapredictor[64]; + unsigned char vintrapredictor[64]; + vp9_build_intra_predictors_internal( + xd->dst.u_buffer, xd->dst.uv_stride, + uintrapredictor, 8, + xd->mode_info_context->mbmi.interintra_uv_mode, 8, + xd->up_available, xd->left_available); + vp9_build_intra_predictors_internal( + xd->dst.v_buffer, xd->dst.uv_stride, + vintrapredictor, 8, + xd->mode_info_context->mbmi.interintra_uv_mode, 8, + xd->up_available, xd->left_available); + combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, + upred, uvstride, uintrapredictor, 8, 8); + 
combine_interintra(xd->mode_info_context->mbmi.interintra_uv_mode, + vpred, uvstride, vintrapredictor, 8, 8); +} +#endif + +void vp9_build_intra_predictors_mby(MACROBLOCKD *xd) { + vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, + xd->predictor, 16, + xd->mode_info_context->mbmi.mode, 16, + xd->up_available, xd->left_available); +} + +void vp9_build_intra_predictors_mby_s(MACROBLOCKD *xd) { + vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, + xd->dst.y_buffer, xd->dst.y_stride, + xd->mode_info_context->mbmi.mode, 16, + xd->up_available, xd->left_available); +} + +#if CONFIG_SUPERBLOCKS +void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd) { + vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, + xd->dst.y_buffer, xd->dst.y_stride, + xd->mode_info_context->mbmi.mode, 32, + xd->up_available, xd->left_available); +} +#endif + +#if CONFIG_COMP_INTRA_PRED +void vp9_build_comp_intra_predictors_mby(MACROBLOCKD *xd) { + unsigned char predictor[2][256]; + int i; + + vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, + predictor[0], 16, + xd->mode_info_context->mbmi.mode, + 16, xd->up_available, + xd->left_available); + vp9_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, + predictor[1], 16, + xd->mode_info_context->mbmi.second_mode, + 16, xd->up_available, + xd->left_available); + + for (i = 0; i < 256; i++) { + xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1; + } +} +#endif + +void vp9_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd, + unsigned char *upred_ptr, + unsigned char *vpred_ptr, + int uv_stride, + int mode, int bsize) { + vp9_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride, + upred_ptr, uv_stride, mode, bsize, + xd->up_available, xd->left_available); + vp9_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride, + vpred_ptr, uv_stride, mode, bsize, + xd->up_available, xd->left_available); +} + +void vp9_build_intra_predictors_mbuv(MACROBLOCKD *xd) { + vp9_build_intra_predictors_mbuv_internal(xd, &xd->predictor[256], + &xd->predictor[320], 8, + xd->mode_info_context->mbmi.uv_mode, + 8); +} + +void vp9_build_intra_predictors_mbuv_s(MACROBLOCKD *xd) { + vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer, + xd->dst.v_buffer, + xd->dst.uv_stride, + xd->mode_info_context->mbmi.uv_mode, + 8); +} + +#if CONFIG_SUPERBLOCKS +void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd) { + vp9_build_intra_predictors_mbuv_internal(xd, xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.uv_stride, + xd->mode_info_context->mbmi.uv_mode, + 16); +} +#endif + +#if CONFIG_COMP_INTRA_PRED +void vp9_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) { + unsigned char predictor[2][2][64]; + int i; + + vp9_build_intra_predictors_mbuv_internal( + xd, predictor[0][0], predictor[1][0], 8, + xd->mode_info_context->mbmi.uv_mode, 8); + vp9_build_intra_predictors_mbuv_internal( + xd, predictor[0][1], predictor[1][1], 8, + xd->mode_info_context->mbmi.second_uv_mode, 8); + for (i = 0; i < 64; i++) { + xd->predictor[256 + i] = (predictor[0][0][i] + predictor[0][1][i] + 1) >> 1; + xd->predictor[256 + 64 + i] = (predictor[1][0][i] + + predictor[1][1][i] + 1) >> 1; + } +} +#endif + +void vp9_intra8x8_predict(BLOCKD *xd, + int mode, + unsigned char *predictor) { + vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst, + xd->dst_stride, predictor, 16, + mode, 8, 1, 1); +} + +#if CONFIG_COMP_INTRA_PRED +void vp9_comp_intra8x8_predict(BLOCKD *xd, + int mode, 
int second_mode,
+                               unsigned char *out_predictor) {
+  unsigned char predictor[2][8 * 16];
+  int i, j;
+
+  vp9_intra8x8_predict(xd, mode, predictor[0]);
+  vp9_intra8x8_predict(xd, second_mode, predictor[1]);
+
+  for (i = 0; i < 8 * 16; i += 16) {
+    for (j = i; j < i + 8; j++) {
+      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+    }
+  }
+}
+#endif
+
+void vp9_intra_uv4x4_predict(BLOCKD *xd,
+                             int mode,
+                             unsigned char *predictor) {
+  vp9_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
+                                      xd->dst_stride, predictor, 8,
+                                      mode, 4, 1, 1);
+}
+
+#if CONFIG_COMP_INTRA_PRED
+void vp9_comp_intra_uv4x4_predict(BLOCKD *xd,
+                                  int mode, int mode2,
+                                  unsigned char *out_predictor) {
+  unsigned char predictor[2][8 * 4];
+  int i, j;
+
+  vp9_intra_uv4x4_predict(xd, mode, predictor[0]);
+  vp9_intra_uv4x4_predict(xd, mode2, predictor[1]);
+
+  for (i = 0; i < 4 * 8; i += 8) {
+    for (j = i; j < i + 4; j++) {
+      out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1;
+    }
+  }
+}
+#endif
+
+/* TODO: try different ways of using the Y-UV mode correlation.
+   The current code assumes that a UV 4x4 block uses the same mode
+   as the corresponding Y 8x8 area.
+ */
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
new file mode 100644
index 000000000..dd35b06ad
--- /dev/null
+++ b/vp9/common/vp9_reconintra.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef __INC_RECONINTRA_H
+#define __INC_RECONINTRA_H
+
+#include "vp9_blockd.h"
+
+extern void vp9_recon_intra_mbuv(MACROBLOCKD *xd);
+extern B_PREDICTION_MODE vp9_find_dominant_direction(unsigned char *ptr,
+                                                     int stride, int n);
+extern B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x);
+#if CONFIG_COMP_INTERINTRA_PRED
+extern void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd,
+                                                     unsigned char *ypred,
+                                                     unsigned char *upred,
+                                                     unsigned char *vpred,
+                                                     int ystride,
+                                                     int uvstride);
+extern void vp9_build_interintra_16x16_predictors_mby(MACROBLOCKD *xd,
+                                                      unsigned char *ypred,
+                                                      int ystride);
+extern void vp9_build_interintra_16x16_predictors_mbuv(MACROBLOCKD *xd,
+                                                       unsigned char *upred,
+                                                       unsigned char *vpred,
+                                                       int uvstride);
+#endif
+
+#endif  // __INC_RECONINTRA_H
diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c
new file mode 100644
index 000000000..d5203acd0
--- /dev/null
+++ b/vp9/common/vp9_reconintra4x4.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + + +#include "vpx_ports/config.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9_reconintra.h" +#include "vp9_rtcd.h" + +#if CONFIG_NEWBINTRAMODES +static int find_grad_measure(unsigned char *x, int stride, int n, int t, + int dx, int dy) { + int i, j; + int count = 0, gsum = 0, gdiv; + /* TODO: Make this code more efficient by breaking up into two loops */ + for (i = -t; i < n; ++i) + for (j = -t; j < n; ++j) { + int g; + if (i >= 0 && j >= 0) continue; + if (i + dy >= 0 && j + dx >= 0) continue; + if (i + dy < -t || i + dy >= n || j + dx < -t || j + dx >= n) continue; + g = abs(x[(i + dy) * stride + j + dx] - x[i * stride + j]); + gsum += g * g; + count++; + } + gdiv = (dx * dx + dy * dy) * count; + return ((gsum << 8) + (gdiv >> 1)) / gdiv; +} + +#if CONTEXT_PRED_REPLACEMENTS == 6 +B_PREDICTION_MODE vp9_find_dominant_direction( + unsigned char *ptr, int stride, int n) { + int g[8], i, imin, imax; + g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); + g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1); + g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); + g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); + g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1); + g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); + imin = 1; + for (i = 2; i < 8; i += 1 + (i == 3)) + imin = (g[i] < g[imin] ? i : imin); + imax = 1; + for (i = 2; i < 8; i += 1 + (i == 3)) + imax = (g[i] > g[imax] ? i : imax); + /* + printf("%d %d %d %d %d %d = %d %d\n", + g[1], g[2], g[3], g[5], g[6], g[7], imin, imax); + */ + switch (imin) { + case 1: + return B_HD_PRED; + case 2: + return B_RD_PRED; + case 3: + return B_VR_PRED; + case 5: + return B_VL_PRED; + case 6: + return B_LD_PRED; + case 7: + return B_HU_PRED; + default: + assert(0); + } +} +#elif CONTEXT_PRED_REPLACEMENTS == 4 +B_PREDICTION_MODE vp9_find_dominant_direction( + unsigned char *ptr, int stride, int n) { + int g[8], i, imin, imax; + g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); + g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); + g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); + g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); + imin = 1; + for (i = 3; i < 8; i+=2) + imin = (g[i] < g[imin] ? i : imin); + imax = 1; + for (i = 3; i < 8; i+=2) + imax = (g[i] > g[imax] ? i : imax); + /* + printf("%d %d %d %d = %d %d\n", + g[1], g[3], g[5], g[7], imin, imax); + */ + switch (imin) { + case 1: + return B_HD_PRED; + case 3: + return B_VR_PRED; + case 5: + return B_VL_PRED; + case 7: + return B_HU_PRED; + default: + assert(0); + } +} +#elif CONTEXT_PRED_REPLACEMENTS == 0 +B_PREDICTION_MODE vp9_find_dominant_direction( + unsigned char *ptr, int stride, int n) { + int g[8], i, imin, imin2, imax; + g[0] = find_grad_measure(ptr, stride, n, 4, 1, 0); + g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); + g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1); + g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); + g[4] = find_grad_measure(ptr, stride, n, 4, 0, 1); + g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); + g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1); + g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); + imax = 0; + for (i = 1; i < 8; i++) + imax = (g[i] > g[imax] ? i : imax); + imin = 0; + for (i = 1; i < 8; i++) + imin = (g[i] < g[imin] ? 
i : imin); + + switch (imin) { + case 0: + return B_HE_PRED; + case 1: + return B_HD_PRED; + case 2: + return B_RD_PRED; + case 3: + return B_VR_PRED; + case 4: + return B_VE_PRED; + case 5: + return B_VL_PRED; + case 6: + return B_LD_PRED; + case 7: + return B_HU_PRED; + default: + assert(0); + } +} +#endif + +B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) { + unsigned char *ptr = *(x->base_dst) + x->dst; + int stride = x->dst_stride; + return vp9_find_dominant_direction(ptr, stride, 4); +} +#endif + +void vp9_intra4x4_predict(BLOCKD *x, + int b_mode, + unsigned char *predictor) { + int i, r, c; + + unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride; + unsigned char Left[4]; + unsigned char top_left = Above[-1]; + + Left[0] = (*(x->base_dst))[x->dst - 1]; + Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride]; + Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride]; + Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride]; + +#if CONFIG_NEWBINTRAMODES + if (b_mode == B_CONTEXT_PRED) + b_mode = x->bmi.as_mode.context; +#endif + + switch (b_mode) { + case B_DC_PRED: { + int expected_dc = 0; + + for (i = 0; i < 4; i++) { + expected_dc += Above[i]; + expected_dc += Left[i]; + } + + expected_dc = (expected_dc + 4) >> 3; + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) { + predictor[c] = expected_dc; + } + + predictor += 16; + } + } + break; + case B_TM_PRED: { + /* prediction similar to true_motion prediction */ + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) { + int pred = Above[c] - top_left + Left[r]; + + if (pred < 0) + pred = 0; + + if (pred > 255) + pred = 255; + + predictor[c] = pred; + } + + predictor += 16; + } + } + break; + + case B_VE_PRED: { + + unsigned int ap[4]; + ap[0] = Above[0]; + ap[1] = Above[1]; + ap[2] = Above[2]; + ap[3] = Above[3]; + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) { + + predictor[c] = ap[c]; + } + + predictor += 16; + } + + } + break; + + + case B_HE_PRED: { + + unsigned int lp[4]; + lp[0] = Left[0]; + lp[1] = Left[1]; + lp[2] = Left[2]; + lp[3] = Left[3]; + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) { + predictor[c] = lp[r]; + } + + predictor += 16; + } + } + break; + case B_LD_PRED: { + unsigned char *ptr = Above; + predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; + predictor[0 * 16 + 1] = + predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; + predictor[0 * 16 + 2] = + predictor[1 * 16 + 1] = + predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2; + predictor[0 * 16 + 3] = + predictor[1 * 16 + 2] = + predictor[2 * 16 + 1] = + predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2; + predictor[1 * 16 + 3] = + predictor[2 * 16 + 2] = + predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; + predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; + + } + break; + case B_RD_PRED: { + + unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[3 * 16 + 1] = + predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[3 * 16 + 2] = + predictor[2 * 16 + 1] = + predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * 16 + 3] = + predictor[2 * 16 + 2] = + predictor[1 * 16 + 1] = + predictor[0 
* 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[1 * 16 + 2] = + predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[1 * 16 + 3] = + predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + + } + break; + case B_VR_PRED: { + + unsigned char pp[9]; + + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + + predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[3 * 16 + 1] = + predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 1] = + predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1; + predictor[3 * 16 + 2] = + predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1; + predictor[3 * 16 + 3] = + predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + predictor[2 * 16 + 3] = + predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1; + predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1; + + } + break; + case B_VL_PRED: { + + unsigned char *pp = Above; + + predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * 16 + 0] = + predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1; + predictor[1 * 16 + 1] = + predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 1] = + predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1; + predictor[3 * 16 + 1] = + predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[0 * 16 + 3] = + predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + case B_HD_PRED: { + unsigned char pp[9]; + pp[0] = Left[3]; + pp[1] = Left[2]; + pp[2] = Left[1]; + pp[3] = Left[0]; + pp[4] = top_left; + pp[5] = Above[0]; + pp[6] = Above[1]; + pp[7] = Above[2]; + pp[8] = Above[3]; + + + predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[2 * 16 + 0] = + predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1; + predictor[2 * 16 + 1] = + predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[2 * 16 + 3] = + predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2; + predictor[1 * 16 + 2] = + predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1; + predictor[1 * 16 + 3] = + predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2; + predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2; + predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2; + } + break; + + + case B_HU_PRED: { + unsigned char *pp = Left; + predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1; + predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2; + predictor[0 * 16 + 2] = + predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1; + predictor[0 * 16 + 3] = + predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2; + predictor[1 * 16 + 2] = + predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1; + predictor[1 * 
16 + 3] = + predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2; + predictor[2 * 16 + 2] = + predictor[2 * 16 + 3] = + predictor[3 * 16 + 0] = + predictor[3 * 16 + 1] = + predictor[3 * 16 + 2] = + predictor[3 * 16 + 3] = pp[3]; + } + break; + +#if CONFIG_NEWBINTRAMODES + case B_CONTEXT_PRED: + break; + /* + case B_CORNER_PRED: + corner_predictor(predictor, 16, 4, Above, Left); + break; + */ +#endif + } +} + +#if CONFIG_COMP_INTRA_PRED +void vp9_comp_intra4x4_predict_c(BLOCKD *x, + int b_mode, int b_mode2, + unsigned char *out_predictor) { + unsigned char predictor[2][4 * 16]; + int i, j; + + vp9_intra4x4_predict(x, b_mode, predictor[0]); + vp9_intra4x4_predict(x, b_mode2, predictor[1]); + + for (i = 0; i < 16 * 4; i += 16) { + for (j = i; j < i + 4; j++) { + out_predictor[j] = (predictor[0][j] + predictor[1][j] + 1) >> 1; + } + } +} +#endif + +/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and + * to the right prediction have filled in pixels to use. + */ +void vp9_intra_prediction_down_copy(MACROBLOCKD *xd) { + int extend_edge = (xd->mb_to_right_edge == 0 && xd->mb_index < 2); + unsigned char *above_right = *(xd->block[0].base_dst) + xd->block[0].dst - + xd->block[0].dst_stride + 16; + unsigned int *src_ptr = (unsigned int *) + (above_right - (xd->mb_index == 3 ? 16 * xd->block[0].dst_stride : 0)); + + unsigned int *dst_ptr0 = (unsigned int *)above_right; + unsigned int *dst_ptr1 = + (unsigned int *)(above_right + 4 * xd->block[0].dst_stride); + unsigned int *dst_ptr2 = + (unsigned int *)(above_right + 8 * xd->block[0].dst_stride); + unsigned int *dst_ptr3 = + (unsigned int *)(above_right + 12 * xd->block[0].dst_stride); + + if (extend_edge) { + *src_ptr = ((uint8_t *) src_ptr)[-1] * 0x01010101U; + } + + *dst_ptr0 = *src_ptr; + *dst_ptr1 = *src_ptr; + *dst_ptr2 = *src_ptr; + *dst_ptr3 = *src_ptr; +} diff --git a/vp9/common/vp9_reconintra4x4.h b/vp9/common/vp9_reconintra4x4.h new file mode 100644 index 000000000..79a048076 --- /dev/null +++ b/vp9/common/vp9_reconintra4x4.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_RECONINTRA4x4_H +#define __INC_RECONINTRA4x4_H + +extern void vp9_intra_prediction_down_copy(MACROBLOCKD *xd); + +#endif diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c new file mode 100644 index 000000000..277d5b217 --- /dev/null +++ b/vp9/common/vp9_rtcd.c @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
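All of the diagonal 4x4 modes above (B_VR_PRED, B_VL_PRED, B_HD_PRED, B_HU_PRED) are assembled from the same two rounding averages. A minimal standalone sketch of those kernels (the helper names avg2/avg3 are illustrative, not from this patch):

    #include <stdio.h>

    /* 2-tap average with round-to-nearest: (a + b + 1) >> 1 */
    static unsigned char avg2(int a, int b) {
      return (unsigned char)((a + b + 1) >> 1);
    }

    /* 3-tap [1 2 1]/4 smoothing filter: (a + 2b + c + 2) >> 2 */
    static unsigned char avg3(int a, int b, int c) {
      return (unsigned char)((a + 2 * b + c + 2) >> 2);
    }

    int main(void) {
      /* e.g. B_VL_PRED writes avg2(Above[0], Above[1]) to predictor[0*16+0] */
      printf("%d %d\n", avg2(100, 101), avg3(100, 110, 120));  /* 101 110 */
      return 0;
    }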
+ */ +#include "vpx_config.h" +#define RTCD_C +#include "vp9_rtcd.h" +#include "vpx_ports/vpx_once.h" + +extern void vpx_scale_rtcd(void); + +void vp9_rtcd() +{ + vpx_scale_rtcd(); + once(setup_rtcd_internal); +} diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh new file mode 100644 index 000000000..ea134a854 --- /dev/null +++ b/vp9/common/vp9_rtcd_defs.sh @@ -0,0 +1,689 @@ +vp9_common_forward_decls() { +cat <segmentation_enabled && + (xd->segment_feature_mask[segment_id] & + (0x01 << feature_id))); +} + +void vp9_clearall_segfeatures(MACROBLOCKD *xd) { + vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); + vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask)); +} + +void vp9_enable_segfeature(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id) { + xd->segment_feature_mask[segment_id] |= (0x01 << feature_id); +} + +void vp9_disable_segfeature(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id) { + xd->segment_feature_mask[segment_id] &= ~(1 << feature_id); +} + +int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id) { + return seg_feature_data_bits[feature_id]; +} + +int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { + return (segfeaturedata_signed[feature_id]); +} + +void vp9_clear_segdata(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id) { + xd->segment_feature_data[segment_id][feature_id] = 0; +} + +void vp9_set_segdata(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id, + int seg_data) { + xd->segment_feature_data[segment_id][feature_id] = seg_data; +} + +int vp9_get_segdata(const MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id) { + return xd->segment_feature_data[segment_id][feature_id]; +} + +void vp9_clear_segref(MACROBLOCKD *xd, int segment_id) { + xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] = 0; +} + +void vp9_set_segref(MACROBLOCKD *xd, + int segment_id, + MV_REFERENCE_FRAME ref_frame) { + xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] |= + (1 << ref_frame); +} + +int vp9_check_segref(const MACROBLOCKD *xd, + int segment_id, + MV_REFERENCE_FRAME ref_frame) { + return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] & + (1 << ref_frame)) ? 1 : 0; +} + +int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id) { + return (xd->segment_feature_data[segment_id][SEG_LVL_REF_FRAME] & + ~(1 << INTRA_FRAME)) ? 1 : 0; +} + +int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id) { + if (vp9_segfeature_active(xd, segment_id, SEG_LVL_TRANSFORM)) + return vp9_get_segdata(xd, segment_id, SEG_LVL_TRANSFORM); + else + return TX_4X4; +} +// TBD? Functions to read and write segment data with range / validity checking diff --git a/vp9/common/vp9_seg_common.h b/vp9/common/vp9_seg_common.h new file mode 100644 index 000000000..fb0570c6b --- /dev/null +++ b/vp9/common/vp9_seg_common.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9_type_aliases.h" +#include "vp9_onyxc_int.h" +#include "vp9/common/vp9_blockd.h" + +#ifndef __INC_SEG_COMMON_H__ +#define __INC_SEG_COMMON_H__ 1 + +int vp9_segfeature_active(const MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +void vp9_clearall_segfeatures(MACROBLOCKD *xd); + +void vp9_enable_segfeature(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +void vp9_disable_segfeature(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +int vp9_seg_feature_data_bits(SEG_LVL_FEATURES feature_id); + +int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id); + +void vp9_clear_segdata(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +void vp9_set_segdata(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id, + int seg_data); + +int vp9_get_segdata(const MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +void vp9_clear_segref(MACROBLOCKD *xd, int segment_id); + +void vp9_set_segref(MACROBLOCKD *xd, + int segment_id, + MV_REFERENCE_FRAME ref_frame); + +int vp9_check_segref(const MACROBLOCKD *xd, + int segment_id, + MV_REFERENCE_FRAME ref_frame); + +int vp9_check_segref_inter(MACROBLOCKD *xd, int segment_id); + +int vp9_get_seg_tx_type(MACROBLOCKD *xd, int segment_id); + +#endif /* __INC_SEG_COMMON_H__ */ + diff --git a/vp9/common/vp9_setupintrarecon.c b/vp9/common/vp9_setupintrarecon.c new file mode 100644 index 000000000..3fe4e3e50 --- /dev/null +++ b/vp9/common/vp9_setupintrarecon.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9_setupintrarecon.h" +#include "vpx_mem/vpx_mem.h" + +void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf) { + int i; + + /* set up frame new frame for intra coded blocks */ + vpx_memset(ybf->y_buffer - 1 - ybf->y_stride, 127, ybf->y_width + 5); + for (i = 0; i < ybf->y_height; i++) + ybf->y_buffer[ybf->y_stride * i - 1] = (unsigned char) 129; + + vpx_memset(ybf->u_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + for (i = 0; i < ybf->uv_height; i++) + ybf->u_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129; + + vpx_memset(ybf->v_buffer - 1 - ybf->uv_stride, 127, ybf->uv_width + 5); + for (i = 0; i < ybf->uv_height; i++) + ybf->v_buffer[ybf->uv_stride * i - 1] = (unsigned char) 129; + +} diff --git a/vp9/common/vp9_setupintrarecon.h b/vp9/common/vp9_setupintrarecon.h new file mode 100644 index 000000000..1a55d0ad6 --- /dev/null +++ b/vp9/common/vp9_setupintrarecon.h @@ -0,0 +1,13 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include "vpx_scale/yv12config.h"
+extern void vp9_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
diff --git a/vp9/common/vp9_subpelvar.h b/vp9/common/vp9_subpelvar.h
new file mode 100644
index 000000000..b3c3fcdaf
--- /dev/null
+++ b/vp9/common/vp9_subpelvar.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp9/common/vp9_filter.h"
+
+
+
+static void variance(const unsigned char *src_ptr,
+                     int source_stride,
+                     const unsigned char *ref_ptr,
+                     int recon_stride,
+                     int w,
+                     int h,
+                     unsigned int *sse,
+                     int *sum) {
+  int i, j;
+  int diff;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      diff = src_ptr[j] - ref_ptr[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    src_ptr += source_stride;
+    ref_ptr += recon_stride;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr            : Pointer to source block.
+ *                  UINT32  src_pixels_per_line : Stride of input block.
+ *                  UINT32  pixel_step         : Offset between filter input samples (see notes).
+ *                  UINT32  output_height      : Input block height.
+ *                  UINT32  output_width       : Input block width.
+ *                  INT16  *vp9_filter         : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *output_ptr         : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement first-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for next pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass(const unsigned char *src_ptr,
+                                              unsigned short *output_ptr,
+                                              unsigned int src_pixels_per_line,
+                                              int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const short *vp9_filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      // Apply bilinear filter
+      output_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
+                       ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+                       (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : var_filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr            : Pointer to source block.
+ *                  UINT32  src_pixels_per_line : Stride of input block.
+ *                  UINT32  pixel_step         : Offset between filter input samples (see notes).
+ *                  UINT32  output_height      : Input block height.
+ *                  UINT32  output_width       : Input block width.
+ *                  INT16  *vp9_filter         : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8  *output_ptr         : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement second-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by
+ *                  var_filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass(const unsigned short *src_ptr,
+                                               unsigned char *output_ptr,
+                                               unsigned int src_pixels_per_line,
+                                               unsigned int pixel_step,
+                                               unsigned int output_height,
+                                               unsigned int output_width,
+                                               const short *vp9_filter) {
+  unsigned int i, j;
+  int Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      // Apply filter
+      Temp = ((int)src_ptr[0] * vp9_filter[0]) +
+             ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+             (VP9_FILTER_WEIGHT / 2);
+      output_ptr[j] = (unsigned char)(Temp >> VP9_FILTER_SHIFT);
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr    += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
diff --git a/vp9/common/vp9_subpixel.h b/vp9/common/vp9_subpixel.h
new file mode 100644
index 000000000..2b8429198
--- /dev/null
+++ b/vp9/common/vp9_subpixel.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_H
+#define SUBPIXEL_H
+
+#define prototype_subpixel_predict(sym) \
+  void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
+           unsigned char *dst, int dst_pitch)
+
+typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
+
+#endif
diff --git a/vp9/common/vp9_swapyv12buffer.c b/vp9/common/vp9_swapyv12buffer.c
new file mode 100644
index 000000000..7882e0be0
--- /dev/null
+++ b/vp9/common/vp9_swapyv12buffer.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
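The two-pass structure documented above is easiest to see with concrete numbers. A sketch assuming VP9_FILTER_WEIGHT = 128 and VP9_FILTER_SHIFT = 7 (the usual values in the common filter code), interpolating the center of a 2x2 patch with half-pel taps {64, 64}:

    #include <stdio.h>

    int main(void) {
      const short taps[2] = { 64, 64 };  /* half-pel: both taps, summing to 128 */
      unsigned char row0[2] = { 10, 20 }, row1[2] = { 30, 40 };

      /* first pass: horizontal filter, kept in 16 bits to retain precision */
      unsigned short h0 = (row0[0] * taps[0] + row0[1] * taps[1] + 64) >> 7;  /* 15 */
      unsigned short h1 = (row1[0] * taps[0] + row1[1] * taps[1] + 64) >> 7;  /* 35 */

      /* second pass: vertical filter over the first-pass output */
      unsigned char out = (h0 * taps[0] + h1 * taps[1] + 64) >> 7;            /* 25 */

      printf("%u\n", out);  /* 25, the bilinear value at the patch center */
      return 0;
    }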
+ */ + +#include "vp9_swapyv12buffer.h" + +void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *last_frame) { + unsigned char *temp; + + temp = last_frame->buffer_alloc; + last_frame->buffer_alloc = new_frame->buffer_alloc; + new_frame->buffer_alloc = temp; + + temp = last_frame->y_buffer; + last_frame->y_buffer = new_frame->y_buffer; + new_frame->y_buffer = temp; + + temp = last_frame->u_buffer; + last_frame->u_buffer = new_frame->u_buffer; + new_frame->u_buffer = temp; + + temp = last_frame->v_buffer; + last_frame->v_buffer = new_frame->v_buffer; + new_frame->v_buffer = temp; +} diff --git a/vp9/common/vp9_swapyv12buffer.h b/vp9/common/vp9_swapyv12buffer.h new file mode 100644 index 000000000..44ed5e84d --- /dev/null +++ b/vp9/common/vp9_swapyv12buffer.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef __SWAPYV12_BUFFER_H +#define __SWAPYV12_BUFFER_H + +#include "vpx_scale/yv12config.h" + +void vp9_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, + YV12_BUFFER_CONFIG *last_frame); + +#endif // __SWAPYV12_BUFFER_H diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h new file mode 100644 index 000000000..5d778bcd0 --- /dev/null +++ b/vp9/common/vp9_systemdependent.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#if ARCH_X86 || ARCH_X86_64 +void vpx_reset_mmx_state(void); +#define vp9_clear_system_state() vpx_reset_mmx_state() +#else +#define vp9_clear_system_state() +#endif + +struct VP9Common; +void vp9_machine_specific_config(struct VP9Common *); diff --git a/vp9/common/vp9_tapify.py b/vp9/common/vp9_tapify.py new file mode 100644 index 000000000..99529cff0 --- /dev/null +++ b/vp9/common/vp9_tapify.py @@ -0,0 +1,106 @@ +""" + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+""" +#!/usr/bin/env python +import sys,string,os,re,math,numpy +scale = 2**16 +def dist(p1,p2): + x1,y1 = p1 + x2,y2 = p2 + if x1==x2 and y1==y2 : + return 1.0 + return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2)) + +def gettaps(p): + def l(b): + return int(math.floor(b)) + def h(b): + return int(math.ceil(b)) + def t(b,p,s): + return int((scale*dist(b,p)+s/2)/s) + r,c = p + ul=[l(r),l(c)] + ur=[l(r),h(c)] + ll=[h(r),l(c)] + lr=[h(r),h(c)] + sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p) + t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum); + return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)], + [ll,t(ll,p,sum)],[lr,t4]] + +def print_mb_taps(angle,blocksize): + theta = angle / 57.2957795; + affine = [[math.cos(theta),-math.sin(theta)], + [math.sin(theta),math.cos(theta)]] + radius = (float(blocksize)-1)/2 + print " // angle of",angle,"degrees" + for y in range(blocksize) : + for x in range(blocksize) : + r,c = numpy.dot(affine,[y-radius, x-radius]) + tps = gettaps([r+radius,c+radius]) + for t in tps : + p,t = t + tr,tc = p + print " %2d, %2d, %5d, " % (tr,tc,t,), + print " // %2d,%2d " % (y,x) + +i=float(sys.argv[1]) +while i <= float(sys.argv[2]) : + print_mb_taps(i,float(sys.argv[4])) + i=i+float(sys.argv[3]) +""" + +taps = [] +pt=dict() +ptr=dict() +for y in range(16) : + for x in range(16) : + r,c = numpy.dot(affine,[y-7.5, x-7.5]) + tps = gettaps([r+7.5,c+7.5]) + j=0 + for tp in tps : + p,i = tp + r,c = p + pt[y,x,j]= [p,i] + try: + ptr[r,j,c].append([y,x]) + except: + ptr[r,j,c]=[[y,x]] + j = j+1 + +for key in sorted(pt.keys()) : + print key,pt[key] + +lr = -99 +lj = -99 +lc = 0 + +shuf="" +mask="" +for r,j,c in sorted(ptr.keys()) : + for y,x in ptr[r,j,c] : + if lr != r or lj != j : + print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc + shuf="" + lc = 0 + for i in range(lc,c-1) : + shuf = shuf +"0" + shuf = shuf + hex(x)[2] + lc =c + break + lr = r + lj = j +# print r,j,c,ptr[r,j,c] +# print + +for r,j,c in sorted(ptr.keys()) : + for y,x in ptr[r,j,c] : + print r,j,c,y,x + break +""" diff --git a/vp9/common/vp9_textblit.c b/vp9/common/vp9_textblit.c new file mode 100644 index 000000000..52c6b87c6 --- /dev/null +++ b/vp9/common/vp9_textblit.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "vp9/common/vp9_textblit.h" + +void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) { + int letter_bitmap; + unsigned char *output_pos = address; + int colpos; + const int font[] = { + 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000, + 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110, + 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA, + 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20, + 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF, + 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F, + 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2, + 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731, + 0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820 + }; + colpos = 0; + + while (msg[colpos] != 0) { + char letter = msg[colpos]; + int fontcol, fontrow; + + if (letter <= 'Z' && letter >= ' ') + letter_bitmap = font[letter - ' ']; + else if (letter <= 'z' && letter >= 'a') + letter_bitmap = font[letter - 'a' + 'A' - ' ']; + else + letter_bitmap = font[0]; + + for (fontcol = 6; fontcol >= 0; fontcol--) + for (fontrow = 0; fontrow < 5; fontrow++) + output_pos[fontrow * pitch + fontcol] = + ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0); + + output_pos += 7; + colpos++; + } +} + +static void plot(const int x, const int y, unsigned char *image, const int pitch) { + image [x + y * pitch] ^= 255; +} + +/* Bresenham line algorithm */ +void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, const int pitch) { + int steep = abs(y1 - y0) > abs(x1 - x0); + int deltax, deltay; + int error, ystep, y, x; + + if (steep) { + int t; + t = x0; + x0 = y0; + y0 = t; + + t = x1; + x1 = y1; + y1 = t; + } + + if (x0 > x1) { + int t; + t = x0; + x0 = x1; + x1 = t; + + t = y0; + y0 = y1; + y1 = t; + } + + deltax = x1 - x0; + deltay = abs(y1 - y0); + error = deltax / 2; + + y = y0; + + if (y0 < y1) + ystep = 1; + else + ystep = -1; + + if (steep) { + for (x = x0; x <= x1; x++) { + plot(y, x, image, pitch); + + error = error - deltay; + if (error < 0) { + y = y + ystep; + error = error + deltax; + } + } + } else { + for (x = x0; x <= x1; x++) { + plot(x, y, image, pitch); + + error = error - deltay; + if (error < 0) { + y = y + ystep; + error = error + deltax; + } + } + } +} diff --git a/vp9/common/vp9_textblit.h b/vp9/common/vp9_textblit.h new file mode 100644 index 000000000..39edbb09d --- /dev/null +++ b/vp9/common/vp9_textblit.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef __INC_TEXTBLIT_H +#define __INC_TEXTBLIT_H + +extern void vp9_blit_text(const char *msg, unsigned char *address, + const int pitch); +extern void vp9_blit_line(int x0, int x1, int y0, int y1, + unsigned char *image, const int pitch); + +#endif // __INC_TEXTBLIT_H diff --git a/vp9/common/vp9_treecoder.c b/vp9/common/vp9_treecoder.c new file mode 100644 index 000000000..096dd59e5 --- /dev/null +++ b/vp9/common/vp9_treecoder.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2010 The WebM project authors. 
All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" + +#if defined(CONFIG_DEBUG) && CONFIG_DEBUG +#include +#endif +#include + +#include "vp9_treecoder.h" + +static void tree2tok( + struct vp9_token_struct *const p, + vp9_tree t, + int i, + int v, + int L +) { + v += v; + ++L; + + do { + const vp9_tree_index j = t[i++]; + + if (j <= 0) { + p[-j].value = v; + p[-j].Len = L; + } else + tree2tok(p, t, j, v, L); + } while (++v & 1); +} + +void vp9_tokens_from_tree(struct vp9_token_struct *p, vp9_tree t) { + tree2tok(p, t, 0, 0, 0); +} + +void vp9_tokens_from_tree_offset(struct vp9_token_struct *p, vp9_tree t, + int offset) { + tree2tok(p - offset, t, 0, 0, 0); +} + +static void branch_counts( + int n, /* n = size of alphabet */ + vp9_token tok [ /* n */ ], + vp9_tree tree, + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ] +) { + const int tree_len = n - 1; + int t = 0; + +#if CONFIG_DEBUG + assert(tree_len); +#endif + + do { + branch_ct[t][0] = branch_ct[t][1] = 0; + } while (++t < tree_len); + + t = 0; + + do { + int L = tok[t].Len; + const int enc = tok[t].value; + const unsigned int ct = num_events[t]; + + vp9_tree_index i = 0; + + do { + const int b = (enc >> --L) & 1; + const int j = i >> 1; +#if CONFIG_DEBUG + assert(j < tree_len && 0 <= L); +#endif + + branch_ct [j] [b] += ct; + i = tree[ i + b]; + } while (i > 0); + +#if CONFIG_DEBUG + assert(!L); +#endif + } while (++t < n); + +} + + +void vp9_tree_probs_from_distribution( + int n, /* n = size of alphabet */ + vp9_token tok [ /* n */ ], + vp9_tree tree, + vp9_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + unsigned int Pfac, + int rd +) { + const int tree_len = n - 1; + int t = 0; + + branch_counts(n, tok, tree, branch_ct, num_events); + + do { + const unsigned int *const c = branch_ct[t]; + const unsigned int tot = c[0] + c[1]; + +#if CONFIG_DEBUG + assert(tot < (1 << 24)); /* no overflow below */ +#endif + + if (tot) { + const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot; + probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */ + } else + probs[t] = vp9_prob_half; + } while (++t < tree_len); +} + +vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]) { + int tot_count = counts[0] + counts[1]; + vp9_prob prob; + if (tot_count) { + prob = (counts[0] * 255 + (tot_count >> 1)) / tot_count; + prob += !prob; + } else { + prob = 128; + } + return prob; +} diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h new file mode 100644 index 000000000..92b92ef55 --- /dev/null +++ b/vp9/common/vp9_treecoder.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
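For concreteness, vp9_bin_prob_from_distribution() above maps a pair of 0/1 branch counts onto the 1..255 probability scale. A standalone restatement with worked numbers (sketch, mirroring the logic above):

    #include <stdio.h>

    static unsigned char bin_prob(const unsigned int c[2]) {
      unsigned int tot = c[0] + c[1];
      unsigned int p;
      if (!tot) return 128;                 /* no data: assume 50/50 */
      p = (c[0] * 255 + (tot >> 1)) / tot;  /* round to nearest */
      return (unsigned char)(p + !p);       /* clamp 0 up to 1 */
    }

    int main(void) {
      const unsigned int counts[2] = { 60, 40 };
      /* (60*255 + 50) / 100 = 153, i.e. P(bit == 0) of 0.6 on a 255 scale */
      printf("%u\n", bin_prob(counts));  /* prints 153 */
      return 0;
    }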
+ */
+
+
+#ifndef __INC_TREECODER_H
+#define __INC_TREECODER_H
+
+typedef unsigned char vp9_prob;
+
+#define vp9_prob_half ( (vp9_prob) 128)
+
+typedef signed char vp9_tree_index;
+struct bool_coder_spec;
+
+typedef struct bool_coder_spec bool_coder_spec;
+typedef struct bool_writer bool_writer;
+typedef struct bool_reader bool_reader;
+
+typedef const bool_coder_spec c_bool_coder_spec;
+typedef const bool_writer c_bool_writer;
+typedef const bool_reader c_bool_reader;
+
+
+
+# define vp9_complement( x) (255 - x)
+
+
+/* We build coding trees compactly in arrays.
+   Each node of the tree is a pair of vp9_tree_indices.
+   Array index often references a corresponding probability table.
+   Index <= 0 means done encoding/decoding and value = -Index,
+   Index > 0 means need another bit, specification at index.
+   Nonnegative indices are always even; processing begins at node 0. */
+
+typedef const vp9_tree_index vp9_tree[], *vp9_tree_p;
+
+
+typedef const struct vp9_token_struct {
+  int value;
+  int Len;
+} vp9_token;
+
+/* Construct encoding array from tree. */
+
+void vp9_tokens_from_tree(struct vp9_token_struct *, vp9_tree);
+void vp9_tokens_from_tree_offset(struct vp9_token_struct *, vp9_tree,
+                                 int offset);
+
+
+/* Convert array of token occurrence counts into a table of probabilities
+   for the associated binary encoding tree. Also writes count of branches
+   taken for each node on the tree; this facilitates decisions as to
+   probability updates. */
+
+void vp9_tree_probs_from_distribution(
+  int n,                          /* n = size of alphabet */
+  vp9_token tok[ /* n */ ],
+  vp9_tree tree,
+  vp9_prob probs[ /* n-1 */ ],
+  unsigned int branch_ct[ /* n-1 */ ][2],
+  const unsigned int num_events[ /* n */ ],
+  unsigned int Pfactor,
+  int Round
+);
+
+static __inline int clip_prob(int p) {
+  if (p > 255)
+    return 255;
+  else if (p < 1)
+    return 1;
+  return p;
+}
+
+vp9_prob vp9_bin_prob_from_distribution(const unsigned int counts[2]);
+
+#endif
diff --git a/vp9/common/vp9_type_aliases.h b/vp9/common/vp9_type_aliases.h
new file mode 100644
index 000000000..91939ee19
--- /dev/null
+++ b/vp9/common/vp9_type_aliases.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
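The tree-array convention described in vp9_treecoder.h above is compact but terse; a small worked example may help (illustrative only, not from the patch). A 3-symbol alphabet {A, B, C}, where node 0 splits A from {B, C}:

    typedef signed char vp9_tree_index;  /* as in the header above */

    static const vp9_tree_index abc_tree[4] = {
      -0,  2,   /* node 0: bit 0 -> leaf A (value 0), bit 1 -> go to node 2 */
      -1, -2    /* node 2: bit 0 -> leaf B (value 1), bit 1 -> leaf C (value 2) */
    };

    /* vp9_tokens_from_tree(tok, abc_tree) fills the encoding table:
     *   tok[0] = { value 0, Len 1 }   A encodes as "0"
     *   tok[1] = { value 2, Len 2 }   B encodes as "10"
     *   tok[2] = { value 3, Len 2 }   C encodes as "11"
     */

Leaves are negated symbol values (so index <= 0 terminates), interior nodes are positive even indices, and each node's probability table entry is indexed by node/2, which is what branch_counts() exploits with `j = i >> 1`.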
+ */ + + +/**************************************************************************** +* +* Module Title : vp9_type_aliases.h +* +* Description : Standard type aliases +* +****************************************************************************/ +#ifndef __INC_TYPE_ALIASES_H +#define __INC_TYPE_ALIASES_H + +/**************************************************************************** +* Macros +****************************************************************************/ +#define EXPORT +#define IMPORT extern /* Used to declare imported data & routines */ +#define PRIVATE static /* Used to declare & define module-local data */ +#define LOCAL static /* Used to define all persistent routine-local data */ +#define STD_IN_PATH 0 /* Standard input path */ +#define STD_OUT_PATH 1 /* Standard output path */ +#define STD_ERR_PATH 2 /* Standard error path */ +#define STD_IN_FILE stdin /* Standard input file pointer */ +#define STD_OUT_FILE stdout /* Standard output file pointer */ +#define STD_ERR_FILE stderr /* Standard error file pointer */ +#define max_int 0x7FFFFFFF + +#define __export +#define _export + +#define CCONV + +#ifndef NULL +#ifdef __cplusplus +#define NULL 0 +#else +#define NULL ((void *)0) +#endif +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +/**************************************************************************** +* Typedefs +****************************************************************************/ +#ifndef TYPE_INT8 +#define TYPE_INT8 +typedef signed char INT8; +#endif + +#ifndef TYPE_INT16 +/*#define TYPE_INT16*/ +typedef signed short INT16; +#endif + +#ifndef TYPE_INT32 +/*#define TYPE_INT32*/ +typedef signed int INT32; +#endif + +#ifndef TYPE_UINT8 +/*#define TYPE_UINT8*/ +typedef unsigned char UINT8; +#endif + +#ifndef TYPE_UINT32 +/*#define TYPE_UINT32*/ +typedef unsigned int UINT32; +#endif + +#ifndef TYPE_UINT16 +/*#define TYPE_UINT16*/ +typedef unsigned short UINT16; +#endif + +#ifndef TYPE_BOOL +/*#define TYPE_BOOL*/ +typedef int BOOL; +#endif + +typedef unsigned char BOOLEAN; + +#ifdef _MSC_VER +typedef __int64 INT64; +#ifndef INT64_MAX +#define INT64_MAX LLONG_MAX +#endif +#else + +#ifndef TYPE_INT64 +#ifdef _TMS320C6X +/* for now we only have 40bits */ +typedef long INT64; +#else +typedef long long INT64; +#endif +#endif + +#endif + +/* Floating point */ +typedef double FLOAT64; +typedef float FLOAT32; + +#endif diff --git a/vp9/common/x86/filter_sse2.c b/vp9/common/x86/filter_sse2.c deleted file mode 100644 index 92d0a2e5b..000000000 --- a/vp9/common/x86/filter_sse2.c +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // for alignment checks -#include // SSE2 -#include "vp9/common/filter.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Add some comments, better variable naming. 
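The deleted SSE2/SSE4.1 code below is built almost entirely on one idiom: _mm_madd_epi16 multiplies eight 16-bit lanes and sums adjacent product pairs, so interleaving source pixels pairwise against duplicated tap pairs yields four 32-bit partial sums per instruction. This is exactly what the DO_FOUR_PIXELS macro exploits. A minimal self-contained sketch (the taps are made up and do not sum to VP9_FILTER_WEIGHT):

    #include <emmintrin.h>  /* SSE2 */
    #include <stdio.h>

    int main(void) {
      short src[8]  = { 1, 2, 3, 4, 5, 6, 7, 8 };
      short taps[8] = { -1, 3, -5, 25, 25, -5, 3, -1 };
      __m128i s = _mm_loadu_si128((const __m128i *)src);
      __m128i f = _mm_loadu_si128((const __m128i *)taps);
      /* four lanes of src[2k]*f[2k] + src[2k+1]*f[2k+1] */
      __m128i pairs = _mm_madd_epi16(s, f);
      int sums[4];
      _mm_storeu_si128((__m128i *)sums, pairs);
      /* a horizontal add completes the 8-tap dot product */
      printf("%d\n", sums[0] + sums[1] + sums[2] + sums[3]);  /* 198 */
      return 0;
    }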
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
-//           of positive taps above 128), or have higher precision filter
-//           coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-  VP9_FILTER_WEIGHT >> 1,
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, src_ptr, offset) \
-  { \
-  /* Do shifted loads to achieve the required shuffles through unpacking */ \
-  const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
-  const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
-  const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
-  const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
-  const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
-  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
-  const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
-  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
-  /* Shift by 4 bytes through shuffle to get additional shifted loads */ \
-  const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
-  const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
-  const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
-  const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
-  const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
-  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
-  const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
-  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
-  /* multiply accumulate them */ \
-  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
-  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
-  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
-  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
-  const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
-  const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
-  mad_all = _mm_add_epi32(mad_all, rounding); \
-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
-  }
-
-void vp9_filter_block2d_4x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
-  __m128i intermediateA, intermediateB, intermediateC;
-
-  const int kInterp_Extend = 4;
-
-  const __m128i zero = _mm_set1_epi16(0);
-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-
-  // check alignment
-  assert(0 == ((long)HFilter_aligned16)%16);
-  assert(0 == ((long)VFilter_aligned16)%16);
-
-  {
-    __m128i transpose3_0;
-    __m128i transpose3_1;
-    __m128i transpose3_2;
-    __m128i transpose3_3;
-
-    // Horizontal pass (src -> intermediate).
- { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB); - const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB); - const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC); - const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC); - // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73 - // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx - // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3); - const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3); - // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63 - // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73 - // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx - // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx - const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3); - const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx - // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx - transpose3_0 = _mm_castps_si128( - 
_mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(3, 2, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(3, 2, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). - { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - DECLARE_ALIGNED(16, unsigned char, temp[32]); - { - _mm_store_si128((__m128i *)temp, transpose3_0); - DO_FOUR_PIXELS(col0, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_1); - DO_FOUR_PIXELS(col1, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_2); - DO_FOUR_PIXELS(col2, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_3); - DO_FOUR_PIXELS(col3, temp, 0); - } - // transpose - { - __m128i T0 = _mm_unpacklo_epi32(col0, col1); - __m128i T1 = _mm_unpacklo_epi32(col2, col3); - __m128i T2 = _mm_unpackhi_epi32(col0, col1); - __m128i T3 = _mm_unpackhi_epi32(col2, col3); - col0 = _mm_unpacklo_epi64(T0, T1); - col1 = _mm_unpackhi_epi64(T0, T1); - col2 = _mm_unpacklo_epi64(T2, T3); - col3 = _mm_unpackhi_epi64(T2, T3); - } - // saturate to 8 bit - { - col0 = _mm_packs_epi32(col0, col0); - col0 = _mm_packus_epi16(col0, col0); - col1 = _mm_packs_epi32(col1, col1); - col1 = _mm_packus_epi16(col1, col1); - col2 = _mm_packs_epi32 (col2, col2); - col2 = _mm_packus_epi16(col2, col2); - col3 = _mm_packs_epi32 (col3, col3); - col3 = _mm_packus_epi16(col3, col3); - } - // store - { - *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void 
vp9_filter_block2d_16x16_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/filter_sse4.c b/vp9/common/x86/filter_sse4.c deleted file mode 100644 index 9efe9f44a..000000000 --- a/vp9/common/x86/filter_sse4.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include // for alignment checks -#include // SSE4.1 -#include "vp9/common/filter.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Reduce source size by using macros instead of current code -// duplication. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. - -DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = { - 0x00, 0x01, - 0x01, 0x02, - 0x02, 0x03, - 0x03, 0x04, - 0x02, 0x03, - 0x03, 0x04, - 0x04, 0x05, - 0x05, 0x06, -}; -DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = { - 0x04, 0x05, - 0x05, 0x06, - 0x06, 0x07, - 0x07, 0x08, - 0x06, 0x07, - 0x07, 0x08, - 0x08, 0x09, - 0x09, 0x0A, -}; -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; -DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = { - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, offset) \ - { \ - /*load pixels*/ \ - __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \ - /* extract the ones used for first column */ \ - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \ - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \ - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \ - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \ - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \ - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \ - /* multiply accumulate them */ \ - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - 
result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c); - const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). - { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose); - const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose); - const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx - const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(0, 
0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(1, 1, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(2, 2, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(3, 3, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). - { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - { - //load pixels - __m128i src = transpose3_0; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col0 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_1; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col1 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_2; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = 
_mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col2 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_3; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col3 = _mm_packus_epi16(mad_all, mad_all); - } - { - __m128i col01 = _mm_unpacklo_epi8(col0, col1); - __m128i col23 = _mm_unpacklo_epi8(col2, col3); - __m128i col0123 = _mm_unpacklo_epi16(col01, col23); - //TODO(cd): look into Ronald's comment: - // Future suggestion: I believe here, too, you can merge the - // packs_epi32() and pacus_epi16() for the 4 cols above, so that - // you get the data in a single register, and then use pshufb - // (shuffle_epi8()) instead of the unpacks here. Should be - // 2+3+2 instructions faster. 
- *((unsigned int *)&dst_ptr[dst_stride * 0]) = - _mm_extract_epi32(col0123, 0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = - _mm_extract_epi32(col0123, 1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = - _mm_extract_epi32(col0123, 2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = - _mm_extract_epi32(col0123, 3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/idct_x86.h b/vp9/common/x86/idct_x86.h deleted file mode 100644 index 297ab0d33..000000000 --- a/vp9/common/x86/idct_x86.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef IDCT_X86_H -#define IDCT_X86_H - -/* Note: - * - * This platform is commonly built for runtime CPU detection. 
If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_idct(vp9_short_idct4x4llm_1_mmx); -extern prototype_idct(vp9_short_idct4x4llm_mmx); -extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx); - -extern prototype_second_order(vp9_short_inv_walsh4x4_mmx); -extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_idct_idct1 -#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx - -#undef vp9_idct_idct16 -#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx - -#undef vp9_idct_idct1_scalar_add -#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx - -#undef vp9_idct_iwalsh16 -#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx - -#undef vp9_idct_iwalsh1 -#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_mmx - -#endif -#endif - -#if HAVE_SSE2 - -extern prototype_second_order(vp9_short_inv_walsh4x4_sse2); - -#if !CONFIG_RUNTIME_CPU_DETECT - -#undef vp9_idct_iwalsh16 -#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_sse2 - -#endif - -#endif - - - -#endif diff --git a/vp9/common/x86/idctllm_mmx.asm b/vp9/common/x86/idctllm_mmx.asm deleted file mode 100644 index 15e81addb..000000000 --- a/vp9/common/x86/idctllm_mmx.asm +++ /dev/null @@ -1,241 +0,0 @@ -; -; Copyright (c) 2012 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -align 16 -x_s1sqr2: times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: times 4 dw 0x4E7B -align 16 -pw_16: times 4 dw 16 - -SECTION .text - - -; /**************************************************************************** -; * Notes: -; * -; * This implementation makes use of 16 bit fixed point version of two multiply -; * constants: -; * 1. sqrt(2) * cos (pi/8) -; * 2. sqrt(2) * sin (pi/8) -; * Because the first constant is bigger than 1, to maintain the same 16 bit -; * fixed point precision as the second one, we use a trick of -; * x * a = x + x*(a-1) -; * so -; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). -; * -; * For the second constant, because of the 16bit version is 35468, which -; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative -; * number. 
-; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x -; * -; **************************************************************************/ - -INIT_MMX - -;void short_idct4x4llm_mmx(short *input, short *output, int pitch) -cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit - mova m0, [inpq +0] - mova m1, [inpq +8] - - mova m2, [inpq+16] - mova m3, [inpq+24] - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2] ; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1] ; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova m3, m5 ; 33 23 13 03 - - psubw m0, m2 ; b1= 0-2 - paddw m2, m2 ; - - mova m5, m1 - paddw m2, m0 ; a1 =0+2 - - pmulhw m5, [x_s1sqr2] ; - paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) - - mova m7, m3 ; - pmulhw m7, [x_c1sqr2less1] ; - - paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) - psubw m7, m5 ; c1 - - mova m5, m1 - mova m4, m3 - - pmulhw m5, [x_c1sqr2less1] - paddw m5, m1 - - pmulhw m3, [x_s1sqr2] - paddw m3, m4 - - paddw m3, m5 ; d1 - paddw m0, [pw_16] - - paddw m2, [pw_16] - mova m6, m2 ; a1 - - mova m4, m0 ; b1 - paddw m2, m3 ;0 - - paddw m4, m7 ;1 - psubw m0, m7 ;2 - - psubw m6, m3 ;3 - psraw m2, 5 - - psraw m0, 5 - psraw m4, 5 - - psraw m6, 5 - - mova m1, m2 ; 03 02 01 00 - mova m3, m4 ; 23 22 21 20 - - punpcklwd m1, m0 ; 11 01 10 00 - punpckhwd m2, m0 ; 13 03 12 02 - - punpcklwd m3, m6 ; 31 21 30 20 - punpckhwd m4, m6 ; 33 23 32 22 - - mova m0, m1 ; 11 01 10 00 - mova m5, m2 ; 13 03 12 02 - - punpckldq m0, m3 ; 30 20 10 00 - punpckhdq m1, m3 ; 31 21 11 01 - - punpckldq m2, m4 ; 32 22 12 02 - punpckhdq m5, m4 ; 33 23 13 03 - - mova [outq], m0 - - mova [outq+r2], m1 - mova [outq+pitq*2], m2 - - add outq, pitq - mova [outq+pitq*2], m5 - RET - -;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch) -cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit - movh m0, [inpq] - paddw m0, [pw_16] - psraw m0, 5 - punpcklwd m0, m0 - punpckldq m0, m0 - - mova [outq], m0 - mova [outq+pitq], m0 - - mova [outq+pitq*2], m0 - add r1, r2 - - mova [outq+pitq*2], m0 - RET - - -;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride) -cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride -%if ARCH_X86_64 - movsxd strideq, dword stridem -%else - mov strideq, stridem -%endif - pxor m0, m0 - - movh m5, in_dcq ; dc - paddw m5, [pw_16] - - psraw m5, 5 - - punpcklwd m5, m5 - punpckldq m5, m5 - - movh m1, [predq] - punpcklbw m1, m0 - paddsw m1, m5 - packuswb m1, m0 ; pack and unpack to saturate - movh [dstq], m1 - - movh m2, [predq+pitq] - punpcklbw m2, m0 - paddsw m2, m5 - packuswb m2, m0 ; pack and unpack to saturate - movh [dstq+strideq], m2 - - movh m3, [predq+2*pitq] - punpcklbw m3, m0 - paddsw m3, m5 - packuswb m3, m0 ; pack and unpack to saturate - movh [dstq+2*strideq], m3 - 
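The note at the top of idctllm_mmx.asm above is easy to verify in scalar code. A minimal sketch, not project code: 20091 (0x4E7B) and 35468 (0x8A8C) are this file's RODATA constants, and pmulhw computes the signed high half (x * k) >> 16.

    #include <stdint.h>

    /* x * sqrt(2)*cos(pi/8): the constant is > 1, so the table stores a-1
     * and the code adds x back, mirroring pmulhw [x_c1sqr2less1]; paddw. */
    static int16_t mul_c1sqr2(int16_t x) {
      return (int16_t)((((int32_t)x * 20091) >> 16) + x);
    }

    /* x * sqrt(2)*sin(pi/8): 35468 wraps to -30068 as a signed word, so the
     * signed high product comes out one x too small; the paddw restores it. */
    static int16_t mul_s1sqr2(int16_t x) {
      return (int16_t)((((int32_t)x * -30068) >> 16) + x);
    }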
- add dstq, strideq - add predq, pitq - movh m4, [predq+2*pitq] - punpcklbw m4, m0 - paddsw m4, m5 - packuswb m4, m0 ; pack and unpack to saturate - movh [dstq+2*strideq], m4 - RET - diff --git a/vp9/common/x86/idctllm_sse2.asm b/vp9/common/x86/idctllm_sse2.asm deleted file mode 100644 index daa572e01..000000000 --- a/vp9/common/x86/idctllm_sse2.asm +++ /dev/null @@ -1,712 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_idct_dequant_0_2x_sse2 -; ( -; short *qcoeff - 0 -; short *dequant - 1 -; unsigned char *pre - 2 -; unsigned char *dst - 3 -; int dst_stride - 4 -; int blk_stride - 5 -; ) - -global sym(vp9_idct_dequant_0_2x_sse2) -sym(vp9_idct_dequant_0_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - ; end prolog - - mov rdx, arg(1) ; dequant - mov rax, arg(0) ; qcoeff - - movd xmm4, [rax] - movd xmm5, [rdx] - - pinsrw xmm4, [rax+32], 4 - pinsrw xmm5, [rdx], 4 - - pmullw xmm4, xmm5 - - ; Zero out xmm5, for use unpacking - pxor xmm5, xmm5 - - ; clear coeffs - movd [rax], xmm5 - movd [rax+32], xmm5 -;pshufb - pshuflw xmm4, xmm4, 00000000b - pshufhw xmm4, xmm4, 00000000b - - mov rax, arg(2) ; pre - paddw xmm4, [GLOBAL(fours)] - - movsxd rcx, dword ptr arg(5) ; blk_stride - psraw xmm4, 3 - - movq xmm0, [rax] - movq xmm1, [rax+rcx] - movq xmm2, [rax+2*rcx] - lea rcx, [3*rcx] - movq xmm3, [rax+rcx] - - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - - mov rax, arg(3) ; dst - movsxd rdx, dword ptr arg(4) ; dst_stride - - ; Add to predict buffer - paddw xmm0, xmm4 - paddw xmm1, xmm4 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - - ; pack up before storing - packuswb xmm0, xmm5 - packuswb xmm1, xmm5 - packuswb xmm2, xmm5 - packuswb xmm3, xmm5 - - ; store blocks back out - movq [rax], xmm0 - movq [rax + rdx], xmm1 - - lea rax, [rax + 2*rdx] - - movq [rax], xmm2 - movq [rax + rdx], xmm3 - - ; begin epilog - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -global sym(vp9_idct_dequant_full_2x_sse2) -sym(vp9_idct_dequant_full_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; special case when 2 blocks have 0 or 1 coeffs - ; dc is set as first coeff, so no need to load qcoeff - mov rax, arg(0) ; qcoeff - mov rsi, arg(2) ; pre - mov rdi, arg(3) ; dst - movsxd rcx, dword ptr arg(5) ; blk_stride - - ; Zero out xmm7, for use unpacking - pxor xmm7, xmm7 - - mov rdx, arg(1) ; dequant - - ; note the transpose of xmm1 and xmm2, necessary for shuffle - ; to spit out sensicle data - movdqa xmm0, [rax] - movdqa xmm2, [rax+16] - movdqa xmm1, [rax+32] - movdqa xmm3, [rax+48] - - ; Clear out coeffs - movdqa [rax], xmm7 - movdqa [rax+16], xmm7 - movdqa [rax+32], xmm7 - movdqa [rax+48], xmm7 - - ; dequantize qcoeff buffer - pmullw xmm0, [rdx] - pmullw xmm2, [rdx+16] - pmullw xmm1, [rdx] - pmullw xmm3, [rdx+16] - - ; repack so block 0 row x and block 1 row x are together - movdqa xmm4, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm4, xmm1 - - pshufd xmm0, xmm0, 11011000b - pshufd xmm1, xmm4, 11011000b - - movdqa xmm4, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm4, xmm3 
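vp9_idct_dequant_0_2x_sse2 above is the early-out for blocks whose only nonzero coefficient is the DC: both IDCT passes collapse to one rounded constant added to the prediction. A scalar sketch for a single 4x4 block (hypothetical helper, not the project's API):

    static void idct_dequant_dc_only(const short *qcoeff, const short *dequant,
                                     const unsigned char *pre, int pre_stride,
                                     unsigned char *dst, int dst_stride) {
      const int dc = (qcoeff[0] * dequant[0] + 4) >> 3;  /* paddw fours ; psraw 3 */
      int r, c;
      for (r = 0; r < 4; r++)
        for (c = 0; c < 4; c++) {
          int v = pre[r * pre_stride + c] + dc;          /* paddw, then packuswb */
          dst[r * dst_stride + c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
    }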
- - pshufd xmm2, xmm2, 11011000b - pshufd xmm3, xmm4, 11011000b - - ; first pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 ; - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - - ; transpose for the second pass - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - ; second pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - paddw xmm0, [GLOBAL(fours)] - - paddw xmm2, [GLOBAL(fours)] - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - psraw xmm2, 3 - - psraw xmm0, 3 - psraw xmm4, 3 - - psraw xmm6, 3 - - ; transpose to save - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 
109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - pxor xmm7, xmm7 - - ; Load up predict blocks - movq xmm4, [rsi] - movq xmm5, [rsi+rcx] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm0, xmm4 - paddw xmm1, xmm5 - - movq xmm4, [rsi+2*rcx] - lea rcx, [3*rcx] - movq xmm5, [rsi+rcx] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm4 - paddw xmm3, xmm5 - -.finish: - - ; pack up before storing - packuswb xmm0, xmm7 - packuswb xmm1, xmm7 - packuswb xmm2, xmm7 - packuswb xmm3, xmm7 - - ; Load destination stride before writing out, - ; doesn't need to persist - movsxd rdx, dword ptr arg(4) ; dst_stride - - ; store blocks back out - movq [rdi], xmm0 - movq [rdi + rdx], xmm1 - - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm2 - movq [rdi + rdx], xmm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_idct_dequant_dc_0_2x_sse2 -; ( -; short *qcoeff - 0 -; short *dequant - 1 -; unsigned char *pre - 2 -; unsigned char *dst - 3 -; int dst_stride - 4 -; short *dc - 5 -; ) -global sym(vp9_idct_dequant_dc_0_2x_sse2) -sym(vp9_idct_dequant_dc_0_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; special case when 2 blocks have 0 or 1 coeffs - ; dc is set as first coeff, so no need to load qcoeff - mov rax, arg(0) ; qcoeff - mov rsi, arg(2) ; pre - mov rdi, arg(3) ; dst - mov rdx, arg(5) ; dc - - ; Zero out xmm5, for use unpacking - pxor xmm5, xmm5 - - ; load up 2 dc words here == 2*16 = doubleword - movd xmm4, [rdx] - - ; Load up predict blocks - movq xmm0, [rsi] - movq xmm1, [rsi+16] - movq xmm2, [rsi+32] - movq xmm3, [rsi+48] - - ; Duplicate and expand dc across - punpcklwd xmm4, xmm4 - punpckldq xmm4, xmm4 - - ; Rounding to dequant and downshift - paddw xmm4, [GLOBAL(fours)] - psraw xmm4, 3 - - ; Predict buffer needs to be expanded from bytes to words - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpcklbw xmm2, xmm5 - punpcklbw xmm3, xmm5 - - ; Add to predict buffer - paddw xmm0, xmm4 - paddw xmm1, xmm4 - paddw xmm2, xmm4 - paddw xmm3, xmm4 - - ; pack up before storing - packuswb xmm0, xmm5 - packuswb xmm1, xmm5 - packuswb xmm2, xmm5 - packuswb xmm3, xmm5 - - ; Load destination stride before writing out, - ; doesn't need to persist - movsxd rdx, dword ptr arg(4) ; dst_stride - - ; store blocks back out - movq [rdi], xmm0 - movq [rdi + rdx], xmm1 - - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm2 - movq [rdi + rdx], xmm3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -global sym(vp9_idct_dequant_dc_full_2x_sse2) -sym(vp9_idct_dequant_dc_full_2x_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ; special case when 2 blocks have 0 or 1 coeffs - ; dc is set as first coeff, so no need to load qcoeff - mov rax, arg(0) ; qcoeff - mov rsi, arg(2) ; pre - mov rdi, arg(3) ; dst - - ; Zero out xmm7, for use unpacking - pxor xmm7, xmm7 - - mov rdx, arg(1) ; dequant - - ; note the transpose of xmm1 and xmm2, necessary for shuffle - ; to spit out sensicle data - movdqa xmm0, [rax] - movdqa xmm2, [rax+16] - movdqa xmm1, [rax+32] - movdqa xmm3, [rax+48] - - ; Clear out coeffs - 
movdqa [rax], xmm7 - movdqa [rax+16], xmm7 - movdqa [rax+32], xmm7 - movdqa [rax+48], xmm7 - - ; dequantize qcoeff buffer - pmullw xmm0, [rdx] - pmullw xmm2, [rdx+16] - pmullw xmm1, [rdx] - pmullw xmm3, [rdx+16] - - ; DC component - mov rdx, arg(5) - - ; repack so block 0 row x and block 1 row x are together - movdqa xmm4, xmm0 - punpckldq xmm0, xmm1 - punpckhdq xmm4, xmm1 - - pshufd xmm0, xmm0, 11011000b - pshufd xmm1, xmm4, 11011000b - - movdqa xmm4, xmm2 - punpckldq xmm2, xmm3 - punpckhdq xmm4, xmm3 - - pshufd xmm2, xmm2, 11011000b - pshufd xmm3, xmm4, 11011000b - - ; insert DC component - pinsrw xmm0, [rdx], 0 - pinsrw xmm0, [rdx+2], 4 - - ; first pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 ; - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - - ; transpose for the second pass - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - ; second pass - psubw xmm0, xmm2 ; b1 = 0-2 - paddw xmm2, xmm2 - - movdqa xmm5, xmm1 - paddw xmm2, xmm0 ; a1 = 0+2 - - pmulhw xmm5, [GLOBAL(x_s1sqr2)] - paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) - - movdqa xmm7, xmm3 - pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] - - paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) - psubw xmm7, xmm5 ; c1 - - movdqa xmm5, xmm1 - movdqa xmm4, xmm3 - - pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] - paddw xmm5, xmm1 - - pmulhw xmm3, [GLOBAL(x_s1sqr2)] - paddw xmm3, xmm4 - - paddw xmm3, xmm5 ; d1 - paddw xmm0, [GLOBAL(fours)] - - paddw xmm2, [GLOBAL(fours)] - movdqa xmm6, xmm2 ; a1 - - movdqa xmm4, xmm0 ; b1 - paddw xmm2, xmm3 ;0 - - paddw xmm4, xmm7 ;1 - psubw xmm0, xmm7 ;2 - - psubw xmm6, xmm3 ;3 - psraw xmm2, 3 - - psraw xmm0, 3 - psraw xmm4, 3 - - psraw xmm6, 3 - - ; transpose to save - movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 - punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 - punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 - - movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 - punpcklwd 
xmm4, xmm6 ; 015 011 014 010 013 009 012 008 - punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 - - - movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 - punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 - punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 - - movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 - punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 - punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 - - - movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 - punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 - punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 - - movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 - punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 - punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 - - pshufd xmm0, xmm2, 11011000b - pshufd xmm2, xmm1, 11011000b - - pshufd xmm1, xmm5, 11011000b - pshufd xmm3, xmm7, 11011000b - - pxor xmm7, xmm7 - - ; Load up predict blocks - movq xmm4, [rsi] - movq xmm5, [rsi+16] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm0, xmm4 - paddw xmm1, xmm5 - - movq xmm4, [rsi+32] - movq xmm5, [rsi+48] - - punpcklbw xmm4, xmm7 - punpcklbw xmm5, xmm7 - - paddw xmm2, xmm4 - paddw xmm3, xmm5 - -.finish: - - ; pack up before storing - packuswb xmm0, xmm7 - packuswb xmm1, xmm7 - packuswb xmm2, xmm7 - packuswb xmm3, xmm7 - - ; Load destination stride before writing out, - ; doesn't need to persist - movsxd rdx, dword ptr arg(4) ; dst_stride - - ; store blocks back out - movq [rdi], xmm0 - movq [rdi + rdx], xmm1 - - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm2 - movq [rdi + rdx], xmm3 - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -fours: - times 8 dw 0x0004 -align 16 -x_s1sqr2: - times 8 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 8 dw 0x4E7B diff --git a/vp9/common/x86/iwalsh_mmx.asm b/vp9/common/x86/iwalsh_mmx.asm deleted file mode 100644 index 6b276b95a..000000000 --- a/vp9/common/x86/iwalsh_mmx.asm +++ /dev/null @@ -1,173 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
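For reference, a scalar sketch of the 4x4 inverse Walsh-Hadamard transform that the two files below (iwalsh_mmx.asm and iwalsh_sse2.asm) vectorize, with the same butterfly ordering and (x + 3) >> 3 rounding their comments describe; names are illustrative, not project code:

    static void short_inv_walsh4x4_ref(const short *ip, short *op) {
      short tmp[16];
      int i;
      for (i = 0; i < 4; i++) {              /* vertical butterflies */
        short a = ip[i] + ip[12 + i];        /* ip[0] + ip[12], per column */
        short b = ip[4 + i] + ip[8 + i];     /* ip[4] + ip[8]             */
        short c = ip[4 + i] - ip[8 + i];
        short d = ip[i] - ip[12 + i];
        tmp[i]      = a + b;
        tmp[4 + i]  = d + c;
        tmp[8 + i]  = a - b;
        tmp[12 + i] = d - c;
      }
      for (i = 0; i < 4; i++) {              /* horizontal butterflies + round */
        const short *t = tmp + 4 * i;
        short a = t[0] + t[3], b = t[1] + t[2];
        short c = t[1] - t[2], d = t[0] - t[3];
        op[4 * i + 0] = (a + b + 3) >> 3;
        op[4 * i + 1] = (d + c + 3) >> 3;
        op[4 * i + 2] = (a - b + 3) >> 3;
        op[4 * i + 3] = (d - c + 3) >> 3;
      }
    }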
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output) -global sym(vp9_short_inv_walsh4x4_1_mmx) -sym(vp9_short_inv_walsh4x4_1_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rax, 3 - - mov rdi, arg(1) - add rax, [rsi] ;input[0] + 3 - - movd mm0, eax - - punpcklwd mm0, mm0 ;x x val val - - punpckldq mm0, mm0 ;val val val val - - psraw mm0, 3 ;(input[0] + 3) >> 3 - - movq [rdi + 0], mm0 - movq [rdi + 8], mm0 - movq [rdi + 16], mm0 - movq [rdi + 24], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_short_inv_walsh4x4_mmx(short *input, short *output) -global sym(vp9_short_inv_walsh4x4_mmx) -sym(vp9_short_inv_walsh4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rax, 3 - mov rsi, arg(0) - mov rdi, arg(1) - shl rax, 16 - - movq mm0, [rsi + 0] ;ip[0] - movq mm1, [rsi + 8] ;ip[4] - or rax, 3 ;00030003h - - movq mm2, [rsi + 16] ;ip[8] - movq mm3, [rsi + 24] ;ip[12] - - movq mm7, rax - movq mm4, mm0 - - punpcklwd mm7, mm7 ;0003000300030003h - movq mm5, mm1 - - paddw mm4, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm4 ;temp al - - paddw mm4, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm1, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm1 ;dl + cl - psubw mm5, mm1 ;dl - cl - - ; 03 02 01 00 - ; 13 12 11 10 - ; 23 22 21 20 - ; 33 32 31 30 - - movq mm3, mm4 ; 03 02 01 00 - punpcklwd mm4, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm1, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm1, mm5 ; 33 23 32 22 - - movq mm0, mm4 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] -;~~~~~~~~~~~~~~~~~~~~~ - movq mm1, mm0 - movq mm5, mm4 - - paddw mm1, mm3 ;ip[0] + ip[12] aka al - paddw mm5, mm2 ;ip[4] + ip[8] aka bl - - movq mm6, mm1 ;temp al - - paddw mm1, mm5 ;al + bl - psubw mm6, mm5 ;al - bl - - psubw mm0, mm3 ;ip[0] - ip[12] aka d1 - psubw mm4, mm2 ;ip[4] - ip[8] aka c1 - - movq mm5, mm0 ;temp dl - - paddw mm0, mm4 ;dl + cl - psubw mm5, mm4 ;dl - cl -;~~~~~~~~~~~~~~~~~~~~~ - movq mm3, mm1 ; 03 02 01 00 - punpcklwd mm1, mm0 ; 11 01 10 00 - punpckhwd mm3, mm0 ; 13 03 12 02 - - movq mm4, mm6 ; 23 22 21 20 - punpcklwd mm6, mm5 ; 31 21 30 20 - punpckhwd mm4, mm5 ; 33 23 32 22 - - movq mm0, mm1 ; 11 01 10 00 - movq mm2, mm3 ; 13 03 12 02 - - punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] - punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4] - - punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8] - punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12] - - paddw mm0, mm7 - paddw mm1, mm7 - paddw mm2, mm7 - paddw mm3, mm7 - - psraw mm0, 3 - psraw mm1, 3 - psraw mm2, 3 - psraw mm3, 3 - - movq [rdi + 0], mm0 - movq [rdi + 8], mm1 - movq [rdi + 16], mm2 - movq [rdi + 24], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - diff --git a/vp9/common/x86/iwalsh_sse2.asm b/vp9/common/x86/iwalsh_sse2.asm deleted file mode 100644 index 143cce87d..000000000 --- a/vp9/common/x86/iwalsh_sse2.asm +++ /dev/null @@ -1,119 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_short_inv_walsh4x4_sse2(short *input, short *output) -global sym(vp9_short_inv_walsh4x4_sse2) -sym(vp9_short_inv_walsh4x4_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - SAVE_XMM 6 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) - mov rdi, arg(1) - mov rax, 3 - - movdqa xmm0, [rsi + 0] ;ip[4] ip[0] - movdqa xmm1, [rsi + 16] ;ip[12] ip[8] - - shl rax, 16 - or rax, 3 ;00030003h - - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm0 ;ip[4] ip[0] - - paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - - movdqa xmm4, xmm0 - punpcklqdq xmm0, xmm3 ;d1 a1 - punpckhqdq xmm4, xmm3 ;c1 b1 - movd xmm6, eax - - movdqa xmm1, xmm4 ;c1 b1 - paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] - -;;;temp output -;; movdqu [rdi + 0], xmm4 -;; movdqu [rdi + 16], xmm3 - -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 - ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] - movdqa xmm3, xmm4 ;ip[4] ip[0] - - pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 - - paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 - psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 - - movdqa xmm5, xmm4 - punpcklqdq xmm4, xmm3 ;d1 a1 - punpckhqdq xmm5, xmm3 ;c1 b1 - - movdqa xmm1, xmm5 ;c1 b1 - paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] - psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - ; 13 12 11 10 03 02 01 00 - ; - ; 33 32 31 30 23 22 21 20 - ; - movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 - punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 - punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 - movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 - punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 -;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - paddw xmm5, xmm6 - paddw xmm1, xmm6 - - psraw xmm5, 3 - psraw xmm1, 3 - - movdqa [rdi + 0], xmm5 - movdqa [rdi + 16], xmm1 - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -x_s1sqr2: - times 4 dw 0x8A8C -align 16 -x_c1sqr2less1: - times 4 dw 0x4E7B -align 16 -fours: - times 4 dw 0x0004 diff --git a/vp9/common/x86/loopfilter_mmx.asm b/vp9/common/x86/loopfilter_mmx.asm deleted file mode 100644 index ac3f74eda..000000000 --- a/vp9/common/x86/loopfilter_mmx.asm +++ /dev/null @@ -1,969 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
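The loop-filter routines that follow all build on the same scalar arithmetic, sketched here under the usual assumptions (pixels re-biased to signed by the t80 xors, every add/subtract saturating like the byte ops, mask and hev resolved per pixel). Illustrative only:

    static signed char sclamp(int t) {
      return (signed char)(t < -128 ? -128 : t > 127 ? 127 : t);
    }

    static void filter4(int mask, int hev, signed char *p1, signed char *p0,
                        signed char *q0, signed char *q1) {
      int f = hev ? sclamp(*p1 - *q1) : 0;       /* hvm(p1 - q1) */
      f = sclamp(f + (*q0 - *p0));
      f = sclamp(f + (*q0 - *p0));
      f = sclamp(f + (*q0 - *p0));               /* 3 * (q0 - p0) + hvm(p1 - q1) */
      if (!mask) f = 0;
      {
        int f1 = sclamp(f + 4) >> 3;             /* applied to q0 */
        int f2 = sclamp(f + 3) >> 3;             /* applied to p0 */
        *q0 = sclamp(*q0 - f1);
        *p0 = sclamp(*p0 + f2);
        if (!hev) {                              /* outer taps, the pandn path */
          int u = (f1 + 1) >> 1;
          *q1 = sclamp(*q1 - u);
          *p1 = sclamp(*p1 + u);
        }
      }
    }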
-; - - -%include "vpx_ports/x86_abi_support.asm" - - -;void vp9_loop_filter_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_horizontal_edge_mmx) -sym(vp9_loop_filter_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - movsxd rcx, dword ptr arg(5) ;count -.next8_h: - mov rdx, arg(3) ;limit - movq mm7, [rdx] - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - ; calculate breakout conditions - movq mm2, [rdi+2*rax] ; q3 - movq mm1, [rsi+2*rax] ; q2 - movq mm6, mm1 ; q2 - psubusb mm1, mm2 ; q2-=q3 - psubusb mm2, mm6 ; q3-=q2 - por mm1, mm2 ; abs(q3-q2) - psubusb mm1, mm7 ; - - - movq mm4, [rsi+rax] ; q1 - movq mm3, mm4 ; q1 - psubusb mm4, mm6 ; q1-=q2 - psubusb mm6, mm3 ; q2-=q1 - por mm4, mm6 ; abs(q2-q1) - - psubusb mm4, mm7 - por mm1, mm4 - - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - psubusb mm4, mm3 ; q0-=q1 - psubusb mm3, mm0 ; q1-=q0 - por mm4, mm3 ; abs(q0-q1) - movq t0, mm4 ; save to t0 - psubusb mm4, mm7 - por mm1, mm4 - - - neg rax ; negate pitch to deal with above border - - movq mm2, [rsi+4*rax] ; p3 - movq mm4, [rdi+4*rax] ; p2 - movq mm5, mm4 ; p2 - psubusb mm4, mm2 ; p2-=p3 - psubusb mm2, mm5 ; p3-=p2 - por mm4, mm2 ; abs(p3 - p2) - psubusb mm4, mm7 - por mm1, mm4 - - - movq mm4, [rsi+2*rax] ; p1 - movq mm3, mm4 ; p1 - psubusb mm4, mm5 ; p1-=p2 - psubusb mm5, mm3 ; p2-=p1 - por mm4, mm5 ; abs(p2 - p1) - psubusb mm4, mm7 - por mm1, mm4 - - movq mm2, mm3 ; p1 - - movq mm4, [rsi+rax] ; p0 - movq mm5, mm4 ; p0 - psubusb mm4, mm3 ; p0-=p1 - psubusb mm3, mm5 ; p1-=p0 - por mm4, mm3 ; abs(p1 - p0) - movq t1, mm4 ; save to t1 - psubusb mm4, mm7 - por mm1, mm4 - - movq mm3, [rdi] ; q1 - movq mm4, mm3 ; q1 - psubusb mm3, mm2 ; q1-=p1 - psubusb mm2, mm4 ; p1-=q1 - por mm2, mm3 ; abs(p1-q1) - pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm2, 1 ; abs(p1-q1)/2 - - movq mm6, mm5 ; p0 - movq mm3, [rsi] ; q0 - psubusb mm5, mm3 ; p0-=q0 - psubusb mm3, mm6 ; q0-=p0 - por mm5, mm3 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] ; blimit - - psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm5 - pxor mm5, mm5 - pcmpeqb mm1, mm5 ; mask mm1 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb mm4, mm5 - - pcmpeqb mm5, mm5 - pxor mm4, mm5 - - - ; start work on filters - movq mm2, [rsi+2*rax] ; p1 - movq mm7, [rdi] ; q1 - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - pand mm2, mm4 ; high var mask (hvm)(p1 - q1) - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; 1 * (q0 - 
p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - movq mm2, mm1 - paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - pxor mm0, mm0 ; - pxor mm5, mm5 - punpcklbw mm0, mm2 ; - punpckhbw mm5, mm2 ; - psraw mm0, 11 ; - psraw mm5, 11 - packsswb mm0, mm5 - movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - - pxor mm0, mm0 ; 0 - movq mm5, mm1 ; abcdefgh - punpcklbw mm0, mm1 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - pxor mm1, mm1 ; 0 - punpckhbw mm1, mm5 ; a0b0c0d0 - psraw mm1, 11 ; sign extended shift right by 3 - movq mm5, mm0 ; save results - - packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw mm5, [GLOBAL(ones)] - paddsw mm1, [GLOBAL(ones)] - psraw mm5, 1 ; partial shifted one more time for 2nd tap - psraw mm1, 1 ; partial shifted one more time for 2nd tap - packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - pandn mm4, mm5 ; high edge variance additive - - paddsb mm6, mm2 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - movq mm6, [rsi+2*rax] ; p1 - pxor mm6, [GLOBAL(t80)] ; reoffset - paddsb mm6, mm4 ; p1+= p1 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+2*rax], mm6 ; write back - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - psubsb mm7, mm4 ; q1-= q1 add - pxor mm7, [GLOBAL(t80)] ; unoffset - movq [rdi], mm7 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .next8_h - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_vertical_edge_mmx) -sym(vp9_loop_filter_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 64 ; reserve 64 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - lea rsi, [rsi + rax*4 - 4] - - movsxd rcx, dword ptr arg(5) ;count -.next8_v: - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - - - ;transpose - movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 - movq mm7, mm6 ; 77 76 75 74 73 72 71 70 - - punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 - punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 - - movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 - movq mm5, mm4 ; 47 46 45 44 43 42 41 40 - - punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 - punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 - - movq mm3, mm5 ; 57 47 56 46 55 45 54 44 - punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 - - punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 - movq mm2, mm4 ; 53 43 52 42 51 41 50 40 - - punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 - punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 - - neg rax - movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 - - movq mm1, mm6 ; 27 26 25 24 23 22 21 20 - punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 - - punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 - movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 - - punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 - movq mm0, mm7 ; 17 07 16 06 15 05 14 04 - - punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 - punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 - - movq mm6, mm7 ; 37 27 17 07 36 26 16 06 - punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 - - punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 - - movq mm5, mm6 ; 76 66 56 46 36 26 16 06 - psubusb mm5, mm7 ; q2-q3 - - psubusb mm7, mm6 ; q3-q2 - por mm7, mm5; ; mm7=abs (q3-q2) - - movq mm5, mm0 ; 35 25 15 05 34 24 14 04 - punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 - - punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 - movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 - - psubusb mm3, mm6 ; q1-q2 - psubusb mm6, mm5 ; q2-q1 - - por mm6, mm3 ; mm6=abs(q2-q1) - lea rdx, srct - - movq [rdx+24], mm5 ; save q1 - movq [rdx+16], mm0 ; save q0 - - movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 - punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 - - movq mm0, mm3 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 31 21 11 01 30 20 10 00 - - punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 - punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 - - movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 - psubusb mm2, mm0 ; p2-p3 - - psubusb mm0, mm1 ; p3-p2 - por mm0, mm2 ; mm0=abs(p3-p2) - - movq mm2, mm3 ; 33 23 13 03 32 22 12 02 - punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 - - punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 - movq [rdx+8], mm3 ; save p0 - - movq [rdx], mm2 ; save p1 - movq mm5, mm2 ; mm5 = p1 - - psubusb mm2, mm1 ; p1-p2 - psubusb mm1, mm5 ; p2-p1 - - por mm1, mm2 ; mm1=abs(p2-p1) - mov rdx, arg(3) ;limit - - movq mm4, [rdx] ; mm4 = limit - psubusb mm7, mm4 - - psubusb mm0, mm4 - psubusb mm1, mm4 - - psubusb mm6, mm4 - por mm7, mm6 - - por mm0, mm1 - por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit - - movq mm1, mm5 ; p1 - - movq mm7, mm3 ; mm3=mm7=p0 - psubusb mm7, mm5 ; p0 - p1 - - psubusb mm5, mm3 ; p1 - p0 - por mm5, mm7 ; abs(p1-p0) - - movq t0, mm5 ; save abs(p1-p0) - lea rdx, srct - - psubusb mm5, mm4 - por mm0, mm5 ; mm0=mask - - movq mm5, [rdx+16] ; mm5=q0 - movq mm7, [rdx+24] ; mm7=q1 - - movq mm6, mm5 ; mm6=q0 - movq mm2, mm7 ; q1 - psubusb mm5, mm7 ; q0-q1 - - psubusb mm7, mm6 ; q1-q0 - por mm7, mm5 ; abs(q1-q0) - - movq t1, mm7 ; save 
abs(q1-q0) - psubusb mm7, mm4 - - por mm0, mm7 ; mask - - movq mm5, mm2 ; q1 - psubusb mm5, mm1 ; q1-=p1 - psubusb mm1, mm2 ; p1-=q1 - por mm5, mm1 ; abs(p1-q1) - pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm5, 1 ; abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; - - movq mm4, [rdx] ;blimit - movq mm1, mm3 ; mm1=mm3=p0 - - movq mm7, mm6 ; mm7=mm6=q0 - psubusb mm1, mm7 ; p0-q0 - - psubusb mm7, mm3 ; q0-p0 - por mm1, mm7 ; abs(q0-p0) - paddusb mm1, mm1 ; abs(q0-p0)*2 - paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por mm1, mm0; ; mask - - pxor mm0, mm0 - pcmpeqb mm1, mm0 - - ; calculate high edge variance - mov rdx, arg(4) ;thresh ; get thresh - movq mm7, [rdx] - ; - movq mm4, t0 ; get abs (q1 - q0) - psubusb mm4, mm7 - - movq mm3, t1 ; get abs (p1 - p0) - psubusb mm3, mm7 - - por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - pcmpeqb mm4, mm0 - - pcmpeqb mm0, mm0 - pxor mm4, mm0 - - - - ; start work on filters - lea rdx, srct - - movq mm2, [rdx] ; p1 - movq mm7, [rdx+24] ; q1 - - movq mm6, [rdx+8] ; p0 - movq mm0, [rdx+16] ; q0 - - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm2, mm7 ; p1 - q1 - pand mm2, mm4 ; high var mask (hvm)(p1 - q1) - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - - paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - - paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - pand mm1, mm2 ; mask filter values we don't care about - - movq mm2, mm1 - paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - - paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - pxor mm0, mm0 ; - - pxor mm5, mm5 - punpcklbw mm0, mm2 ; - - punpckhbw mm5, mm2 ; - psraw mm0, 11 ; - - psraw mm5, 11 - packsswb mm0, mm5 - - movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - - pxor mm0, mm0 ; 0 - movq mm5, mm1 ; abcdefgh - - punpcklbw mm0, mm1 ; e0f0g0h0 - psraw mm0, 11 ; sign extended shift right by 3 - - pxor mm1, mm1 ; 0 - punpckhbw mm1, mm5 ; a0b0c0d0 - - psraw mm1, 11 ; sign extended shift right by 3 - movq mm5, mm0 ; save results - - packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw mm5, [GLOBAL(ones)] - - paddsw mm1, [GLOBAL(ones)] - psraw mm5, 1 ; partial shifted one more time for 2nd tap - - psraw mm1, 1 ; partial shifted one more time for 2nd tap - packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - - pandn mm4, mm5 ; high edge variance additive - - paddsb mm6, mm2 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - ; mm6=p0 ; - movq mm1, [rdx] ; p1 - pxor mm1, [GLOBAL(t80)] ; reoffset - - paddsb mm1, mm4 ; p1+= p1 add - pxor mm1, [GLOBAL(t80)] ; unoffset - ; mm6 = p0 mm1 = p1 - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - - ; mm3 = q0 - psubsb mm7, mm4 ; q1-= q1 add - pxor mm7, [GLOBAL(t80)] ; unoffset - ; mm7 = q1 - - ; tranpose and write back - ; mm1 = 72 62 52 42 32 22 12 02 - ; mm6 = 73 63 53 43 33 23 13 03 - ; mm3 = 74 64 54 44 34 24 14 04 - ; mm7 = 75 65 55 45 35 25 15 05 - - movq mm2, mm1 ; 72 62 52 42 32 22 12 02 - punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 - - movq mm4, mm3 ; 74 64 54 44 34 24 14 04 - punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 - - punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 - punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 - - 
movq mm6, mm2 ; 33 32 23 22 13 12 03 02 - punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 - - punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 - movq mm5, mm1 ; 73 72 63 62 53 52 43 42 - - punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 - punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 - - - ; mm2 = 15 14 13 12 05 04 03 02 - ; mm6 = 35 34 33 32 25 24 23 22 - ; mm5 = 55 54 53 52 45 44 43 42 - ; mm1 = 75 74 73 72 65 64 63 62 - - - - movd [rsi+rax*4+2], mm2 - psrlq mm2, 32 - - movd [rdi+rax*4+2], mm2 - movd [rsi+rax*2+2], mm6 - - psrlq mm6, 32 - movd [rsi+rax+2],mm6 - - movd [rsi+2], mm1 - psrlq mm1, 32 - - movd [rdi+2], mm1 - neg rax - - movd [rdi+rax+2],mm5 - psrlq mm5, 32 - - movd [rdi+rax*2+2], mm5 - - lea rsi, [rsi+rax*8] - dec rcx - jnz .next8_v - - add rsp, 64 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_simple_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp9_loop_filter_simple_horizontal_edge_mmx) -sym(vp9_loop_filter_simple_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - mov rcx, 2 ; count -.nexts8_h: - mov rdx, arg(2) ;blimit ; get blimit - movq mm3, [rdx] ; - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movq mm1, [rsi+2*rax] ; p1 - movq mm0, [rdi] ; q1 - movq mm2, mm1 - movq mm7, mm0 - movq mm4, mm0 - psubusb mm0, mm1 ; q1-=p1 - psubusb mm1, mm4 ; p1-=q1 - por mm1, mm0 ; abs(p1-q1) - pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm1, 1 ; abs(p1-q1)/2 - - movq mm5, [rsi+rax] ; p0 - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - movq mm6, mm5 ; p0 - psubusb mm5, mm4 ; p0-=q0 - psubusb mm4, mm6 ; q0-=p0 - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm3, mm3 - pcmpeqb mm5, mm3 - - ; start work on filters - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) - pand mm5, mm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - movq mm1, mm5 ; get a copy of filters - psraw mm1, 11 ; arithmetic shift right 11 - psllw mm1, 8 ; shift left 8 to put it back - - por mm0, mm1 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - - paddsb mm6, mm0 ; p0+= p0 
add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .nexts8_h - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_simple_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp9_loop_filter_simple_vertical_edge_mmx) -sym(vp9_loop_filter_simple_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi + rax*4- 2]; ; - mov rcx, 2 ; count -.nexts8_v: - - lea rdi, [rsi + rax]; - movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 - - movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - - movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 - movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 - - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - movq mm5, mm4 ; 53 43 52 42 51 41 50 40 - - punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 - punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 - - neg rax - - movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 - movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 - - punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 - movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 - - movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 - punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 - - movq mm2, mm0 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 13 03 12 02 11 01 10 00 - - punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 - movq mm3, mm2 ; 33 23 13 03 32 22 12 02 - - punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 - punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 - - punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 - - - ; calculate mask - movq mm6, mm0 ; p1 - movq mm7, mm3 ; q1 - psubusb mm7, mm6 ; q1-=p1 - psubusb mm6, mm3 ; p1-=q1 - por mm6, mm7 ; abs(p1-q1) - pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm6, 1 ; abs(p1-q1)/2 - - movq mm5, mm1 ; p0 - movq mm4, mm2 ; q0 - - psubusb mm5, mm2 ; p0-=q0 - psubusb mm4, mm1 ; q0-=p0 - - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] - - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm7, mm7 - pcmpeqb mm5, mm7 ; mm5 = mask - - ; start work on filters - movq t0, mm0 - movq t1, mm3 - - pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm0, mm3 ; p1 - q1 - movq mm6, mm1 ; p0 - - movq mm7, mm2 ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm7 ; offseted ; q0 - - psubsb mm7, mm6 ; q0 - p0 - paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) - - pand mm5, mm0 ; mask filter values we don't care about - - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift 
right 11 - psrlw mm0, 8 - - movq mm7, mm5 ; get a copy of filters - psraw mm7, 11 ; arithmetic shift right 11 - psllw mm7, 8 ; shift left 8 to put it back - - por mm0, mm7 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0sz add - pxor mm3, [GLOBAL(t80)] ; unoffset - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - - movq mm0, t0 - movq mm4, t1 - - ; mm0 = 70 60 50 40 30 20 10 00 - ; mm6 = 71 61 51 41 31 21 11 01 - ; mm3 = 72 62 52 42 32 22 12 02 - ; mm4 = 73 63 53 43 33 23 13 03 - ; transpose back to write out - - movq mm1, mm0 ; - punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 - - punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 - movq mm2, mm3 ; - - punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 - movq mm5, mm1 ; 71 70 61 60 51 50 41 40 - - punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 - movq mm6, mm0 ; 31 30 21 20 11 10 01 00 - - punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 - punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 - - movd [rsi+rax*4], mm0 ; write 03 02 01 00 - punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 - - psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 - punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 - - movd [rdi+rax*4], mm0 ; write 13 12 11 10 - movd [rsi+rax*2], mm6 ; write 23 22 21 20 - - psrlq mm6, 32 ; 33 32 31 30 - movd [rsi], mm1 ; write 43 42 41 40 - - movd [rsi + rax], mm6 ; write 33 32 31 30 - neg rax - - movd [rsi + rax*2], mm5 ; write 63 62 61 60 - psrlq mm1, 32 ; 53 52 51 50 - - movd [rdi], mm1 ; write out 53 52 51 50 - psrlq mm5, 32 ; 73 72 71 70 - - movd [rdi + rax*2], mm5 ; write 73 72 71 70 - - lea rsi, [rsi+rax*8] ; next 8 - - dec rcx - jnz .nexts8_v - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - -;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, -; int y_stride, -; loop_filter_info *lfi) -;{ -; -; -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -;} - -SECTION_RODATA -align 16 -tfe: - times 8 db 0xfe -align 16 -t80: - times 8 db 0x80 -align 16 -t1s: - times 8 db 0x01 -align 16 -t3: - times 8 db 0x03 -align 16 -t4: - times 8 db 0x04 -align 16 -ones: - times 4 dw 0x0001 -align 16 -s27: - times 4 dw 0x1b00 -align 16 -s18: - times 4 dw 0x1200 -align 16 -s9: - times 4 dw 0x0900 -align 16 -s63: - times 4 dw 0x003f diff --git a/vp9/common/x86/loopfilter_sse2.asm b/vp9/common/x86/loopfilter_sse2.asm deleted file mode 100644 index 9c0c4b000..000000000 --- a/vp9/common/x86/loopfilter_sse2.asm +++ /dev/null @@ -1,1238 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
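One idiom worth calling out before the SSE2 version: the psllw 8 / psraw 3 / psrlw 8 and psraw 11 / psllw 8 pairs in the simple filters above exist because MMX and SSE2 have no per-byte arithmetic shift. A scalar sketch for one 16-bit lane (illustrative helper, not project code):

    #include <stdint.h>

    static uint16_t sra3_per_byte(uint16_t w) {
      /* low byte:  psllw 8 ; psraw 3 ; psrlw 8 */
      uint16_t lo = (uint16_t)((int16_t)(uint16_t)(w << 8) >> 3) >> 8;
      /* high byte: psraw 11 ; psllw 8 */
      uint16_t hi = (uint16_t)((((int16_t)w >> 11) & 0xff) << 8);
      return (uint16_t)(hi | lo);  /* por */
    }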
-; - - -%include "vpx_ports/x86_abi_support.asm" - -; Use of pmaxub instead of psubusb to compute filter mask was seen -; in ffvp8 - -%macro LFH_FILTER_AND_HEV_MASK 1 -%if %1 - movdqa xmm2, [rdi+2*rax] ; q3 - movdqa xmm1, [rsi+2*rax] ; q2 - movdqa xmm4, [rsi+rax] ; q1 - movdqa xmm5, [rsi] ; q0 - neg rax ; negate pitch to deal with above border -%else - movlps xmm2, [rsi + rcx*2] ; q3 - movlps xmm1, [rsi + rcx] ; q2 - movlps xmm4, [rsi] ; q1 - movlps xmm5, [rsi + rax] ; q0 - - movhps xmm2, [rdi + rcx*2] - movhps xmm1, [rdi + rcx] - movhps xmm4, [rdi] - movhps xmm5, [rdi + rax] - - lea rsi, [rsi + rax*4] - lea rdi, [rdi + rax*4] - - movdqa XMMWORD PTR [rsp], xmm1 ; store q2 - movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1 -%endif - - movdqa xmm6, xmm1 ; q2 - movdqa xmm3, xmm4 ; q1 - - psubusb xmm1, xmm2 ; q2-=q3 - psubusb xmm2, xmm6 ; q3-=q2 - - psubusb xmm4, xmm6 ; q1-=q2 - psubusb xmm6, xmm3 ; q2-=q1 - - por xmm4, xmm6 ; abs(q2-q1) - por xmm1, xmm2 ; abs(q3-q2) - - movdqa xmm0, xmm5 ; q0 - pmaxub xmm1, xmm4 - - psubusb xmm5, xmm3 ; q0-=q1 - psubusb xmm3, xmm0 ; q1-=q0 - - por xmm5, xmm3 ; abs(q0-q1) - movdqa t0, xmm5 ; save to t0 - - pmaxub xmm1, xmm5 - -%if %1 - movdqa xmm2, [rsi+4*rax] ; p3 - movdqa xmm4, [rdi+4*rax] ; p2 - movdqa xmm6, [rsi+2*rax] ; p1 -%else - movlps xmm2, [rsi + rax] ; p3 - movlps xmm4, [rsi] ; p2 - movlps xmm6, [rsi + rcx] ; p1 - - movhps xmm2, [rdi + rax] - movhps xmm4, [rdi] - movhps xmm6, [rdi + rcx] - - movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2 - movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1 -%endif - - movdqa xmm5, xmm4 ; p2 - movdqa xmm3, xmm6 ; p1 - - psubusb xmm4, xmm2 ; p2-=p3 - psubusb xmm2, xmm5 ; p3-=p2 - - psubusb xmm3, xmm5 ; p1-=p2 - pmaxub xmm1, xmm4 ; abs(p3 - p2) - - psubusb xmm5, xmm6 ; p2-=p1 - pmaxub xmm1, xmm2 ; abs(p3 - p2) - - pmaxub xmm1, xmm5 ; abs(p2 - p1) - movdqa xmm2, xmm6 ; p1 - - pmaxub xmm1, xmm3 ; abs(p2 - p1) -%if %1 - movdqa xmm4, [rsi+rax] ; p0 - movdqa xmm3, [rdi] ; q1 -%else - movlps xmm4, [rsi + rcx*2] ; p0 - movhps xmm4, [rdi + rcx*2] - movdqa xmm3, q1 ; q1 -%endif - - movdqa xmm5, xmm4 ; p0 - psubusb xmm4, xmm6 ; p0-=p1 - - psubusb xmm6, xmm5 ; p1-=p0 - - por xmm6, xmm4 ; abs(p1 - p0) - mov rdx, arg(2) ; get blimit - - movdqa t1, xmm6 ; save to t1 - - movdqa xmm4, xmm3 ; q1 - pmaxub xmm1, xmm6 - - psubusb xmm3, xmm2 ; q1-=p1 - psubusb xmm2, xmm4 ; p1-=q1 - - psubusb xmm1, xmm7 - por xmm2, xmm3 ; abs(p1-q1) - - movdqa xmm7, XMMWORD PTR [rdx] ; blimit - - movdqa xmm3, xmm0 ; q0 - pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero - - mov rdx, arg(4) ; hev get thresh - - movdqa xmm6, xmm5 ; p0 - psrlw xmm2, 1 ; abs(p1-q1)/2 - - psubusb xmm5, xmm3 ; p0-=q0 - - psubusb xmm3, xmm6 ; q0-=p0 - por xmm5, xmm3 ; abs(p0 - q0) - - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - - movdqa xmm4, t0 ; hev get abs (q1 - q0) - - movdqa xmm3, t1 ; get abs (p1 - p0) - - paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - movdqa xmm2, XMMWORD PTR [rdx] ; hev - - psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - psubusb xmm4, xmm2 ; hev - - psubusb xmm3, xmm2 ; hev - por xmm1, xmm5 - - pxor xmm7, xmm7 - paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - pcmpeqb xmm4, xmm5 ; hev - pcmpeqb xmm3, xmm3 ; hev - - pcmpeqb xmm1, xmm7 ; mask xmm1 - pxor xmm4, xmm3 ; hev -%endmacro - -%macro B_FILTER 1 -%if %1 == 0 - movdqa xmm2, p1 ; p1 - movdqa xmm7, q1 ; q1 -%elif %1 == 1 - movdqa xmm2, [rsi+2*rax] ; p1 - movdqa xmm7, [rdi] ; q1 -%elif %1 == 2 - lea rdx, srct - - movdqa xmm2, [rdx] ; p1 - movdqa xmm7, [rdx+48] ; 
q1 - movdqa xmm6, [rdx+16] ; p0 - movdqa xmm0, [rdx+32] ; q0 -%endif - - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb xmm2, xmm7 ; p1 - q1 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - - pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values - - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - - paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) - - paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) - - paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) - - pand xmm1, xmm2 ; mask filter values we don't care about - - movdqa xmm2, xmm1 - - paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 - paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 - - punpckhbw xmm5, xmm2 ; axbxcxdx - punpcklbw xmm2, xmm2 ; exfxgxhx - - punpcklbw xmm0, xmm1 ; exfxgxhx - psraw xmm5, 11 ; sign extended shift right by 3 - - punpckhbw xmm1, xmm1 ; axbxcxdx - psraw xmm2, 11 ; sign extended shift right by 3 - - packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; - psraw xmm0, 11 ; sign extended shift right by 3 - - psraw xmm1, 11 ; sign extended shift right by 3 - movdqa xmm5, xmm0 ; save results - - packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 - paddsw xmm5, [GLOBAL(ones)] - - paddsw xmm1, [GLOBAL(ones)] - psraw xmm5, 1 ; partial shifted one more time for 2nd tap - - psraw xmm1, 1 ; partial shifted one more time for 2nd tap - - paddsb xmm6, xmm2 ; p0+= p0 add - packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 - -%if %1 == 0 - movdqa xmm1, p1 ; p1 -%elif %1 == 1 - movdqa xmm1, [rsi+2*rax] ; p1 -%elif %1 == 2 - movdqa xmm1, [rdx] ; p1 -%endif - pandn xmm4, xmm5 ; high edge variance additive - pxor xmm6, [GLOBAL(t80)] ; unoffset - - pxor xmm1, [GLOBAL(t80)] ; reoffset - psubsb xmm3, xmm0 ; q0-= q0 add - - paddsb xmm1, xmm4 ; p1+= p1 add - pxor xmm3, [GLOBAL(t80)] ; unoffset - - pxor xmm1, [GLOBAL(t80)] ; unoffset - psubsb xmm7, xmm4 ; q1-= q1 add - - pxor xmm7, [GLOBAL(t80)] ; unoffset -%if %1 == 0 - lea rsi, [rsi + rcx*2] - lea rdi, [rdi + rcx*2] - movq MMWORD PTR [rsi], xmm6 ; p0 - movhps MMWORD PTR [rdi], xmm6 - movq MMWORD PTR [rsi + rax], xmm1 ; p1 - movhps MMWORD PTR [rdi + rax], xmm1 - movq MMWORD PTR [rsi + rcx], xmm3 ; q0 - movhps MMWORD PTR [rdi + rcx], xmm3 - movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1 - movhps MMWORD PTR [rdi + rcx*2],xmm7 -%elif %1 == 1 - movdqa [rsi+rax], xmm6 ; write back - movdqa [rsi+2*rax], xmm1 ; write back - movdqa [rsi], xmm3 ; write back - movdqa [rdi], xmm7 ; write back -%endif - -%endmacro - - -;void vp9_loop_filter_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_horizontal_edge_sse2) -sym(vp9_loop_filter_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step - - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 1 - ; filter and write 
back the result - B_FILTER 1 - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_horizontal_edge_uv_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_horizontal_edge_uv_sse2) -sym(vp9_loop_filter_horizontal_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; - %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; - %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; - %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; - %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ; u - mov rdi, arg(5) ; v - movsxd rax, dword ptr arg(1) ; src_pixel_step - mov rcx, rax - neg rax ; negate pitch to deal with above border - - mov rdx, arg(3) ;limit - movdqa xmm7, XMMWORD PTR [rdx] - - lea rsi, [rsi + rcx] - lea rdi, [rdi + rcx] - - ; calculate breakout conditions and high edge variance - LFH_FILTER_AND_HEV_MASK 0 - ; filter and write back the result - B_FILTER 0 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -%macro TRANSPOSE_16X8 2 - movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 - movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 - movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 - movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 - movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 - movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 - - punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - - movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 - - movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 - punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 - - movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 - - punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 -%if %1 - lea rsi, [rsi+rax*8] -%else - mov rsi, arg(5) ; v_ptr -%endif - - movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 - punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 - - punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - - punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 -%if %1 - lea rdi, [rdi+rax*8] -%else - lea rsi, [rsi - 4] -%endif - - punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 -%if %1 - lea rdx, srct -%else - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing -%endif - - movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 - - movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 - punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - - punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 - - punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 
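; The interleave ladder above is the whole transpose trick: each
; punpckl/punpckh pass doubles the element width (bytes from two rows
; pair into words, words into dwords, dwords into qwords), so four
; passes turn the sixteen 8-pixel rows into eight 16-pixel registers.
; As a scalar sketch, TRANSPOSE_16X8 computes
;     for (r = 0; r < 16; r++)
;       for (c = 0; c < 8; c++)
;         dst[c][r] = src[r][c];
; which lets the row-oriented filter code run on what were columns.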
- - punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - - movdqa t0, xmm2 ; save to free XMM2 - movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 - movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 - movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 - movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 - movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 - - punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - - movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 - - punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 - - movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 - - punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 - - movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 - - punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 - - movdqa xmm6, xmm1 ; - punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 - - punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 - movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 - - punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - - punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 - - movdqa xmm0, xmm5 - punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - - punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 - - punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 - - punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 - movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 - - punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 - - punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 -%if %2 - movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - movdqa [rdx], xmm2 ; save 2 - - movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - - movdqa [rdx+16], xmm3 ; save 3 - - punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - - movdqa [rdx+32], xmm4 ; save 4 - movdqa [rdx+48], xmm5 ; save 5 - movdqa xmm1, t0 ; get - - movdqa xmm2, xmm1 ; - punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - - punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 -%else - movdqa [rdx+112], xmm7 ; save 7 - - movdqa [rdx+96], xmm6 ; save 6 - - movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - - movdqa [rdx+32], xmm2 ; save 2 - - movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 - punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - - movdqa [rdx+48], xmm3 ; save 3 - - punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - - movdqa [rdx+64], xmm4 ; save 4 - movdqa [rdx+80], xmm5 ; save 5 - movdqa xmm1, t0 ; 
get - - movdqa xmm2, xmm1 - punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - - punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - - movdqa [rdx+16], xmm1 - - movdqa [rdx], xmm2 -%endif -%endmacro - -%macro LFV_FILTER_MASK_HEV_MASK 1 - movdqa xmm0, xmm6 ; q2 - psubusb xmm0, xmm7 ; q2-q3 - - psubusb xmm7, xmm6 ; q3-q2 - movdqa xmm4, xmm5 ; q1 - - por xmm7, xmm0 ; abs (q3-q2) - psubusb xmm4, xmm6 ; q1-q2 - - movdqa xmm0, xmm1 - psubusb xmm6, xmm5 ; q2-q1 - - por xmm6, xmm4 ; abs (q2-q1) - psubusb xmm0, xmm2 ; p2 - p3; - - psubusb xmm2, xmm1 ; p3 - p2; - por xmm0, xmm2 ; abs(p2-p3) -%if %1 - movdqa xmm2, [rdx] ; p1 -%else - movdqa xmm2, [rdx+32] ; p1 -%endif - movdqa xmm5, xmm2 ; p1 - pmaxub xmm0, xmm7 - - psubusb xmm5, xmm1 ; p1-p2 - psubusb xmm1, xmm2 ; p2-p1 - - movdqa xmm7, xmm3 ; p0 - psubusb xmm7, xmm2 ; p0-p1 - - por xmm1, xmm5 ; abs(p2-p1) - pmaxub xmm0, xmm6 - - pmaxub xmm0, xmm1 - movdqa xmm1, xmm2 ; p1 - - psubusb xmm2, xmm3 ; p1-p0 - lea rdx, srct - - por xmm2, xmm7 ; abs(p1-p0) - - movdqa t0, xmm2 ; save abs(p1-p0) - - pmaxub xmm0, xmm2 - -%if %1 - movdqa xmm5, [rdx+32] ; q0 - movdqa xmm7, [rdx+48] ; q1 -%else - movdqa xmm5, [rdx+64] ; q0 - movdqa xmm7, [rdx+80] ; q1 -%endif - mov rdx, arg(3) ; limit - - movdqa xmm6, xmm5 ; q0 - movdqa xmm2, xmm7 ; q1 - - psubusb xmm5, xmm7 ; q0-q1 - psubusb xmm7, xmm6 ; q1-q0 - - por xmm7, xmm5 ; abs(q1-q0) - - movdqa t1, xmm7 ; save abs(q1-q0) - - movdqa xmm4, XMMWORD PTR [rdx]; limit - - pmaxub xmm0, xmm7 - mov rdx, arg(2) ; blimit - - psubusb xmm0, xmm4 - movdqa xmm5, xmm2 ; q1 - - psubusb xmm5, xmm1 ; q1-=p1 - psubusb xmm1, xmm2 ; p1-=q1 - - por xmm5, xmm1 ; abs(p1-q1) - movdqa xmm1, xmm3 ; p0 - - pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero - psubusb xmm1, xmm6 ; p0-q0 - - psrlw xmm5, 1 ; abs(p1-q1)/2 - psubusb xmm6, xmm3 ; q0-p0 - - movdqa xmm4, XMMWORD PTR [rdx]; blimit - - mov rdx, arg(4) ; get thresh - - por xmm1, xmm6 ; abs(q0-p0) - - movdqa xmm6, t0 ; get abs (q1 - q0) - - paddusb xmm1, xmm1 ; abs(q0-p0)*2 - - movdqa xmm3, t1 ; get abs (p1 - p0) - - movdqa xmm7, XMMWORD PTR [rdx] - - paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh - - psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh - - psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit - por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh - - por xmm1, xmm0 ; mask - pcmpeqb xmm6, xmm0 - - pxor xmm0, xmm0 - pcmpeqb xmm4, xmm4 - - pcmpeqb xmm1, xmm0 - pxor xmm4, xmm6 -%endmacro - -%macro BV_TRANSPOSE 0 - ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 - movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - - movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 - punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 - - punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 - - movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 - - punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 - movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - punpcklwd 
xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 - - punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 - ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 - ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 - ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 - ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 -%endmacro - -%macro BV_WRITEBACK 2 - movd [rsi+2], %1 - psrldq %1, 4 - - movd [rdi+2], %1 - psrldq %1, 4 - - movd [rsi+2*rax+2], %1 - psrldq %1, 4 - - movd [rdi+2*rax+2], %1 - - movd [rsi+4*rax+2], %2 - psrldq %2, 4 - - movd [rdi+4*rax+2], %2 - psrldq %2, 4 - - movd [rsi+2*rcx+2], %2 - psrldq %2, 4 - - movd [rdi+2*rcx+2], %2 -%endmacro - - -;void vp9_loop_filter_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; int count -;) -global sym(vp9_loop_filter_vertical_edge_sse2) -sym(vp9_loop_filter_vertical_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ; src_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax*2+rax] - - ;transpose 16x8 to 8x16, and store the 8-line result on stack. - TRANSPOSE_16X8 1, 1 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 1 - - ; start work on filters - B_FILTER 2 - - ; tranpose and write back - only work on q1, q0, p0, p1 - BV_TRANSPOSE - ; store 16-line result - - lea rdx, [rax] - neg rdx - - BV_WRITEBACK xmm1, xmm5 - - lea rsi, [rsi+rdx*8] - lea rdi, [rdi+rdx*8] - BV_WRITEBACK xmm2, xmm6 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_vertical_edge_uv_sse2 -;( -; unsigned char *u, -; int src_pixel_step, -; const char *blimit, -; const char *limit, -; const char *thresh, -; unsigned char *v -;) -global sym(vp9_loop_filter_vertical_edge_uv_sse2) -sym(vp9_loop_filter_vertical_edge_uv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 96 ; reserve 96 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; - - mov rsi, arg(0) ; u_ptr - movsxd rax, dword ptr arg(1) ; src_pixel_step - - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - lea rcx, [rax+2*rax] - - lea rdx, srct - - ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
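; For the UV variant the transpose gathers eight rows from the U plane
; and eight from the V plane (arg(5)), so a single 16-wide filter pass
; covers both chroma planes. The vertical-edge path as a whole is
; transpose, filter, transpose back; a scalar sketch, with hypothetical
; helper names:
;     transpose_16x8(u - 4, v - 4, pitch, tmp);  /* columns become rows */
;     filter_rows(tmp, blimit, limit, thresh);   /* same row filter     */
;     transpose_back(tmp, u, v, pitch);          /* only p1..q1 change  */
; The mask is the usual breakout test -- the edge is skipped where
; abs(p0-q0)*2 + abs(p1-q1)/2 > blimit or a neighbouring-pixel
; difference exceeds limit -- and the hev test (abs(p1-p0) or
; abs(q1-q0) above thresh) decides whether p1/q1 are also adjusted.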
- TRANSPOSE_16X8 0, 1 - - ; calculate filter mask and high edge variance - LFV_FILTER_MASK_HEV_MASK 1 - - ; start work on filters - B_FILTER 2 - - ; tranpose and write back - only work on q1, q0, p0, p1 - BV_TRANSPOSE - - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - - ; store 16-line result - BV_WRITEBACK xmm1, xmm5 - - mov rsi, arg(0) ; u_ptr - lea rsi, [rsi - 4] - lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing - BV_WRITEBACK xmm2, xmm6 - - add rsp, 96 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_loop_filter_simple_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp9_loop_filter_simple_horizontal_edge_sse2) -sym(vp9_loop_filter_simple_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - mov rdx, arg(2) ;blimit - movdqa xmm3, XMMWORD PTR [rdx] - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movdqa xmm1, [rsi+2*rax] ; p1 - movdqa xmm0, [rdi] ; q1 - movdqa xmm2, xmm1 - movdqa xmm7, xmm0 - movdqa xmm4, xmm0 - psubusb xmm0, xmm1 ; q1-=p1 - psubusb xmm1, xmm4 ; p1-=q1 - por xmm1, xmm0 ; abs(p1-q1) - pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw xmm1, 1 ; abs(p1-q1)/2 - - movdqa xmm5, [rsi+rax] ; p0 - movdqa xmm4, [rsi] ; q0 - movdqa xmm0, xmm4 ; q0 - movdqa xmm6, xmm5 ; p0 - psubusb xmm5, xmm4 ; p0-=q0 - psubusb xmm4, xmm6 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm3, xmm3 - pcmpeqb xmm5, xmm3 - - ; start work on filters - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb xmm2, xmm7 ; p1 - q1 - - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) - pand xmm5, xmm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - movdqa xmm1, xmm5 ; get a copy of filters - psraw xmm1, 11 ; arithmetic shift right 11 - psllw xmm1, 8 ; shift left 8 to put it back - - por xmm0, xmm1 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0 add - pxor xmm3, [GLOBAL(t80)] ; unoffset - movdqa [rsi], xmm3 ; write back - - ; now do +3 side - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result - - - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset - movdqa [rsi+rax], xmm6 ; write back - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - 
UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_simple_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp9_loop_filter_simple_vertical_edge_sse2) -sym(vp9_loop_filter_simple_vertical_edge_sse2): - push rbp ; save old base pointer value. - mov rbp, rsp ; set new base pointer value. - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx ; save callee-saved reg - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? - - lea rsi, [rsi - 2 ] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 - movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 - movd xmm2, [rdi] ; 13 12 11 10 - movd xmm3, [rcx] ; 53 52 51 50 - punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 - punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 - - movd xmm4, [rsi + rax*2] ; 23 22 21 20 - movd xmm5, [rdx + rax*2] ; 63 62 61 60 - movd xmm6, [rdi + rax*2] ; 33 32 31 30 - movd xmm7, [rcx + rax*2] ; 73 72 71 70 - punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 - punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 - - punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 - punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 - - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - - movdqa xmm2, xmm0 - punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - - movdqa t0, xmm0 ; save to t0 - movdqa t1, xmm2 ; save to t1 - - lea rsi, [rsi + rax*8] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm4, [rsi] ; 83 82 81 80 - movd xmm1, [rdx] ; c3 c2 c1 c0 - movd xmm6, [rdi] ; 93 92 91 90 - movd xmm3, [rcx] ; d3 d2 d1 d0 - punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 - punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 - - movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 - movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 - movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 - movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 - punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 - punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 - - punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 - punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 - - movdqa xmm1, xmm4 - punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 - - movdqa xmm6, xmm4 - punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - - movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - - punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - ; calculate mask - movdqa xmm6, xmm0 ; 
p1 - movdqa xmm7, xmm3 ; q1 - psubusb xmm7, xmm0 ; q1-=p1 - psubusb xmm6, xmm3 ; p1-=q1 - por xmm6, xmm7 ; abs(p1-q1) - pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw xmm6, 1 ; abs(p1-q1)/2 - - movdqa xmm5, xmm1 ; p0 - movdqa xmm4, xmm2 ; q0 - psubusb xmm5, xmm2 ; p0-=q0 - psubusb xmm4, xmm1 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit - movdqa xmm7, XMMWORD PTR [rdx] - - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm7, xmm7 - pcmpeqb xmm5, xmm7 ; mm5 = mask - - ; start work on filters - movdqa t0, xmm0 - movdqa t1, xmm3 - - pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb xmm0, xmm3 ; p1 - q1 - movdqa xmm6, xmm1 ; p0 - - movdqa xmm7, xmm2 ; q0 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm7 ; offseted ; q0 - - psubsb xmm7, xmm6 ; q0 - p0 - paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) - - pand xmm5, xmm0 ; mask filter values we don't care about - - - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - - movdqa xmm7, xmm5 ; get a copy of filters - psraw xmm7, 11 ; arithmetic shift right 11 - - psllw xmm7, 8 ; shift left 8 to put it back - por xmm0, xmm7 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0sz add - pxor xmm3, [GLOBAL(t80)] ; unoffset q0 - - ; now do +3 side - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - movdqa xmm0, xmm5 ; get a copy of filters - - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result - - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset p0 - - movdqa xmm0, t0 ; p1 - movdqa xmm4, t1 ; q1 - - ; transpose back to write out - ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - - movdqa xmm5, xmm3 - punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 - punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - - ; write out order: xmm0 xmm2 xmm1 xmm3 - lea rdx, [rsi + rax*4] - - movd [rsi], xmm1 ; write the second 8-line result - psrldq xmm1, 4 - movd [rdi], xmm1 - psrldq xmm1, 4 - movd [rsi + rax*2], xmm1 - psrldq xmm1, 4 - movd [rdi + rax*2], xmm1 - - movd [rdx], xmm3 - psrldq xmm3, 4 - movd [rcx], xmm3 - psrldq xmm3, 4 - movd [rdx + rax*2], xmm3 - psrldq xmm3, 4 - movd 
[rcx + rax*2], xmm3
-
-    neg         rax
-    lea         rsi, [rsi + rax*8]
-    neg         rax
-    lea         rdi, [rsi + rax]
-    lea         rdx, [rsi + rax*4]
-    lea         rcx, [rdx + rax]
-
-    movd        [rsi], xmm0                ; write the first 8-line result
-    psrldq      xmm0, 4
-    movd        [rdi], xmm0
-    psrldq      xmm0, 4
-    movd        [rsi + rax*2], xmm0
-    psrldq      xmm0, 4
-    movd        [rdi + rax*2], xmm0
-
-    movd        [rdx], xmm2
-    psrldq      xmm2, 4
-    movd        [rcx], xmm2
-    psrldq      xmm2, 4
-    movd        [rdx + rax*2], xmm2
-    psrldq      xmm2, 4
-    movd        [rcx + rax*2], xmm2
-
-    add rsp, 32
-    pop rsp
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop rbp
-    ret
-
-SECTION_RODATA
-align 16
-tfe:
-    times 16 db 0xfe
-align 16
-t80:
-    times 16 db 0x80
-align 16
-t1s:
-    times 16 db 0x01
-align 16
-t3:
-    times 16 db 0x03
-align 16
-t4:
-    times 16 db 0x04
-align 16
-ones:
-    times 8 dw 0x0001
-align 16
-s9:
-    times 8 dw 0x0900
-align 16
-s63:
-    times 8 dw 0x003f
diff --git a/vp9/common/x86/loopfilter_x86.c b/vp9/common/x86/loopfilter_x86.c
deleted file mode 100644
index 1ce654092..000000000
--- a/vp9/common/x86/loopfilter_x86.c
+++ /dev/null
@@ -1,547 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>  // SSE2
-#include "vpx_config.h"
-#include "vp9/common/loopfilter.h"
-
-prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
-
-prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
-prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
-
-extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
-extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
-
-#if HAVE_MMX
-/* Horizontal MB filtering */
-void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-}
-
-/* Vertical MB Filtering */
-void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
-                             unsigned char *u_ptr, unsigned char *v_ptr,
-                             int y_stride, int uv_stride,
-                             struct loop_filter_info *lfi) {
-}
-
-/* Horizontal B Filtering */
-void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
-                            unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride,
-                            struct loop_filter_info *lfi) {
-
-}
-
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
-                             const unsigned char *blimit) {
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
-                                             y_stride, blimit);
-  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
-                                             y_stride, blimit);
-}
-
-/* Vertical B Filtering */
-void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
-                            unsigned char *u_ptr, unsigned char *v_ptr,
-                            int y_stride, int uv_stride,
-                            struct loop_filter_info *lfi) {
-  vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-  vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
-                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
-
-  if (u_ptr)
-    vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
-                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
-
-  if
(v_ptr) - vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - -void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); -} -#endif - -#if HAVE_SSE2 -void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s, - int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh, - int count) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - __m128i mask, hev, flat; - const __m128i zero = _mm_set1_epi16(0); - __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit = _limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); - - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - { - const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), - _mm_subs_epu8(p0, p1)); - const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), - _mm_subs_epu8(q0, q1)); - const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); - const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); - __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), - _mm_subs_epu8(q0, p0)); - __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), - _mm_subs_epu8(q1, p1)); - __m128i work; - flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); - hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); - - abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); - abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); - mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); - // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; - mask = _mm_max_epu8(flat, mask); - // mask |= (abs(p1 - p0) > limit) * -1; - // mask |= (abs(q1 - q0) > limit) * -1; - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), - _mm_subs_epu8(p1, p2)), - _mm_or_si128(_mm_subs_epu8(p3, p2), - _mm_subs_epu8(p2, p3))); - mask = _mm_max_epu8(work, mask); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), - _mm_subs_epu8(q1, q2)), - _mm_or_si128(_mm_subs_epu8(q3, q2), - _mm_subs_epu8(q2, q3))); - mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); - mask = _mm_cmpeq_epi8(mask, zero); - - work = 
_mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), - _mm_subs_epu8(p0, p2)), - _mm_or_si128(_mm_subs_epu8(q2, q0), - _mm_subs_epu8(q0, q2))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), - _mm_subs_epu8(p0, p3)), - _mm_or_si128(_mm_subs_epu8(q3, q0), - _mm_subs_epu8(q0, q3))); - flat = _mm_max_epu8(work, flat); - work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), - _mm_subs_epu8(p0, p4)), - _mm_or_si128(_mm_subs_epu8(q4, q0), - _mm_subs_epu8(q0, q4))); - flat = _mm_max_epu8(work, flat); - flat = _mm_subs_epu8(flat, one); - flat = _mm_cmpeq_epi8(flat, zero); - flat = _mm_and_si128(flat, mask); - } - { - const __m128i four = _mm_set1_epi16(4); - unsigned char *src = s; - int i = 0; - do { - __m128i workp_a, workp_b, workp_shft; - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); - - workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1)); - workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); - workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4); - workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); - workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq2[i*8], - _mm_packus_epi16(workp_shft, workp_shft)); - - src += 8; - } while (++i < count); - } - // lp filter - { - const __m128i t4 = _mm_set1_epi8(4); - const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); - const __m128i t1f = _mm_set1_epi8(0x1f); - const __m128i t1 = _mm_set1_epi8(0x1); - const __m128i t7f = 
_mm_set1_epi8(0x7f); - - const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); - __m128i filt; - __m128i work_a; - __m128i filter1, filter2; - - filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); - work_a = _mm_subs_epi8(qs0, ps0); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - filt = _mm_adds_epi8(filt, work_a); - /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ - filt = _mm_and_si128(filt, mask); - - filter1 = _mm_adds_epi8(filt, t4); - filter2 = _mm_adds_epi8(filt, t3); - - /* Filter1 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter1); - filter1 = _mm_srli_epi16(filter1, 3); - work_a = _mm_and_si128(work_a, te0); - filter1 = _mm_and_si128(filter1, t1f); - filter1 = _mm_or_si128(filter1, work_a); - - /* Filter2 >> 3 */ - work_a = _mm_cmpgt_epi8(zero, filter2); - filter2 = _mm_srli_epi16(filter2, 3); - work_a = _mm_and_si128(work_a, te0); - filter2 = _mm_and_si128(filter2, t1f); - filter2 = _mm_or_si128(filter2, work_a); - - /* filt >> 1 */ - filt = _mm_adds_epi8(filter1, t1); - work_a = _mm_cmpgt_epi8(zero, filt); - filt = _mm_srli_epi16(filt, 1); - work_a = _mm_and_si128(work_a, t80); - filt = _mm_and_si128(filt, t7f); - filt = _mm_or_si128(filt, work_a); - - filt = _mm_andnot_si128(hev, filt); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_load_si128((__m128i *)flat_oq0); - work_a = _mm_andnot_si128(flat, work_a); - q0 = _mm_and_si128(flat, q0); - q0 = _mm_or_si128(work_a, q0); - - work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_load_si128((__m128i *)flat_oq1); - work_a = _mm_andnot_si128(flat, work_a); - q1 = _mm_and_si128(flat, q1); - q1 = _mm_or_si128(work_a, q1); - - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_load_si128((__m128i *)flat_oq2); - work_a = _mm_andnot_si128(flat, work_a); - q2 = _mm_and_si128(flat, q2); - q2 = _mm_or_si128(work_a, q2); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_load_si128((__m128i *)flat_op0); - work_a = _mm_andnot_si128(flat, work_a); - p0 = _mm_and_si128(flat, p0); - p0 = _mm_or_si128(work_a, p0); - - work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_load_si128((__m128i *)flat_op1); - work_a = _mm_andnot_si128(flat, work_a); - p1 = _mm_and_si128(flat, p1); - p1 = _mm_or_si128(work_a, p1); - - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_load_si128((__m128i *)flat_op2); - work_a = _mm_andnot_si128(flat, work_a); - p2 = _mm_and_si128(flat, p2); - p2 = _mm_or_si128(work_a, p2); - - if (count == 1) { - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); - } else { - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); - } - } -} - -static __inline void transpose(unsigned char *src[], int in_p, - unsigned char *dst[], int out_p, - int num_8x8_to_transpose) { - int idx8x8 = 
0; - __m128i x0, x1, x2, x3, x4, x5, x6, x7; - do { - unsigned char *in = src[idx8x8]; - unsigned char *out = dst[idx8x8]; - - x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 - x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 - x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 - x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 - x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 - x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 - x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 - x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 - // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - x0 = _mm_unpacklo_epi8(x0, x1); - // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(x2, x3); - // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(x4, x5); - // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(x6, x7); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - x4 = _mm_unpacklo_epi16(x0, x1); - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - x5 = _mm_unpacklo_epi16(x2, x3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - x6 = _mm_unpacklo_epi32(x4, x5); - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - x7 = _mm_unpackhi_epi32(x4, x5); - - _mm_storel_pd((double *)(out + 0*out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1*out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 - _mm_storel_pd((double *)(out + 2*out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3*out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 - - // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - x4 = _mm_unpackhi_epi16(x0, x1); - // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - x5 = _mm_unpackhi_epi16(x2, x3); - // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - x6 = _mm_unpacklo_epi32(x4, x5); - // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 - x7 = _mm_unpackhi_epi32(x4, x5); - - _mm_storel_pd((double *)(out + 4*out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5*out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 - _mm_storel_pd((double *)(out + 6*out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7*out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 - } while (++idx8x8 < num_8x8_to_transpose); -} - -void vp9_mbloop_filter_vertical_edge_c_sse2(unsigned char *s, - int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count) { - DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]); - unsigned char *src[4]; - unsigned char *dst[4]; - - src[0] = s - 5; - src[1] = s - 5 + 8; - src[2] = s - 5 + p*8; - src[3] = s - 5 + p*8 + 8; - - dst[0] = t_dst; - dst[1] = t_dst + 16*8; - dst[2] = t_dst + 8; - dst[3] = t_dst + 16*8 + 8; - - // 16x16->16x16 or 16x8->8x16 - transpose(src, p, dst, 16, (1 << count)); - - vp9_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit, - thresh, count); - - dst[0] = s - 5; - dst[1] = s - 5 + p*8; - - src[0] = t_dst; - src[1] = t_dst + 8; - - // 16x8->8x16 or 8x8->8x8 - transpose(src, 16, dst, p, (1 << (count - 1))); -} - -/* Horizontal MB filtering */ -void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr, - unsigned char *u_ptr, unsigned char 
*v_ptr, - int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 2); - - /* TODO: write sse2 version with u,v interleaved */ - if (u_ptr) - vp9_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); -} - -void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_mbloop_filter_horizontal_edge_c_sse2( - y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); -} - -/* Vertical MB Filtering */ -void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - - /* TODO: write sse2 version with u,v interleaved */ - if (u_ptr) - vp9_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp9_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); -} - -void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_mbloop_filter_vertical_edge_c_sse2( - y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); -} - -/* Horizontal B Filtering */ -void vp9_loop_filter_bh_sse2(unsigned char *y_ptr, - unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, - v_ptr + 4 * uv_stride); -} - -void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, - y_stride, blimit); -} - -/* Vertical B Filtering */ -void vp9_loop_filter_bv_sse2(unsigned char *y_ptr, - unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, - struct loop_filter_info *lfi) { - vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, - v_ptr + 4); -} - -void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); - 
vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); -} - -#endif diff --git a/vp9/common/x86/loopfilter_x86.h b/vp9/common/x86/loopfilter_x86.h deleted file mode 100644 index 25cf383c9..000000000 --- a/vp9/common/x86/loopfilter_x86.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef LOOPFILTER_X86_H -#define LOOPFILTER_X86_H - -/* Note: - * - * This platform is commonly built for runtime CPU detection. If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx); -extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx); -extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx); -#endif - -#if HAVE_SSE2 -extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2); -extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2); -extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2); -extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2); -#endif - -#endif // LOOPFILTER_X86_H diff --git a/vp9/common/x86/mask_sse3.asm b/vp9/common/x86/mask_sse3.asm deleted file mode 100644 index 0d90cfa86..000000000 --- a/vp9/common/x86/mask_sse3.asm +++ /dev/null @@ -1,484 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
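;
;  These routines implement a masked-prediction path: vp8_makemask_sse3
;  flags the pixels of a macroblock whose Y, U and V values all lie
;  within given tolerances of a target colour, vp8_growmaskmb_sse3
;  dilates that mask by one pixel, and the masked SAD / predictor
;  helpers below restrict matching and blending to the flagged pixels.
;  A scalar sketch of the mask test (ys/us/vs are the centre values,
;  yt/ut/vt the tolerances):
;      m[i] = (abs(y[i] - ys) < yt &&
;              abs(u[i] - us) < ut &&
;              abs(v[i] - vs) < vt) ? 0xff : 0;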
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;void int vp8_makemask_sse3( -; unsigned char *y, -; unsigned char *u, -; unsigned char *v, -; unsigned char *ym, -; unsigned char *uvm, -; int yp, -; int uvp, -; int ys, -; int us, -; int vs, -; int yt, -; int ut, -; int vt) -global sym(vp8_makemask_sse3) -sym(vp8_makemask_sse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 14 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;y - mov rdi, arg(1) ;u - mov rcx, arg(2) ;v - mov rax, arg(3) ;ym - movsxd rbx, dword arg(4) ;yp - movsxd rdx, dword arg(5) ;uvp - - pxor xmm0,xmm0 - - ;make 16 copies of the center y value - movd xmm1, arg(6) - pshufb xmm1, xmm0 - - ; make 16 copies of the center u value - movd xmm2, arg(7) - pshufb xmm2, xmm0 - - ; make 16 copies of the center v value - movd xmm3, arg(8) - pshufb xmm3, xmm0 - unpcklpd xmm2, xmm3 - - ;make 16 copies of the y tolerance - movd xmm3, arg(9) - pshufb xmm3, xmm0 - - ;make 16 copies of the u tolerance - movd xmm4, arg(10) - pshufb xmm4, xmm0 - - ;make 16 copies of the v tolerance - movd xmm5, arg(11) - pshufb xmm5, xmm0 - unpckhpd xmm4, xmm5 - - mov r8,8 - -NextPairOfRows: - - ;grab the y source values - movdqu xmm0, [rsi] - - ;compute abs difference between source and y target - movdqa xmm6, xmm1 - movdqa xmm7, xmm0 - psubusb xmm0, xmm1 - psubusb xmm6, xmm7 - por xmm0, xmm6 - - ;compute abs difference between - movdqa xmm6, xmm3 - pcmpgtb xmm6, xmm0 - - ;grab the y source values - add rsi, rbx - movdqu xmm0, [rsi] - - ;compute abs difference between source and y target - movdqa xmm11, xmm1 - movdqa xmm7, xmm0 - psubusb xmm0, xmm1 - psubusb xmm11, xmm7 - por xmm0, xmm11 - - ;compute abs difference between - movdqa xmm11, xmm3 - pcmpgtb xmm11, xmm0 - - - ;grab the u and v source values - movdqu xmm7, [rdi] - movdqu xmm8, [rcx] - unpcklpd xmm7, xmm8 - - ;compute abs difference between source and uv targets - movdqa xmm9, xmm2 - movdqa xmm10, xmm7 - psubusb xmm7, xmm2 - psubusb xmm9, xmm10 - por xmm7, xmm9 - - ;check whether the number is < tolerance - movdqa xmm0, xmm4 - pcmpgtb xmm0, xmm7 - - ;double u and v masks - movdqa xmm8, xmm0 - punpckhbw xmm0, xmm0 - punpcklbw xmm8, xmm8 - - ;mask row 0 and output - pand xmm6, xmm8 - pand xmm6, xmm0 - movdqa [rax],xmm6 - - ;mask row 1 and output - pand xmm11, xmm8 - pand xmm11, xmm0 - movdqa [rax+16],xmm11 - - - ; to the next row or set of rows - add rsi, rbx - add rdi, rdx - add rcx, rdx - add rax,32 - dec r8 - jnz NextPairOfRows - - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;GROW_HORIZ (register for result, source register or mem local) -; takes source and shifts left and ors with source -; then shifts right and ors with source -%macro GROW_HORIZ 2 - movdqa %1, %2 - movdqa xmm14, %1 - movdqa xmm15, %1 - pslldq xmm14, 1 - psrldq xmm15, 1 - por %1,xmm14 - por %1,xmm15 -%endmacro -;GROW_VERT (result, center row, above row, below row) -%macro GROW_VERT 4 - movdqa %1,%2 - por %1,%3 - por %1,%4 -%endmacro - -;GROW_NEXTLINE (new line to grow, new source, line to write) -%macro GROW_NEXTLINE 3 - GROW_HORIZ %1, %2 - GROW_VERT xmm3, xmm0, xmm1, xmm2 - movdqa %3,xmm3 -%endmacro - - -;void int vp8_growmaskmb_sse3( -; unsigned char *om, -; unsigned char *nm, -global sym(vp8_growmaskmb_sse3) -sym(vp8_growmaskmb_sse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src - mov rdi, arg(1) ;rst - - GROW_HORIZ xmm0, [rsi] - GROW_HORIZ xmm1, [rsi+16] - GROW_HORIZ xmm2, [rsi+32] - - GROW_VERT xmm3, xmm0, xmm1, 
xmm2 - por xmm0,xmm1 - movdqa [rdi], xmm0 - movdqa [rdi+16],xmm3 - - GROW_NEXTLINE xmm0,[rsi+48],[rdi+32] - GROW_NEXTLINE xmm1,[rsi+64],[rdi+48] - GROW_NEXTLINE xmm2,[rsi+80],[rdi+64] - GROW_NEXTLINE xmm0,[rsi+96],[rdi+80] - GROW_NEXTLINE xmm1,[rsi+112],[rdi+96] - GROW_NEXTLINE xmm2,[rsi+128],[rdi+112] - GROW_NEXTLINE xmm0,[rsi+144],[rdi+128] - GROW_NEXTLINE xmm1,[rsi+160],[rdi+144] - GROW_NEXTLINE xmm2,[rsi+176],[rdi+160] - GROW_NEXTLINE xmm0,[rsi+192],[rdi+176] - GROW_NEXTLINE xmm1,[rsi+208],[rdi+192] - GROW_NEXTLINE xmm2,[rsi+224],[rdi+208] - GROW_NEXTLINE xmm0,[rsi+240],[rdi+224] - - por xmm0,xmm2 - movdqa [rdi+240], xmm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - -;unsigned int vp8_sad16x16_masked_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; unsigned char *mask) -global sym(vp8_sad16x16_masked_wmt) -sym(vp8_sad16x16_masked_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rbx, arg(4) ;mask - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - mov rcx, 16 - - pxor xmm3, xmm3 - -NextSadRow: - movdqu xmm0, [rsi] - movdqu xmm1, [rdi] - movdqu xmm2, [rbx] - pand xmm0, xmm2 - pand xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm3, xmm0 - - add rsi, rax - add rdi, rdx - add rbx, 16 - - dec rcx - jnz NextSadRow - - movdqa xmm4 , xmm3 - psrldq xmm4, 8 - paddw xmm3, xmm4 - movq rax, xmm3 - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_sad16x16_unmasked_wmt( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; unsigned char *mask) -global sym(vp8_sad16x16_unmasked_wmt) -sym(vp8_sad16x16_unmasked_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rbx, arg(4) ;mask - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - mov rcx, 16 - - pxor xmm3, xmm3 - -next_vp8_sad16x16_unmasked_wmt: - movdqu xmm0, [rsi] - movdqu xmm1, [rdi] - movdqu xmm2, [rbx] - por xmm0, xmm2 - por xmm1, xmm2 - - psadbw xmm0, xmm1 - paddw xmm3, xmm0 - - add rsi, rax - add rdi, rdx - add rbx, 16 - - dec rcx - jnz next_vp8_sad16x16_unmasked_wmt - - movdqa xmm4 , xmm3 - psrldq xmm4, 8 - paddw xmm3, xmm4 - movq rax, xmm3 - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_masked_predictor_wmt( -; unsigned char *masked, -; unsigned char *unmasked, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; unsigned char *mask) -global sym(vp8_masked_predictor_wmt) -sym(vp8_masked_predictor_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;ref_ptr - - mov rbx, arg(5) ;mask - movsxd rax, dword ptr arg(2) ;src_stride - mov r11, arg(3) ; destination - movsxd rdx, dword ptr arg(4) ;dst_stride - - mov rcx, 16 - - pxor xmm3, xmm3 - -next_vp8_masked_predictor_wmt: - movdqu xmm0, [rsi] - movdqu xmm1, [rdi] - movdqu xmm2, [rbx] - - pand xmm0, xmm2 - pandn xmm2, xmm1 - por xmm0, xmm2 - movdqu [r11], xmm0 - - add r11, rdx - add rsi, rax - add rdi, rdx - add rbx, 16 - - dec rcx - jnz next_vp8_masked_predictor_wmt - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;unsigned int vp8_masked_predictor_uv_wmt( -; unsigned char *masked, -; unsigned 
char *unmasked, -; int src_stride, -; unsigned char *dst_ptr, -; int dst_stride, -; unsigned char *mask) -global sym(vp8_masked_predictor_uv_wmt) -sym(vp8_masked_predictor_uv_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;ref_ptr - - mov rbx, arg(5) ;mask - movsxd rax, dword ptr arg(2) ;src_stride - mov r11, arg(3) ; destination - movsxd rdx, dword ptr arg(4) ;dst_stride - - mov rcx, 8 - - pxor xmm3, xmm3 - -next_vp8_masked_predictor_uv_wmt: - movq xmm0, [rsi] - movq xmm1, [rdi] - movq xmm2, [rbx] - - pand xmm0, xmm2 - pandn xmm2, xmm1 - por xmm0, xmm2 - movq [r11], xmm0 - - add r11, rdx - add rsi, rax - add rdi, rax - add rbx, 8 - - dec rcx - jnz next_vp8_masked_predictor_uv_wmt - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;unsigned int vp8_uv_from_y_mask( -; unsigned char *ymask, -; unsigned char *uvmask) -global sym(vp8_uv_from_y_mask) -sym(vp8_uv_from_y_mask): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr - - - mov rcx, 8 - - pxor xmm3, xmm3 - -next_p8_uv_from_y_mask: - movdqu xmm0, [rsi] - pshufb xmm0, [shuf1b] ;[GLOBAL(shuf1b)] - movq [rdi],xmm0 - add rdi, 8 - add rsi,32 - - dec rcx - jnz next_p8_uv_from_y_mask - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 - diff --git a/vp9/common/x86/postproc_mmx.asm b/vp9/common/x86/postproc_mmx.asm deleted file mode 100644 index fa2152bab..000000000 --- a/vp9/common/x86/postproc_mmx.asm +++ /dev/null @@ -1,534 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - -;void vp9_post_proc_down_and_across_mmx -;( -; unsigned char *src_ptr, -; unsigned char *dst_ptr, -; int src_pixels_per_line, -; int dst_pixels_per_line, -; int rows, -; int cols, -; int flimit -;) -global sym(vp9_post_proc_down_and_across_mmx) -sym(vp9_post_proc_down_and_across_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - ; move the global rd onto the stack, since we don't have enough registers - ; to do PIC addressing - movq mm0, [GLOBAL(rd)] - sub rsp, 8 - movq [rsp], mm0 -%define RD [rsp] -%else -%define RD [GLOBAL(rd)] -%endif - - push rbx - lea rbx, [GLOBAL(Blur)] - movd mm2, dword ptr arg(6) ;flimit - punpcklwd mm2, mm2 - punpckldq mm2, mm2 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;dst_ptr - - movsxd rcx, DWORD PTR arg(4) ;rows - movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
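
For reference, the pand/psadbw idiom used by vp8_sad16x16_masked_wmt above maps onto SSE2 intrinsics roughly as follows. This is a minimal sketch, not part of the patch; it folds the two 64-bit partial sums with a byte shift rather than the bare movq used in the asm, and the function name is illustrative.

    #include <emmintrin.h>

    /* Sketch: zero the don't-care pixels in both source and reference with
     * the mask, then let psadbw accumulate |src - ref| per row. */
    static unsigned int sad16x16_masked_sketch(const unsigned char *src, int src_stride,
                                               const unsigned char *ref, int ref_stride,
                                               const unsigned char *mask) {
      __m128i acc = _mm_setzero_si128();
      int row;
      for (row = 0; row < 16; ++row) {
        __m128i s = _mm_loadu_si128((const __m128i *)(src + row * src_stride));
        __m128i r = _mm_loadu_si128((const __m128i *)(ref + row * ref_stride));
        __m128i m = _mm_loadu_si128((const __m128i *)(mask + row * 16));
        s = _mm_and_si128(s, m);                           /* pand  */
        r = _mm_and_si128(r, m);                           /* pand  */
        acc = _mm_add_epi64(acc, _mm_sad_epu8(s, r));      /* psadbw + paddw */
      }
      acc = _mm_add_epi64(acc, _mm_srli_si128(acc, 8));    /* fold high half */
      return (unsigned int)_mm_cvtsi128_si32(acc);
    }

The unmasked variant above is the same loop with por in place of pand, which forces the masked-off pixels to compare equal instead of to zero.
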
- pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - - xor rdx, rdx ; clear out rdx for use as loop counter -.nextcol: - - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps - movq mm3, [rsi] ; mm4 = r0 p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] ; mm6 = kernel 3 taps - movq mm5, [rsi + rax] ; mm4 = r1 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 - pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = r0 p0..p3 - psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 - psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 - paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers - movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 - psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - neg rax - movq mm6, [rbx ] ; kernel 0 taps - movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7 - punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] ; kernel 1 taps - movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 - punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. 
- paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = r0 p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3 - psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - - movd [rdi], mm1 ; - neg rax ; pitch is positive - - - add rsi, 4 - add rdi, 4 - add rdx, 4 - - cmp edx, dword ptr arg(5) ;cols - jl .nextcol - ; done with the all cols, start the across filtering in place - sub rsi, rdx - sub rdi, rdx - - - push rax - xor rdx, rdx - mov rax, [rdi-4]; - -.acrossnextcol: - pxor mm7, mm7 ; mm7 = 00000000 - movq mm6, [rbx + 32 ] ; - movq mm4, [rdi+rdx] ; mm4 = p0..p7 - movq mm3, mm4 ; mm3 = p0..p7 - punpcklbw mm3, mm0 ; mm3 = p0..p3 - movq mm1, mm3 ; mm1 = p0..p3 - pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers - - movq mm6, [rbx + 48] - psrlq mm4, 8 ; mm4 = p1..p7 - movq mm5, mm4 ; mm5 = p1..p7 - punpcklbw mm5, mm0 ; mm5 = p1..p4 - pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers - paddusw mm3, mm6 ; mm3 += mm6 - - ; thresholding - movq mm7, mm1 ; mm7 = p0..p3 - psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) - pcmpgtw mm7, mm2 - - movq mm6, [rbx + 64 ] - psrlq mm4, 8 ; mm4 = p2..p7 - movq mm5, mm4 ; mm5 = p2..p7 - punpcklbw mm5, mm0 ; mm5 = p2..p5 - pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - - movq mm6, [rbx ] - movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 - movq mm5, mm4 ; mm5 = p-2..p5 - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 - psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - movq mm6, [rbx + 16] - psrlq mm4, 8 ; mm4 = p-1..p5 - punpcklbw mm4, mm0 ; mm4 = p-1..p2 - pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. - paddusw mm3, mm6 ; mm3 += mm5 - - ; thresholding - movq mm6, mm1 ; mm6 = p0..p3 - psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4 - psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3 - paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw mm6, mm2 - por mm7, mm6 ; accumulate thresholds - - paddusw mm3, RD ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - - pand mm1, mm7 ; mm1 select vals > thresh from source - pandn mm7, mm3 ; mm7 select vals < thresh from blurred result - paddusw mm1, mm7 ; combination - - packuswb mm1, mm0 ; pack to bytes - mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes - movd eax, mm1 - - add rdx, 4 - cmp edx, dword ptr arg(5) ;cols - jl .acrossnextcol; - - mov DWORD PTR [rdi+rdx-4], eax - pop rax - - ; done with this rwo - add rsi,rax ; next line - movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch? - add rdi,rax ; next destination - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? 
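
Numerically, both the vertical and the across passes of vp9_post_proc_down_and_across_mmx implement the same per-pixel select, modelled below in scalar C. This is a sketch, not part of the patch: the tap array is a placeholder for one row of the Blur table addressed through rbx, and only the interior (non-edge) case is shown.

    #include <stdlib.h>

    /* Blur with a 5-tap kernel, but keep the original pixel whenever any
     * neighbour differs from the centre by more than flimit (the pcmpgtw /
     * por / pand / pandn sequence in the MMX code). */
    static unsigned char post_proc_pixel_sketch(const unsigned char *p, /* centre */
                                                int pitch, int flimit,
                                                const int tap[5] /* sums to 128 */) {
      int sum = 64; /* the 'rd' rounding constant, paired with psraw by 7 */
      int keep_source = 0;
      int k;
      for (k = -2; k <= 2; ++k) {
        sum += tap[k + 2] * p[k * pitch];
        if (k != 0 && abs(p[k * pitch] - p[0]) > flimit)
          keep_source = 1;
      }
      return keep_source ? p[0] : (unsigned char)(sum >> 7);
    }
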
- - dec rcx ; decrement count - jnz .nextrow ; next row - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret -%undef RD - - -;void vp9_mbpost_proc_down_mmx(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vp9_rv) -global sym(vp9_mbpost_proc_down_mmx) -sym(vp9_mbpost_proc_down_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 136 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax -%define flimit2 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vp9_rv))] -%endif - - ;rows +=8; - add dword ptr arg(2), 8 - - ;for(c=0; c thresh from source - pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; combination - - packuswb xmm1, xmm0 ; pack to bytes - movq QWORD PTR [rdi], xmm1 ; - - neg rax ; pitch is positive - add rsi, 8 - add rdi, 8 - - add rdx, 8 - cmp edx, dword arg(5) ;cols - - jl .nextcol - - ; done with the all cols, start the across filtering in place - sub rsi, rdx - sub rdi, rdx - - xor rdx, rdx - movq mm0, QWORD PTR [rdi-8]; - -.acrossnextcol: - movq xmm7, QWORD PTR [rdi +rdx -2] - movd xmm4, DWORD PTR [rdi +rdx +6] - - pslldq xmm4, 8 - por xmm4, xmm7 - - movdqa xmm3, xmm4 - psrldq xmm3, 2 - punpcklbw xmm3, xmm0 ; mm3 = p0..p3 - movdqa xmm1, xmm3 ; mm1 = p0..p3 - psllw xmm3, 2 - - - movdqa xmm5, xmm4 - psrldq xmm5, 3 - punpcklbw xmm5, xmm0 ; mm5 = p1..p4 - paddusw xmm3, xmm5 ; mm3 += mm6 - - ; thresholding - movdqa xmm7, xmm1 ; mm7 = p0..p3 - psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4) - pcmpgtw xmm7, xmm2 - - movdqa xmm5, xmm4 - psrldq xmm5, 4 - punpcklbw xmm5, xmm0 ; mm5 = p2..p5 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = p0..p3 - psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - - movdqa xmm5, xmm4 ; mm5 = p-2..p5 - punpcklbw xmm5, xmm0 ; mm5 = p-2..p1 - paddusw xmm3, xmm5 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = p0..p3 - psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 - psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - psrldq xmm4, 1 ; mm4 = p-1..p5 - punpcklbw xmm4, xmm0 ; mm4 = p-1..p2 - paddusw xmm3, xmm4 ; mm3 += mm5 - - ; thresholding - movdqa xmm6, xmm1 ; mm6 = p0..p3 - psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4 - psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3 - paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4) - pcmpgtw xmm6, xmm2 - por xmm7, xmm6 ; accumulate thresholds - - paddusw xmm3, RD42 ; mm3 += round value - psraw xmm3, 3 ; mm3 /= 8 - - pand xmm1, xmm7 ; mm1 select vals > thresh from source - pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result - paddusw xmm1, xmm7 ; combination - - packuswb xmm1, xmm0 ; pack to bytes - movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes - movdq2q mm0, xmm1 - - add rdx, 8 - cmp edx, dword arg(5) ;cols - jl .acrossnextcol; - - ; last 8 pixels - movq QWORD PTR [rdi+rdx-8], mm0 - - ; done with this rwo - add rsi,rax ; next line - mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch? 
- add rdi,rax ; next destination - mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch? - - dec rcx ; decrement count - jnz .nextrow ; next row - -%if ABI_IS_32BIT=1 && CONFIG_PIC=1 - add rsp,16 - pop rsp -%endif - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -%undef RD42 - - -;void vp9_mbpost_proc_down_xmm(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vp9_rv) -global sym(vp9_mbpost_proc_down_xmm) -sym(vp9_mbpost_proc_down_xmm): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 128+16 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax - mov [rsp+128+8], eax - mov [rsp+128+12], eax -%define flimit4 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vp9_rv))] -%endif - - ;rows +=8; - add dword arg(2), 8 - - ;for(c=0; cmode_info_context->mbmi.uv_mode; - build_intra_pred_mbuv_fn_t fn; - int src_stride = xd->dst.uv_stride; - - switch (mode) { - case V_PRED: - fn = vp9_intra_pred_uv_ve_mmx; - break; - case H_PRED: - fn = ho_fn; - break; - case TM_PRED: - fn = tm_fn; - break; - case DC_PRED: - if (xd->up_available) { - if (xd->left_available) { - fn = vp9_intra_pred_uv_dc_mmx2; - break; - } else { - fn = vp9_intra_pred_uv_dctop_mmx2; - break; - } - } else if (xd->left_available) { - fn = vp9_intra_pred_uv_dcleft_mmx2; - break; - } else { - fn = vp9_intra_pred_uv_dc128_mmx; - break; - } - break; - default: - return; - } - - fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride); - fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride); -} - -void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, &xd->predictor[256], - &xd->predictor[320], 8, - vp9_intra_pred_uv_tm_sse2, - vp9_intra_pred_uv_ho_mmx2); -} - -void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, &xd->predictor[256], - &xd->predictor[320], 8, - vp9_intra_pred_uv_tm_ssse3, - vp9_intra_pred_uv_ho_ssse3); -} - -void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer, - xd->dst.v_buffer, xd->dst.uv_stride, - vp9_intra_pred_uv_tm_sse2, - vp9_intra_pred_uv_ho_mmx2); -} - -void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) { - build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer, - xd->dst.v_buffer, xd->dst.uv_stride, - vp9_intra_pred_uv_tm_ssse3, - vp9_intra_pred_uv_ho_ssse3); -} diff --git a/vp9/common/x86/sadmxn_x86.c b/vp9/common/x86/sadmxn_x86.c deleted file mode 100644 index 0b783ccea..000000000 --- a/vp9/common/x86/sadmxn_x86.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include // SSE2 -#include "./vpx_config.h" -#include "./vp9_rtcd.h" -#include "vpx/vpx_integer.h" - -#if HAVE_SSE2 -unsigned int vp9_sad16x3_sse2( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - __m128i s0, s1, s2; - __m128i r0, r1, r2; - __m128i sad; - - s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride)); - s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride)); - s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride)); - - r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride)); - r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride)); - r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride)); - - sad = _mm_sad_epu8(s0, r0); - sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1)); - sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2)); - sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8)); - - return _mm_cvtsi128_si32(sad); -} - -unsigned int vp9_sad3x16_sse2( - const unsigned char *src_ptr, - int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - int r; - __m128i s0, s1, s2, s3; - __m128i r0, r1, r2, r3; - __m128i sad = _mm_setzero_si128(); - __m128i mask; - const int offset = (uintptr_t)src_ptr & 3; - - /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off. - * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd - * takes much less time. - */ - if (offset == 1) - src_ptr -= 1; - - /* mask = 0xffffffffffff0000ffffffffffff0000 */ - mask = _mm_cmpeq_epi32(sad, sad); - mask = _mm_slli_epi64(mask, 16); - - for (r = 0; r < 16; r += 4) { - s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride)); - s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride)); - s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride)); - s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride)); - r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride)); - r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride)); - r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride)); - r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride)); - - s0 = _mm_unpacklo_epi8(s0, s1); - r0 = _mm_unpacklo_epi8(r0, r1); - s2 = _mm_unpacklo_epi8(s2, s3); - r2 = _mm_unpacklo_epi8(r2, r3); - s0 = _mm_unpacklo_epi64(s0, s2); - r0 = _mm_unpacklo_epi64(r0, r2); - - // throw out extra byte - if (offset == 1) - s0 = _mm_and_si128(s0, mask); - else - s0 = _mm_slli_epi64(s0, 16); - r0 = _mm_slli_epi64(r0, 16); - - sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0)); - - src_ptr += src_stride*4; - ref_ptr += ref_stride*4; - } - - sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8)); - return _mm_cvtsi128_si32(sad); -} - -#endif diff --git a/vp9/common/x86/subpixel_8t_ssse3.asm b/vp9/common/x86/subpixel_8t_ssse3.asm deleted file mode 100644 index dd89710e8..000000000 --- a/vp9/common/x86/subpixel_8t_ssse3.asm +++ /dev/null @@ -1,550 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
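
As a reference model (not part of the patch), the arithmetic vp9_sad3x16_sse2 above must match is simply a 3-wide, 16-tall sum of absolute differences; the offset adjustment and the 0xffff...0000 mask in the SSE2 version only deal with load alignment and with discarding the fourth loaded byte, and do not change the result.

    #include <stdlib.h>

    static unsigned int sad3x16_ref(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride) {
      unsigned int sad = 0;
      int r, c;
      for (r = 0; r < 16; ++r) {
        for (c = 0; c < 3; ++c)
          sad += (unsigned int)abs(src[c] - ref[c]);
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }
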
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ - -;void vp9_filter_block1d8_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d8_v8_ssse3) -sym(vp9_filter_block1d8_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - lea rbx, [rdx + rdx*4] - add rbx, rdx ;pitch * 6 - -.vp9_filter_block1d8_v8_ssse3_loop: - movq xmm0, [rsi] ;A - movq xmm1, [rsi + rdx] ;B - movq xmm2, [rsi + rdx * 2] ;C - movq xmm3, [rax + rdx * 2] ;D - movq xmm4, [rsi + rdx * 4] ;E - movq xmm5, [rax + rdx * 4] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - movq xmm6, [rsi + rbx] ;G - movq xmm7, [rax + rbx] ;H - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm2 - paddsw xmm0, krd - paddsw xmm4, xmm6 - paddsw xmm0, xmm4 - - psraw xmm0, 7 - packuswb xmm0, xmm0 - - add rsi, rdx - add rax, rdx - - movq [rdi], xmm0 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d16_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_v8_ssse3) -sym(vp9_filter_block1d16_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) 
;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - lea rbx, [rdx + rdx*4] - add rbx, rdx ;pitch * 6 - -.vp9_filter_block1d16_v8_ssse3_loop: - movq xmm0, [rsi] ;A - movq xmm1, [rsi + rdx] ;B - movq xmm2, [rsi + rdx * 2] ;C - movq xmm3, [rax + rdx * 2] ;D - movq xmm4, [rsi + rdx * 4] ;E - movq xmm5, [rax + rdx * 4] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - movq xmm6, [rsi + rbx] ;G - movq xmm7, [rax + rbx] ;H - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm2 - paddsw xmm0, krd - paddsw xmm4, xmm6 - paddsw xmm0, xmm4 - - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movq [rdi], xmm0 - - movq xmm0, [rsi + 8] ;A - movq xmm1, [rsi + rdx + 8] ;B - movq xmm2, [rsi + rdx * 2 + 8] ;C - movq xmm3, [rax + rdx * 2 + 8] ;D - movq xmm4, [rsi + rdx * 4 + 8] ;E - movq xmm5, [rax + rdx * 4 + 8] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - - movq xmm6, [rsi + rbx + 8] ;G - movq xmm7, [rax + rbx + 8] ;H - punpcklbw xmm6, xmm7 ;G H - - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm2 - paddsw xmm4, xmm6 - paddsw xmm0, krd - paddsw xmm0, xmm4 - - psraw xmm0, 7 - packuswb xmm0, xmm0 - - add rsi, rdx - add rax, rdx - - movq [rdi+8], xmm0 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d8_h8_ssse3) -sym(vp9_filter_block1d8_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 -; movdqa krd, xmm5 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rdx, dword 
ptr arg(3) ;output_pitch - movsxd rcx, dword ptr arg(4) ;output_height - -.filter_block1d8_h8_rowloop_ssse3: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 - punpcklqdq xmm0, xmm3 - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm4 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - movq [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d8_h8_rowloop_ssse3 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d16_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_h8_ssse3) -sym(vp9_filter_block1d16_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rdx, dword ptr arg(3) ;output_pitch - movsxd rcx, dword ptr arg(4) ;output_height - -.filter_block1d16_h8_rowloop_ssse3: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 - punpcklqdq xmm0, xmm3 - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 - - - movq xmm3, [rsi + 5] -; movq xmm7, [rsi + 12] - movq xmm7, [rsi + 13] -;note: same as above -; punpcklbw xmm3, xmm7 - punpcklqdq xmm3, xmm7 - - movdqa xmm1, xmm3 - pshufb xmm3, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm3, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm3, xmm1 
- paddsw xmm3, xmm2 - paddsw xmm3, krd - paddsw xmm3, xmm4 - psraw xmm3, 7 - packuswb xmm3, xmm3 - punpcklqdq xmm0, xmm3 - - lea rsi, [rsi + rax] - movdqa [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h8_rowloop_ssse3 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -shuf_t0t1: - db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -align 16 -shuf_t2t3: - db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -align 16 -shuf_t4t5: - db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -align 16 -shuf_t6t7: - db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 diff --git a/vp9/common/x86/subpixel_mmx.asm b/vp9/common/x86/subpixel_mmx.asm deleted file mode 100644 index 2f757fa80..000000000 --- a/vp9/common/x86/subpixel_mmx.asm +++ /dev/null @@ -1,727 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - - -%define BLOCK_HEIGHT_WIDTH 4 -%define vp9_filter_weight 128 -%define VP9_FILTER_SHIFT 7 - - -;void vp9_filter_block1d_h6_mmx -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1d_h6_mmx) -sym(vp9_filter_block1d_h6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - - movq mm1, [rdx + 16] ; do both the negative taps first!!! - movq mm2, [rdx + 32] ; - movq mm6, [rdx + 48] ; - movq mm7, [rdx + 64] ; - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - movq mm3, [rsi-2] ; mm3 = p-2..p5 - movq mm4, mm3 ; mm4 = p-2..p5 - psrlq mm3, 8 ; mm3 = p-1..p5 - punpcklbw mm3, mm0 ; mm3 = p-1..p2 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. 
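
The SSSE3 8-tap filters in the file above all rely on the same pmaddubsw pairing trick: the 16-bit taps are packed to signed bytes, broadcast in pairs (k0k1 .. k6k7), and each pmaddubsw multiplies an interleaved pair of rows by a tap pair and adds the two products in one step. Below is a hedged intrinsics sketch of one 8-wide vertical output row; it is not part of the patch, it omits the asm's loop over output_height, and it orders the saturating adds slightly differently.

    #include <tmmintrin.h> /* SSSE3 */

    /* 'taps' holds the 8 filter coefficients as 16-bit values, as in arg(5). */
    static void eight_tap_v_sketch(const unsigned char *src, int pitch,
                                   unsigned char *dst, const short taps[8]) {
      __m128i f = _mm_loadu_si128((const __m128i *)taps);
      f = _mm_packs_epi16(f, f);                  /* packsswb: taps to signed bytes */
      __m128i k0k1 = _mm_unpacklo_epi64(_mm_shufflelo_epi16(f, 0x00),
                                        _mm_shufflelo_epi16(f, 0x00));
      __m128i k2k3 = _mm_unpacklo_epi64(_mm_shufflelo_epi16(f, 0x55),
                                        _mm_shufflelo_epi16(f, 0x55));
      __m128i k4k5 = _mm_unpacklo_epi64(_mm_shufflelo_epi16(f, 0xaa),
                                        _mm_shufflelo_epi16(f, 0xaa));
      __m128i k6k7 = _mm_unpacklo_epi64(_mm_shufflelo_epi16(f, 0xff),
                                        _mm_shufflelo_epi16(f, 0xff));

      /* Interleave the 8 source rows pairwise: A|B, C|D, E|F, G|H. */
      __m128i ab = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src + 0 * pitch)),
                                     _mm_loadl_epi64((const __m128i *)(src + 1 * pitch)));
      __m128i cd = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src + 2 * pitch)),
                                     _mm_loadl_epi64((const __m128i *)(src + 3 * pitch)));
      __m128i ef = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src + 4 * pitch)),
                                     _mm_loadl_epi64((const __m128i *)(src + 5 * pitch)));
      __m128i gh = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(src + 6 * pitch)),
                                     _mm_loadl_epi64((const __m128i *)(src + 7 * pitch)));

      __m128i sum = _mm_adds_epi16(_mm_maddubs_epi16(ab, k0k1),
                                   _mm_maddubs_epi16(cd, k2k3));
      sum = _mm_adds_epi16(sum, _mm_maddubs_epi16(ef, k4k5));
      sum = _mm_adds_epi16(sum, _mm_maddubs_epi16(gh, k6k7));
      sum = _mm_adds_epi16(sum, _mm_set1_epi16(64)); /* krd rounding */
      sum = _mm_srai_epi16(sum, 7);
      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(sum, _mm_setzero_si128()));
    }
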
- - movq mm5, mm4 ; mm5 = p-2..p5 - punpckhbw mm4, mm0 ; mm5 = p2..p5 - pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - movq mm4, mm5 ; mm4 = p-2..p5; - psrlq mm5, 16 ; mm5 = p0..p5; - punpcklbw mm5, mm0 ; mm5 = p0..p3 - pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - movq mm5, mm4 ; mm5 = p-2..p5 - psrlq mm4, 24 ; mm4 = p1..p5 - punpcklbw mm4, mm0 ; mm4 = p1..p4 - pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - ; do outer positive taps - movd mm4, [rsi+3] - punpcklbw mm4, mm0 ; mm5 = p3..p6 - pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - paddsw mm3, [GLOBAL(rd)] ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and unpack to saturate - punpcklbw mm3, mm0 ; - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line - add rdi, rax; -%else - movsxd r8, dword ptr arg(2) ;src_pixels_per_line - add rdi, rax; - - add rsi, r8 ; next line -%endif - - dec rcx ; decrement count - jnz .nextrow ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1dc_v6_mmx -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int output_pitch, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1dc_v6_mmx) -sym(vp9_filter_block1dc_v6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movq mm5, [GLOBAL(rd)] - push rbx - mov rbx, arg(7) ;vp9_filter - movq mm1, [rbx + 16] ; do both the negative taps first!!! - movq mm2, [rbx + 32] ; - movq mm6, [rbx + 48] ; - movq mm7, [rbx + 64] ; - - movsxd rdx, dword ptr arg(3) ;pixels_per_line - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - sub rsi, rdx - sub rsi, rdx - movsxd rcx, DWORD PTR arg(5) ;output_height - movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - - -.nextrow_cv: - movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. - - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 - pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 - pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi] ; mm4 = p0..p3 = row -2 - pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - add rsi, rdx ; move source forward 1 line to avoid 3 * pitch - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 - pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 - pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - paddsw mm3, mm5 ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and saturate - - movd [rdi],mm3 ; store the results in the destination - ; the subsequent iterations repeat 3 out of 4 of these reads. Since the - ; recon block should be in cache this shouldn't cost much. Its obviously - ; avoidable!!!. 
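
Numerically, both MMX 6-tap routines in this file reduce to the scalar model below: tap j (stored 8-wide at byte offset 16*j of the vp9_filter argument) multiplies x[j-2], followed by 'rd' rounding, a shift by VP9_FILTER_SHIFT, and packuswb saturation. A sketch for reference, not part of the patch; the first-pass output is stored as 16-bit values, as in the asm.

    static void filter_6tap_ref(const unsigned char *src, unsigned short *dst,
                                int width, const short tap[6]) {
      int i, k;
      for (i = 0; i < width; ++i) {
        int sum = 64; /* rd */
        for (k = -2; k <= 3; ++k)
          sum += tap[k + 2] * src[i + k];
        sum >>= 7; /* VP9_FILTER_SHIFT */
        if (sum < 0) sum = 0;
        if (sum > 255) sum = 255; /* packuswb clamp */
        dst[i] = (unsigned short)sum;
      }
    }
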
- lea rdi, [rdi+rax] ; - dec rcx ; decrement count - jnz .nextrow_cv ; next row - - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void bilinear_predict8x8_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict8x8_mmx) -sym(vp9_bilinear_predict8x8_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = bilinear_filters_mmx[xoffset]; - ;const short *VFilter = bilinear_filters_mmx[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - shl rax, 5 ; offset * 32 - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))] - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;dst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - - shl rax, 5 ; offset*32 - add rax, rcx ; VFilter - - lea rcx, [rdi+rdx*8] ; - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - - - ; get the first horizontal line done ; - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP9_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - add rsi, rdx ; next line -.next_row_8x8: - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - movq mm5, mm7 ; - movq mm6, mm7 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 - - pmullw mm5, [rax] ; - pmullw mm6, [rax] ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP9_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - - pmullw mm3, [rax+16] ; - pmullw mm4, [rax+16] ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP9_FILTER_SHIFT ; - - packuswb mm3, mm4 - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch - add rsi, rdx ; next line - add rdi, r8 ;dst_pitch -%endif - cmp rdi, rcx ; - jne .next_row_8x8 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void bilinear_predict8x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict8x4_mmx) -sym(vp9_bilinear_predict8x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end 
prolog - - ;const short *HFilter = bilinear_filters_mmx[xoffset]; - ;const short *VFilter = bilinear_filters_mmx[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))] - shl rax, 5 - - mov rsi, arg(0) ;src_ptr ; - add rax, rcx - - movsxd rdx, dword ptr arg(5) ;dst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP9_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - add rsi, rdx ; next line -.next_row_8x4: - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - movq mm5, mm7 ; - movq mm6, mm7 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 - - pmullw mm5, [rax] ; - pmullw mm6, [rax] ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP9_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - - pmullw mm3, [rax+16] ; - pmullw mm4, [rax+16] ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP9_FILTER_SHIFT ; - - packuswb mm3, mm4 - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch - add rsi, rdx ; next line - add rdi, r8 -%endif - cmp rdi, rcx ; - jne .next_row_8x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void bilinear_predict4x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict4x4_mmx) -sym(vp9_bilinear_predict4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = bilinear_filters_mmx[xoffset]; - ;const short *VFilter = bilinear_filters_mmx[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))] - shl rax, 5 - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;ldst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; 
get the first horizontal line done ; - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movq mm7, mm3 ; - packuswb mm7, mm0 ; - - add rsi, rdx ; next line -.next_row_4x4: - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - - movq mm5, mm7 ; - punpcklbw mm5, mm0 ; - - pmullw mm5, [rax] ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - movq mm7, mm3 ; - - packuswb mm7, mm0 ; - - pmullw mm3, [rax+16] ; - paddw mm3, mm5 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - packuswb mm3, mm0 - movd [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch ; - add rsi, rdx ; next line - add rdi, r8 -%endif - - cmp rdi, rcx ; - jne .next_row_4x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - -SECTION_RODATA -align 16 -rd: - times 4 dw 0x40 - -align 16 -global HIDDEN_DATA(sym(vp9_six_tap_mmx)) -sym(vp9_six_tap_mmx): - times 8 dw 0 - times 8 dw 0 - times 8 dw 128 - times 8 dw 0 - times 8 dw 0 - times 8 dw 0 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 123 - times 8 dw 12 - times 8 dw -1 - times 8 dw 0 - - times 8 dw 2 - times 8 dw -11 - times 8 dw 108 - times 8 dw 36 - times 8 dw -8 - times 8 dw 1 - - times 8 dw 0 - times 8 dw -9 - times 8 dw 93 - times 8 dw 50 - times 8 dw -6 - times 8 dw 0 - - times 8 dw 3 - times 8 dw -16 - times 8 dw 77 - times 8 dw 77 - times 8 dw -16 - times 8 dw 3 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 50 - times 8 dw 93 - times 8 dw -9 - times 8 dw 0 - - times 8 dw 1 - times 8 dw -8 - times 8 dw 36 - times 8 dw 108 - times 8 dw -11 - times 8 dw 2 - - times 8 dw 0 - times 8 dw -1 - times 8 dw 12 - times 8 dw 123 - times 8 dw -6 - times 8 dw 0 - - -align 16 -global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx)) -sym(vp9_bilinear_filters_8x_mmx): - times 8 dw 128 - times 8 dw 0 - - times 8 dw 112 - times 8 dw 16 - - times 8 dw 96 - times 8 dw 32 - - times 8 dw 80 - times 8 dw 48 - - times 8 dw 64 - times 8 dw 64 - - times 8 dw 48 - times 8 dw 80 - - times 8 dw 32 - times 8 dw 96 - - times 8 dw 16 - times 8 dw 112 diff --git a/vp9/common/x86/subpixel_sse2.asm b/vp9/common/x86/subpixel_sse2.asm deleted file mode 100644 index f62587406..000000000 --- a/vp9/common/x86/subpixel_sse2.asm +++ /dev/null @@ -1,1372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
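
The bilinear predictors above are a separable two-tap filter. With the tap pairs from vp9_bilinear_filters_8x_mmx being (128 - 16*offset, 16*offset), each output pixel reduces to the scalar model below (a sketch, not part of the patch; offsets are in 0..7):

    static unsigned char bilinear_pixel_sketch(const unsigned char *src, int stride,
                                               int xoffset, int yoffset) {
      const int hf0 = 128 - (xoffset << 4), hf1 = xoffset << 4;
      const int vf0 = 128 - (yoffset << 4), vf1 = yoffset << 4;
      /* First pass: horizontal filter on two adjacent rows. */
      int a = (src[0]      * hf0 + src[1]          * hf1 + 64) >> 7;
      int b = (src[stride] * hf0 + src[stride + 1] * hf1 + 64) >> 7;
      /* Second pass: vertical filter on the first-pass results. */
      return (unsigned char)((a * vf0 + b * vf1 + 64) >> 7);
    }
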
-; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -;*************************************************************************************/ -;void vp9_filter_block1d8_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) -global sym(vp9_filter_block1d8_h6_sse2) -sym(vp9_filter_block1d8_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) 
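
One detail worth noting in the row loop above is the load splice: two movq loads are joined with pslldq/por so that a single register holds x[-2]..x[13], and the input for every tap can then be produced by byte-shifting that one value. In intrinsics terms (a sketch, not part of the patch):

    #include <emmintrin.h>

    static __m128i load_row_sketch(const unsigned char *src) {
      __m128i lo = _mm_loadl_epi64((const __m128i *)(src - 2)); /* x[-2]..x[5]  */
      __m128i hi = _mm_loadl_epi64((const __m128i *)(src + 6)); /* x[6]..x[13]  */
      return _mm_or_si128(_mm_slli_si128(hi, 8), lo);           /* pslldq + por */
    }
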
-;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -;*************************************************************************************/ -global sym(vp9_filter_block1d16_h6_sse2) -sym(vp9_filter_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 
xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi+16], xmm4 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_sse2 -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. -;*************************************************************************************/ -global sym(vp9_filter_block1d8_v6_sse2) -sym(vp9_filter_block1d8_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_sse2_loop: - movdqa xmm1, XMMWORD PTR [rsi] - pmullw xmm1, [rax] - - movdqa xmm2, XMMWORD PTR [rsi + rdx] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] - pmullw xmm3, [rax + 32] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] - pmullw xmm5, [rax + 64] - - add rsi, rdx - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] - - pmullw xmm4, [rax + 48] - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] - - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_v6_sse2 -;( -; unsigned short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; const short *vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. 
-;*************************************************************************************/ -global sym(vp9_filter_block1d16_v6_sse2) -sym(vp9_filter_block1d16_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d16_v6_sse2_loop: -; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. - movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 - movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] - pmullw xmm1, [rax + 16] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm3, [rax + 64] - pmullw xmm4, [rax + 64] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm5, [rax + 32] - pmullw xmm6, [rax + 32] - - movdqa xmm7, XMMWORD PTR [rsi] ; line 1 - movdqa xmm0, XMMWORD PTR [rsi + 16] - pmullw xmm7, [rax] - pmullw xmm0, [rax] - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - paddsw xmm1, xmm7 - paddsw xmm2, xmm0 - - add rsi, rdx - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm3, [rax + 48] - pmullw xmm4, [rax + 48] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm5, [rax + 80] - pmullw xmm6, [rax + 80] - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] - pxor xmm0, xmm0 ; clear xmm0 - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - - paddsw xmm1, xmm7 - paddsw xmm2, xmm7 - - psraw xmm1, 7 - psraw xmm2, 7 - - packuswb xmm1, xmm2 ; pack and saturate - movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d16_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d8_h6_only_sse2) -sym(vp9_filter_block1d8_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_only_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - 
punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - - movq QWORD PTR [rdi], xmm4 ; store the results in the destination - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_only_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d16_h6_only_sse2) -sym(vp9_filter_block1d16_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_only_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - 
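For readers following the tap walkthrough above, the whole per-pixel computation reduces to a short scalar model. This is an illustrative C sketch, not code from the tree: the constant names mirror the %defines at the top of the file, and clamp_u8 stands in for packuswb saturation (the asm accumulates with saturating paddsw, which agrees with the plain int sum for in-range filters).

    #include <stdint.h>

    #define VP9_FILTER_SHIFT 7
    #define ROUND (1 << (VP9_FILTER_SHIFT - 1))  /* the 0x40 'rd' constant */

    /* packuswb clamps signed words into the 0..255 byte range */
    static uint8_t clamp_u8(int v) {
      return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* One row of the 6-tap horizontal pass: taps touch src[x-2]..src[x+3];
     * the rounded sum is shifted down by VP9_FILTER_SHIFT (the taps sum to
     * VP9_FILTER_WEIGHT == 128). */
    static void filter_h6_c(const uint8_t *src, uint8_t *dst, int width,
                            const int16_t filter[6]) {
      for (int x = 0; x < width; ++x) {
        int sum = 0;
        for (int t = 0; t < 6; ++t)
          sum += src[x - 2 + t] * filter[t];
        dst[x] = clamp_u8((sum + ROUND) >> VP9_FILTER_SHIFT);
      }
    }

The SIMD code computes this for eight (or sixteen) values of x at once, one tap at a time, which is why each pmullw above reads a block of eight identical filter words.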
paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; lower 8 bytes - - movq QWORD Ptr [rdi], xmm4 ; store the results in the destination - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; higher 8 bytes - - movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_only_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; Second-pass filter only when xoffset==0 -global sym(vp9_filter_block1d8_v6_only_sse2) -sym(vp9_filter_block1d8_v6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - mov rax, arg(5) ;vp9_filter - - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_only_sse2_loop: - movq xmm1, MMWORD PTR [rsi] - movq xmm2, MMWORD PTR [rsi + rdx] - movq xmm3, MMWORD PTR [rsi + rdx * 2] - movq xmm5, MMWORD PTR [rsi + rdx * 4] - add rsi, rdx - movq xmm4, MMWORD PTR [rsi + rdx * 2] - movq xmm6, MMWORD PTR [rsi + rdx * 4] - - punpcklbw xmm1, xmm0 - pmullw xmm1, [rax] - - punpcklbw xmm2, xmm0 - pmullw xmm2, [rax + 16] - - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax + 32] - - punpcklbw xmm5, xmm0 - pmullw xmm5, [rax + 64] - - punpcklbw xmm4, xmm0 - pmullw xmm4, [rax + 48] - - punpcklbw xmm6, xmm0 - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq QWORD PTR 
[rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_unpack_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int output_height, -; unsigned int output_width -;) -global sym(vp9_unpack_block1d16_h6_sse2) -sym(vp9_unpack_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(3) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source - - pxor xmm0, xmm0 ; clear xmm0 for unpack -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source -%endif - -.unpack_block1d16_h6_sse2_rowloop: - movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 - movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - punpcklbw xmm1, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm1 - movdqa XMMWORD Ptr [rdi + 16], xmm3 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(4) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - jnz .unpack_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict16x16_sse2) -sym(vp9_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx ; next line -.next_row: - movdqu xmm3, [rsi] ; 
xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_bilinear_predict8x8_sse2 -;( 
-; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict8x8_sse2) -sym(vp9_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -rd: - times 8 dw 0x40 diff --git a/vp9/common/x86/subpixel_ssse3.asm b/vp9/common/x86/subpixel_ssse3.asm deleted file mode 100644 index 4a16f1928..000000000 --- a/vp9/common/x86/subpixel_ssse3.asm +++ /dev/null @@ -1,1515 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
-; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ -;void vp9_filter_block1d8_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_h6_ssse3) -sym(vp9_filter_block1d8_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 - - movdqa xmm7, [GLOBAL(rd)] - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - mov rdi, arg(2) ;output_ptr - - cmp esi, DWORD PTR [rax] - je vp9_filter_block1d8_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx -;xmm3 free -.filter_block1d8_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - pmaddubsw xmm1, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm1 - paddsw xmm2, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - jnz .filter_block1d8_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -vp9_filter_block1d8_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] - movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] - - mov rsi, arg(0) ;src_ptr - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx - -.filter_block1d8_h4_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm2, xmm0 - pshufb xmm0, xmm3 - - pshufb xmm2, xmm4 - pmaddubsw xmm0, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw 
xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - - jnz .filter_block1d8_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d16_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_h6_ssse3) -sym(vp9_filter_block1d16_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - mov rdi, arg(2) ;output_ptr - - mov rsi, arg(0) ;src_ptr - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d16_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - movq xmm3, MMWORD PTR [rsi + 6] - - pmaddubsw xmm1, xmm5 - movq xmm7, MMWORD PTR [rsi + 11] - - pmaddubsw xmm2, xmm6 - punpcklbw xmm3, xmm7 - - paddsw xmm0, xmm1 - movdqa xmm1, xmm3 - - pmaddubsw xmm3, xmm4 - paddsw xmm0, xmm2 - - movdqa xmm2, xmm1 - paddsw xmm0, [GLOBAL(rd)] - - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - - psraw xmm0, 7 - pmaddubsw xmm1, xmm5 - - pmaddubsw xmm2, xmm6 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - paddsw xmm3, xmm1 - - paddsw xmm3, xmm2 - - paddsw xmm3, [GLOBAL(rd)] - - psraw xmm3, 7 - - packuswb xmm3, xmm3 - - punpcklqdq xmm0, xmm3 - - movdqa XMMWORD Ptr [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d4_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_h6_ssse3) -sym(vp9_filter_block1d4_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - movdqa xmm7, [GLOBAL(rd)] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -;xmm3 free -.filter_block1d4_h6_rowloop_ssse3: - movdqu xmm0, XMMWORD PTR [rsi - 2] - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf1b)] - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2b)] - pmaddubsw xmm0, xmm4 - 
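The k0_k5 / k1_k3 / k2_k4 tables used throughout these routines exist to feed pmaddubsw, which multiplies unsigned source bytes by signed filter bytes and sums adjacent pairs with signed-word saturation. A minimal scalar model of that pairing, assuming k[0..5] are the six signed tap weights:

    #include <stdint.h>

    /* One pmaddubsw pair: unsigned pixel times signed weight, two at a
     * time, saturated to a signed 16-bit word. */
    static int16_t madd_pair(uint8_t a, uint8_t b, int8_t ka, int8_t kb) {
      int v = a * ka + b * kb;
      if (v > 32767) v = 32767;
      if (v < -32768) v = -32768;
      return (int16_t)v;
    }

    /* Six taps in three pmaddubsw steps; x[0] is the pixel under tap 2. */
    static int filter6_pairs(const uint8_t *x, const int8_t k[6]) {
      return madd_pair(x[-2], x[3], k[0], k[5])   /* k0_k5 */
           + madd_pair(x[-1], x[1], k[1], k[3])   /* k1_k3 */
           + madd_pair(x[0],  x[2], k[2], k[4]);  /* k2_k4 */
    }

The shuf*b byte shuffles above are what line the source pixels up into those (a, b) pairs before each pmaddubsw.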
pshufb xmm2, [GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm0, xmm1 - paddsw xmm0, xmm7 - pxor xmm1, xmm1 - paddsw xmm0, xmm2 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movd DWORD PTR [rdi], xmm0 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d4_h4_rowloop_ssse3: - movdqu xmm1, XMMWORD PTR [rsi - 2] - - movdqa xmm2, xmm1 - pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] - pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm1, xmm7 - paddsw xmm1, xmm2 - psraw xmm1, 7 - packuswb xmm1, xmm1 - - movd DWORD PTR [rdi], xmm1 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - -;void vp9_filter_block1d16_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_v6_ssse3) -sym(vp9_filter_block1d16_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d16_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - -.vp9_filter_block1d16_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 ;store the results - - movq xmm1, MMWORD PTR [rsi + 8] ;A - movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR 
[rdi+8], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d16_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - -.vp9_filter_block1d16_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - paddsw xmm2, [GLOBAL(rd)] - paddsw xmm2, xmm3 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - punpcklbw xmm5, xmm4 ;B D - punpcklbw xmm1, xmm0 ;C E - - pmaddubsw xmm1, xmm6 - pmaddubsw xmm5, xmm7 - - movdqa xmm4, [GLOBAL(rd)] - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm5, xmm1 - paddsw xmm5, xmm4 - psraw xmm5, 7 - packuswb xmm5, xmm5 - - punpcklqdq xmm2, xmm5 - - movdqa XMMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_v6_ssse3) -sym(vp9_filter_block1d8_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d8_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - movdqa xmm4, [GLOBAL(rd)] - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v6_ssse3_loop - - ; begin 
epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d8_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm5, [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm5 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d4_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_v6_ssse3) -sym(vp9_filter_block1d4_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_v4_ssse3 - - movq mm5, MMWORD PTR [rax] ;k0_k5 - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v6_ssse3_loop: - movd mm1, DWORD PTR [rsi] ;A - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - movd mm0, DWORD PTR [rax + rdx * 4] ;F - - movq mm4, [GLOBAL(rd)] - - pmaddubsw mm3, mm6 - punpcklbw mm1, mm0 ;A F - pmaddubsw mm2, mm7 - pmaddubsw mm1, mm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm1 - paddsw mm2, mm4 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_v4_ssse3: - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - movq mm5, MMWORD PTR [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v4_ssse3_loop: - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - pmaddubsw mm3, mm6 - pmaddubsw mm2, mm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm5 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz 
.vp9_filter_block1d4_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict16x16_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict16x16_ssse3) -sym(vp9_bilinear_predict16x16_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(2) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 4 - lea rax, [rax + rcx] ; HFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ; src_pixels_per_line - - movdqa xmm2, [rax] - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ; dst_pitch -%endif - movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 - - punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 - pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm6, xmm5 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm6, xmm1 - - punpcklbw xmm4, xmm5 - pmaddubsw xmm4, xmm1 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - packuswb xmm6, xmm4 - movdqa xmm5, xmm7 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm2 - - punpckhbw xmm7, xmm6 - pmaddubsw xmm7, xmm2 - - paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value - psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm5, xmm7 - movdqa xmm7, xmm6 - - movdqa [rdi], xmm5 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ; dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - - ; get the first horizontal line done - movq xmm4, [rsi] ; load row 0 - movq xmm2, [rsi + 8] ; load row 0 - - lea rsi, [rsi + rax] ; next line 
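Each entry of bilinear_filters_ssse3 (defined at the end of this file) is the byte pair (128 - 8*offset, 8*offset), so every path in this function is a form of two-tap blend. A scalar sketch of the yoffset-only case being set up here, with illustrative names:

    #include <stdint.h>

    /* Second-pass-only bilinear: blend each row with the row below it.
     * The weights sum to 128, so the rounded >>7 result already fits a
     * byte and needs no clamp. */
    static void bilinear_v_c(const uint8_t *src, int src_pitch,
                             uint8_t *dst, int dst_pitch,
                             int width, int height, int yoffset) {
      const int w1 = 8 * yoffset;
      const int w0 = 128 - w1;
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
          int v = src[x] * w0 + src[x + src_pitch] * w1;
          dst[x] = (uint8_t)((v + 64) >> 7);  /* rd round, FILTER_SHIFT */
        }
        src += src_pitch;
        dst += dst_pitch;
      }
    }

The xoffset-only case is the same blend applied horizontally (src[x] and src[x + 1]), and the general case runs the horizontal blend first and feeds its output to the vertical one.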
-.next_row_sp: - movq xmm3, [rsi] ; load row + 1 - movq xmm5, [rsi + 8] ; load row + 1 - - punpcklbw xmm4, xmm3 - punpcklbw xmm2, xmm5 - - pmaddubsw xmm4, xmm1 - movq xmm7, [rsi + rax] ; load row + 2 - - pmaddubsw xmm2, xmm1 - movq xmm6, [rsi + rax + 8] ; load row + 2 - - punpcklbw xmm3, xmm7 - punpcklbw xmm5, xmm6 - - pmaddubsw xmm3, xmm1 - paddw xmm4, [GLOBAL(rd)] - - pmaddubsw xmm5, xmm1 - paddw xmm2, [GLOBAL(rd)] - - psraw xmm4, VP9_FILTER_SHIFT - psraw xmm2, VP9_FILTER_SHIFT - - packuswb xmm4, xmm2 - paddw xmm3, [GLOBAL(rd)] - - movdqa [rdi], xmm4 ; store row 0 - paddw xmm5, [GLOBAL(rd)] - - psraw xmm3, VP9_FILTER_SHIFT - psraw xmm5, VP9_FILTER_SHIFT - - packuswb xmm3, xmm5 - movdqa xmm4, xmm7 - - movdqa [rdi + rdx],xmm3 ; store row 1 - lea rsi, [rsi + 2*rax] - - movdqa xmm2, xmm6 - lea rdi, [rdi + 2*rdx] - - cmp rdi, rcx - jne .next_row_sp - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - -.next_row_fp: - movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm2, xmm4 - movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 - - pmaddubsw xmm2, xmm1 - movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rax] ; next line - punpcklbw xmm3, xmm4 - - pmaddubsw xmm3, xmm1 - movq xmm5, [rsi] - - paddw xmm2, [GLOBAL(rd)] - movq xmm7, [rsi+1] - - movq xmm6, [rsi+8] - psraw xmm2, VP9_FILTER_SHIFT - - punpcklbw xmm5, xmm7 - movq xmm7, [rsi+9] - - paddw xmm3, [GLOBAL(rd)] - pmaddubsw xmm5, xmm1 - - psraw xmm3, VP9_FILTER_SHIFT - punpcklbw xmm6, xmm7 - - packuswb xmm2, xmm3 - pmaddubsw xmm6, xmm1 - - movdqa [rdi], xmm2 ; store the results in the destination - paddw xmm5, [GLOBAL(rd)] - - lea rdi, [rdi + rdx] ; dst_pitch - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm6, VP9_FILTER_SHIFT - - packuswb xmm5, xmm6 - lea rsi, [rsi + rax] ; next line - - movdqa [rdi], xmm5 ; store the results in the destination - lea rdi, [rdi + rdx] ; dst_pitch - - cmp rdi, rcx - - jne .next_row_fp - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict8x8_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict8x8_ssse3) -sym(vp9_bilinear_predict8x8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. 
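The staging step described in the comment above trades nine unaligned loads for nine aligned stores up front, after which the filter loop runs entirely on aligned reads from the stack. In scalar terms the idea is simply the following sketch; the 144-byte figure matches the sub rsp, 144 reservation:

    #include <stdint.h>
    #include <string.h>

    /* Copy the 9 source rows an 8x8 two-pass bilinear filter needs
     * (8 rows plus 1 extra for the vertical pass) into an aligned
     * scratch area: 9 rows * 16 bytes = 144 bytes. */
    static void stage_rows(const uint8_t *src, int src_pitch,
                           uint8_t scratch[144] /* assumed 16-byte aligned */) {
      for (int row = 0; row < 9; ++row)
        memcpy(scratch + row * 16, src + row * src_pitch, 16);
    }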
- movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ; xoffset - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b8x8_sp_only - - shl rax, 4 - add rax, rcx ; HFilter - - mov rdi, arg(4) ; dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b8x8_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm1, [rax] - - ; get the first horizontal line done - movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx - - psrldq xmm5, 1 - lea rsp, [rsp + 16] ; next line - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - lea rsp, [rsp + 16] ; next line - - movdqa xmm5, xmm6 - - psrldq xmm5, 1 - - punpcklbw xmm6, xmm5 - pmaddubsw xmm6, xmm0 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - packuswb xmm6, xmm6 - - punpcklbw xmm7, xmm6 - pmaddubsw xmm7, xmm1 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm7, xmm7 - - movq [rdi], xmm7 ; store the results in the destination - lea rdi, [rdi + rdx] - - movdqa xmm7, xmm6 - - cmp rdi, rcx - jne .next_row - - jmp .done8x8 - -.b8x8_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] ; VFilter - - movq xmm1, XMMWORD PTR [rsp] - movq xmm2, XMMWORD PTR [rsp+16] - - movq xmm3, XMMWORD PTR [rsp+32] - punpcklbw xmm1, xmm2 - - movq xmm4, XMMWORD PTR [rsp+48] - punpcklbw xmm2, xmm3 - - movq xmm5, XMMWORD PTR [rsp+64] - punpcklbw xmm3, xmm4 - - movq xmm6, XMMWORD PTR [rsp+80] - punpcklbw xmm4, xmm5 - - movq xmm7, XMMWORD PTR [rsp+96] - punpcklbw xmm5, xmm6 - - pmaddubsw xmm1, xmm0 - pmaddubsw xmm2, xmm0 - - pmaddubsw xmm3, xmm0 - pmaddubsw xmm4, xmm0 - - pmaddubsw xmm5, xmm0 - punpcklbw xmm6, xmm7 - - pmaddubsw xmm6, xmm0 - paddw xmm1, [GLOBAL(rd)] - - paddw xmm2, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm2, VP9_FILTER_SHIFT - - paddw xmm4, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - psraw xmm6, VP9_FILTER_SHIFT - packuswb xmm1, xmm1 - - packuswb xmm2, xmm2 - movq [rdi], xmm1 - - packuswb xmm3, xmm3 - movq [rdi+rdx], xmm2 - - packuswb xmm4, xmm4 - movq xmm1, XMMWORD PTR [rsp+112] - - lea rdi, [rdi + 
2*rdx] - movq xmm2, XMMWORD PTR [rsp+128] - - packuswb xmm5, xmm5 - movq [rdi], xmm3 - - packuswb xmm6, xmm6 - movq [rdi+rdx], xmm4 - - lea rdi, [rdi + 2*rdx] - punpcklbw xmm7, xmm1 - - movq [rdi], xmm5 - pmaddubsw xmm7, xmm0 - - movq [rdi+rdx], xmm6 - punpcklbw xmm1, xmm2 - - pmaddubsw xmm1, xmm0 - paddw xmm7, [GLOBAL(rd)] - - psraw xmm7, VP9_FILTER_SHIFT - paddw xmm1, [GLOBAL(rd)] - - psraw xmm1, VP9_FILTER_SHIFT - packuswb xmm7, xmm7 - - packuswb xmm1, xmm1 - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm7 - - movq [rdi+rdx], xmm1 - lea rsp, [rsp + 144] - - jmp .done8x8 - -.b8x8_fp_only: - lea rcx, [rdi+rdx*8] - -.next_row_fp: - movdqa xmm1, XMMWORD PTR [rsp] - movdqa xmm3, XMMWORD PTR [rsp+16] - - movdqa xmm2, xmm1 - movdqa xmm5, XMMWORD PTR [rsp+32] - - psrldq xmm2, 1 - movdqa xmm7, XMMWORD PTR [rsp+48] - - movdqa xmm4, xmm3 - psrldq xmm4, 1 - - movdqa xmm6, xmm5 - psrldq xmm6, 1 - - punpcklbw xmm1, xmm2 - pmaddubsw xmm1, xmm0 - - punpcklbw xmm3, xmm4 - pmaddubsw xmm3, xmm0 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm0 - - movdqa xmm2, xmm7 - psrldq xmm2, 1 - - punpcklbw xmm7, xmm2 - pmaddubsw xmm7, xmm0 - - paddw xmm1, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm7, [GLOBAL(rd)] - psraw xmm7, VP9_FILTER_SHIFT - - packuswb xmm1, xmm1 - packuswb xmm3, xmm3 - - packuswb xmm5, xmm5 - movq [rdi], xmm1 - - packuswb xmm7, xmm7 - movq [rdi+rdx], xmm3 - - lea rdi, [rdi + 2*rdx] - movq [rdi], xmm5 - - lea rsp, [rsp + 4*16] - movq [rdi+rdx], xmm7 - - lea rdi, [rdi + 2*rdx] - cmp rdi, rcx - - jne .next_row_fp - - lea rsp, [rsp + 16] - -.done8x8: - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 -shuf2b: - db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 -shuf3b: - db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 - -align 16 -shuf2bfrom1: - db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 -align 16 -shuf3bfrom1: - db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 - -align 16 -rd: - times 8 dw 0x40 - -align 16 -k0_k5: - times 8 db 0, 0 ;placeholder - times 8 db 0, 0 - times 8 db 2, 1 - times 8 db 0, 0 - times 8 db 3, 3 - times 8 db 0, 0 - times 8 db 1, 2 - times 8 db 0, 0 -k1_k3: - times 8 db 0, 0 ;placeholder - times 8 db -6, 12 - times 8 db -11, 36 - times 8 db -9, 50 - times 8 db -16, 77 - times 8 db -6, 93 - times 8 db -8, 108 - times 8 db -1, 123 -k2_k4: - times 8 db 128, 0 ;placeholder - times 8 db 123, -1 - times 8 db 108, -8 - times 8 db 93, -6 - times 8 db 77, -16 - times 8 db 50, -9 - times 8 db 36, -11 - times 8 db 12, -6 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 - diff --git a/vp9/common/x86/subpixel_x86.h b/vp9/common/x86/subpixel_x86.h deleted file mode 100644 index 4c224da3b..000000000 --- a/vp9/common/x86/subpixel_x86.h +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. 
An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef SUBPIXEL_X86_H -#define SUBPIXEL_X86_H - -/* Note: - * - * This platform is commonly built for runtime CPU detection. If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx); -extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx); -extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx); - - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx - -#undef vp9_subpix_bilinear8x4 -#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx - -#undef vp9_subpix_bilinear4x4 -#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx - -#endif -#endif - - -#if HAVE_SSE2 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2); - - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2 - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2 - -#endif -#endif - -#if HAVE_SSSE3 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3 - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3 - - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 
vp9_bilinear_predict16x16_ssse3 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3 - -#endif -#endif - - - -#endif diff --git a/vp9/common/x86/vp8_asm_stubs.c b/vp9/common/x86/vp8_asm_stubs.c deleted file mode 100644 index 817a93e24..000000000 --- a/vp9/common/x86/vp8_asm_stubs.c +++ /dev/null @@ -1,602 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_ports/config.h" -#include "vpx_ports/mem.h" -#include "vp9/common/subpixel.h" - -extern const short vp9_six_tap_mmx[16][6 * 8]; - -extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8]; - -extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr, - unsigned char *output_ptr, - int output_pitch, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_height, - unsigned int output_width); - -extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_lin, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx); - -#if HAVE_MMX -void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, 
- int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16); - const short *hfilter, *vfilter; - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 8, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch, - 8, 4, 4, 4, vfilter); -} - -void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, - fdata2 + 8, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, - fdata2 + 12, src_pixels_per_line, 1, 21, 32, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch, - 32, 16, 16, 16, vfilter); -} - -void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 13, 16, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 13, 16, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 8, 8, vfilter); -} - -void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 9, 16, hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, 
dst_ptr + 4, dst_pitch, - 16, 8, 4, 8, vfilter); -} - -void vp9_bilinear_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - vp9_bilinear_predict8x8_mmx(src_ptr, - src_pixels_per_line, xoffset, yoffset, - dst_ptr, dst_pitch); - vp9_bilinear_predict8x8_mmx(src_ptr + 8, - src_pixels_per_line, xoffset, yoffset, - dst_ptr + 8, dst_pitch); - vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, - src_pixels_per_line, xoffset, yoffset, - dst_ptr + dst_pitch * 8, dst_pitch); - vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, - src_pixels_per_line, xoffset, yoffset, - dst_ptr + dst_pitch * 8 + 8, dst_pitch); -} -#endif - -#if HAVE_SSE2 -void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 21, 32, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 21, 32); - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } -} - -void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 13, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, vfilter); - } -} - -void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - 
src_pixels_per_line, 1, 9, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, vfilter); - } -} -#endif - -#if HAVE_SSSE3 -extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - fdata2, 16, 21, xoffset); - vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch, - 16, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 16, yoffset); - } -} - -void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 13, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset); - } else { - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, yoffset); - } 
-} - -void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 9, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - -void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 4, 9, xoffset); - vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset); - } else { - vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - -void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 23, hfilter_aligned16); - vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, - 16, hfilter_aligned16); - } else { - vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 16, vfilter_aligned16); - } - } -} - -void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && 
vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                 fdata2, 16, 15, hfilter_aligned16);
-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
-                                 vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 8, vfilter_aligned16);
-    }
-  }
-}
-
-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
-                                    const unsigned int src_stride,
-                                    const short *hfilter_aligned16,
-                                    const short *vfilter_aligned16,
-                                    unsigned char *dst_ptr,
-                                    unsigned int dst_stride) {
-  if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {
-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                 fdata2, 16, 11, hfilter_aligned16);
-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
-                                 vfilter_aligned16);
-  } else {
-    if (hfilter_aligned16[3] != 128) {
-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
-                                   hfilter_aligned16);
-    } else {
-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
-                                   dst_ptr, dst_stride, 4, vfilter_aligned16);
-    }
-  }
-}
-#endif
diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c
new file mode 100644
index 000000000..09f8de384
--- /dev/null
+++ b/vp9/common/x86/vp9_filter_sse2.c
@@ -0,0 +1,289 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>     // for alignment checks
+#include <emmintrin.h>  // SSE2
+#include "vp9/common/vp9_filter.h"
+#include "vpx_ports/mem.h"  // for DECLARE_ALIGNED
+#include "vp9_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
+//           just a quick partial snapshot so that others can already use some
+//           speedup.
+// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
+//           filtering.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
+//           of positive above 128), or have higher precision filter
+//           coefficients.
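+
+// For orientation: each 32-bit lane the SIMD below produces is the plain
+// 8-tap convolution of one output pixel. A scalar reference sketch (an
+// illustration only, never called; it assumes VP9_FILTER_WEIGHT == 128 and
+// VP9_FILTER_SHIFT == 7 from vp9/common/vp9_filter.h, and the helper name
+// is hypothetical):
+//
+//   static unsigned char filter_8tap_px(const unsigned char *src,
+//                                       const short *filter) {
+//     int k, sum = VP9_FILTER_WEIGHT >> 1;   /* rounding, cf. rounding_c */
+//     for (k = 0; k < 8; ++k)
+//       sum += src[k - 3] * filter[k];       /* taps run from src[-3] */
+//     sum >>= VP9_FILTER_SHIFT;
+//     return sum < 0 ? 0 : sum > 255 ? 255 : (unsigned char)sum;
+//   }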
+
+DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+  VP9_FILTER_WEIGHT >> 1,
+};
+
+// Creating a macro to do more than four pixels at once to hide instruction
+// latency is actually slower :-(
+#define DO_FOUR_PIXELS(result, src_ptr, offset) \
+  { \
+  /* Do shifted loads to achieve the required shuffles through unpacking */ \
+  const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
+  const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
+  const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
+  const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
+  const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
+  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
+  const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
+  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
+  /* Shift by 4 bytes through shuffle to get additional shifted loads */ \
+  const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
+  const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
+  const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
+  const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
+  const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
+  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
+  const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
+  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
+  /* multiply accumulate them */ \
+  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
+  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
+  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
+  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
+  const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
+  const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
+  __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
+  mad_all = _mm_add_epi32(mad_all, rounding); \
+  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
+  }
+
+void vp9_filter_block2d_4x4_8_sse2
+(
+    const unsigned char *src_ptr, const unsigned int src_stride,
+    const short *HFilter_aligned16, const short *VFilter_aligned16,
+    unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  __m128i intermediateA, intermediateB, intermediateC;
+
+  const int kInterp_Extend = 4;
+
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
+
+  // check alignment
+  assert(0 == ((long)HFilter_aligned16)%16);
+  assert(0 == ((long)VFilter_aligned16)%16);
+
+  {
+    __m128i transpose3_0;
+    __m128i transpose3_1;
+    __m128i transpose3_2;
+    __m128i transpose3_3;
+
+    // Horizontal pass (src -> intermediate).
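+    // With kInterp_Extend == 4 this pass starts 3 rows above and 3 columns
+    // left of the destination block and produces 4 + 8 - 1 = 11 rows of 4
+    // pixels each (intermediateA/B hold 4 rows apiece, intermediateC the
+    // last 3), which is exactly the footprint the 8-tap vertical pass
+    // consumes after the transpose.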
+ { + const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); + // get first two columns filter coefficients + __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); + src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); + + { + __m128i mad_all0; + __m128i mad_all1; + __m128i mad_all2; + __m128i mad_all3; + DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) + DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) + DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) + DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) + mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); + mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); + intermediateA = _mm_packus_epi16(mad_all0, mad_all2); + // -- + src_ptr += src_stride*4; + // -- + DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) + DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) + DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) + DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) + mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); + mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); + intermediateB = _mm_packus_epi16(mad_all0, mad_all2); + // -- + src_ptr += src_stride*4; + // -- + DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) + DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) + DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) + mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); + mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); + intermediateC = _mm_packus_epi16(mad_all0, mad_all2); + } + } + + // Transpose result (intermediate -> transpose3_x) + { + // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 + // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 + // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx + const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB); + const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB); + const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC); + const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC); + // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73 + // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx + // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx + const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); + const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3); + const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3); + // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63 + // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73 + // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx + // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx + const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1); + const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1); + const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3); + const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx + // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx + transpose3_0 = _mm_castps_si128( + 
_mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), + _mm_castsi128_ps(transpose2_2), + _MM_SHUFFLE(1, 0, 1, 0))); + transpose3_1 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), + _mm_castsi128_ps(transpose2_2), + _MM_SHUFFLE(3, 2, 3, 2))); + transpose3_2 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), + _mm_castsi128_ps(transpose2_3), + _MM_SHUFFLE(1, 0, 1, 0))); + transpose3_3 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), + _mm_castsi128_ps(transpose2_3), + _MM_SHUFFLE(3, 2, 3, 2))); + // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx + // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx + // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx + // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx + } + + // Vertical pass (transpose3_x -> dst). + { + const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); + // get first two columns filter coefficients + __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i col0, col1, col2, col3; + DECLARE_ALIGNED(16, unsigned char, temp[32]); + { + _mm_store_si128((__m128i *)temp, transpose3_0); + DO_FOUR_PIXELS(col0, temp, 0); + } + { + _mm_store_si128((__m128i *)temp, transpose3_1); + DO_FOUR_PIXELS(col1, temp, 0); + } + { + _mm_store_si128((__m128i *)temp, transpose3_2); + DO_FOUR_PIXELS(col2, temp, 0); + } + { + _mm_store_si128((__m128i *)temp, transpose3_3); + DO_FOUR_PIXELS(col3, temp, 0); + } + // transpose + { + __m128i T0 = _mm_unpacklo_epi32(col0, col1); + __m128i T1 = _mm_unpacklo_epi32(col2, col3); + __m128i T2 = _mm_unpackhi_epi32(col0, col1); + __m128i T3 = _mm_unpackhi_epi32(col2, col3); + col0 = _mm_unpacklo_epi64(T0, T1); + col1 = _mm_unpackhi_epi64(T0, T1); + col2 = _mm_unpacklo_epi64(T2, T3); + col3 = _mm_unpackhi_epi64(T2, T3); + } + // saturate to 8 bit + { + col0 = _mm_packs_epi32(col0, col0); + col0 = _mm_packus_epi16(col0, col0); + col1 = _mm_packs_epi32(col1, col1); + col1 = _mm_packus_epi16(col1, col1); + col2 = _mm_packs_epi32 (col2, col2); + col2 = _mm_packus_epi16(col2, col2); + col3 = _mm_packs_epi32 (col3, col3); + col3 = _mm_packus_epi16(col3, col3); + } + // store + { + *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0); + *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1); + *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2); + *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3); + } + } + } +} + +void vp9_filter_block2d_8x4_8_sse2 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int j; + for (j=0; j<8; j+=4) { + vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j, dst_stride); + } +} + +void vp9_filter_block2d_8x8_8_sse2 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int i, j; + for (i=0; i<8; i+=4) { + for (j=0; j<8; j+=4) { + vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j + i*dst_stride, dst_stride); + } + } +} + +void 
vp9_filter_block2d_16x16_8_sse2
+(
+    const unsigned char *src_ptr, const unsigned int src_stride,
+    const short *HFilter_aligned16, const short *VFilter_aligned16,
+    unsigned char *dst_ptr, unsigned int dst_stride
+) {
+  int i, j;
+  for (i=0; i<16; i+=4) {
+    for (j=0; j<16; j+=4) {
+      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
+                                    HFilter_aligned16, VFilter_aligned16,
+                                    dst_ptr + j + i*dst_stride, dst_stride);
+    }
+  }
+}
diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c
new file mode 100644
index 000000000..52c35b296
--- /dev/null
+++ b/vp9/common/x86/vp9_filter_sse4.c
@@ -0,0 +1,362 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>     // for alignment checks
+#include <smmintrin.h>  // SSE4.1
+#include "vp9/common/vp9_filter.h"
+#include "vpx_ports/mem.h"  // for DECLARE_ALIGNED
+#include "vp9_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
+//           just a quick partial snapshot so that others can already use some
+//           speedup.
+// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
+//           filtering.
+// TODO(cd): Reduce source size by using macros instead of current code
+//           duplication.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
+//           of positive above 128), or have higher precision filter
+//           coefficients.
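+
+// The main difference from the SSE2 version: pshufb gathers the overlapping
+// (p[i], p[i+1]) source pairs directly instead of synthesizing them with
+// shifted loads and unpacks. A standalone sketch of what mask0123_c (defined
+// below) does to a register of pixels (illustration only; the values and the
+// main() harness are hypothetical):
+//
+//   #include <stdio.h>
+//   #include <tmmintrin.h>  /* _mm_shuffle_epi8 (pshufb) */
+//   int main(void) {
+//     const unsigned char px[16]   = {10, 20, 30, 40, 50, 60, 70, 80, 90,
+//                                     100, 110, 120, 130, 140, 150, 160};
+//     const unsigned char mask[16] = {0, 1, 1, 2, 2, 3, 3, 4,   /* taps 0,1 */
+//                                     2, 3, 3, 4, 4, 5, 5, 6};  /* taps 2,3 */
+//     unsigned char out[16];
+//     int i;
+//     _mm_storeu_si128((__m128i *)out,
+//                      _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)px),
+//                                       _mm_loadu_si128((const __m128i *)mask)));
+//     for (i = 0; i < 16; ++i) printf("%u ", out[i]);
+//     return 0;  /* prints 10 20 20 30 30 40 40 50 30 40 40 50 50 60 60 70 */
+//   }
+//
+// Once those pairs are widened against zero, _mm_madd_epi16 sums
+// p[i]*f[2k] + p[i+1]*f[2k+1] per 32-bit lane, exactly as in the SSE2 path.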
+ +DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = { + 0x00, 0x01, + 0x01, 0x02, + 0x02, 0x03, + 0x03, 0x04, + 0x02, 0x03, + 0x03, 0x04, + 0x04, 0x05, + 0x05, 0x06, +}; +DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = { + 0x04, 0x05, + 0x05, 0x06, + 0x06, 0x07, + 0x07, 0x08, + 0x06, 0x07, + 0x07, 0x08, + 0x08, 0x09, + 0x09, 0x0A, +}; +DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { + VP9_FILTER_WEIGHT >> 1, + VP9_FILTER_WEIGHT >> 1, + VP9_FILTER_WEIGHT >> 1, + VP9_FILTER_WEIGHT >> 1, +}; +DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = { + 0, 4, 8, 12, + 1, 5, 9, 13, + 2, 6, 10, 14, + 3, 7, 11, 15 +}; + +// Creating a macro to do more than four pixels at once to hide instruction +// latency is actually slower :-( +#define DO_FOUR_PIXELS(result, offset) \ + { \ + /*load pixels*/ \ + __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \ + /* extract the ones used for first column */ \ + __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \ + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \ + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \ + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \ + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \ + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \ + /* multiply accumulate them */ \ + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ + __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ + __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ + __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ + mad_all = _mm_add_epi32(mad_all, rounding); \ + result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ + } + +void vp9_filter_block2d_4x4_8_sse4_1 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + __m128i intermediateA, intermediateB, intermediateC; + + const int kInterp_Extend = 4; + + const __m128i zero = _mm_set1_epi16(0); + const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c); + const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c); + const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); + const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c); + + // check alignment + assert(0 == ((long)HFilter_aligned16)%16); + assert(0 == ((long)VFilter_aligned16)%16); + + { + __m128i transpose3_0; + __m128i transpose3_1; + __m128i transpose3_2; + __m128i transpose3_3; + + // Horizontal pass (src -> intermediate). 
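+    // In DO_FOUR_PIXELS each 32-bit _mm_madd_epi16 lane computes
+    // p[i]*f[2k] + p[i+1]*f[2k+1] for one output column; summing the four
+    // mad01/mad23/mad45/mad67 partials yields the full 8-tap kernel, after
+    // which rounding_c and the arithmetic shift apply the final scaling
+    // (with VP9_FILTER_WEIGHT == 128, that is (sum + 64) >> 7).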
+ { + const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); + // get first two columns filter coefficients + __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); + src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); + + { + __m128i mad_all0; + __m128i mad_all1; + __m128i mad_all2; + __m128i mad_all3; + DO_FOUR_PIXELS(mad_all0, 0*src_stride) + DO_FOUR_PIXELS(mad_all1, 1*src_stride) + DO_FOUR_PIXELS(mad_all2, 2*src_stride) + DO_FOUR_PIXELS(mad_all3, 3*src_stride) + mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); + mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); + intermediateA = _mm_packus_epi16(mad_all0, mad_all2); + // -- + src_ptr += src_stride*4; + // -- + DO_FOUR_PIXELS(mad_all0, 0*src_stride) + DO_FOUR_PIXELS(mad_all1, 1*src_stride) + DO_FOUR_PIXELS(mad_all2, 2*src_stride) + DO_FOUR_PIXELS(mad_all3, 3*src_stride) + mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); + mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); + intermediateB = _mm_packus_epi16(mad_all0, mad_all2); + // -- + src_ptr += src_stride*4; + // -- + DO_FOUR_PIXELS(mad_all0, 0*src_stride) + DO_FOUR_PIXELS(mad_all1, 1*src_stride) + DO_FOUR_PIXELS(mad_all2, 2*src_stride) + mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); + mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); + intermediateC = _mm_packus_epi16(mad_all0, mad_all2); + } + } + + // Transpose result (intermediate -> transpose3_x) + { + // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 + // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 + // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx + const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose); + const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose); + const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx + const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1); + const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + transpose3_0 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), + _mm_castsi128_ps(transpose1_2), + _MM_SHUFFLE(0, 0, 1, 0))); + transpose3_1 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), + _mm_castsi128_ps(transpose1_2), + _MM_SHUFFLE(1, 1, 3, 2))); + transpose3_2 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), + _mm_castsi128_ps(transpose1_2), + _MM_SHUFFLE(2, 2, 1, 0))); + transpose3_3 = _mm_castps_si128( + _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), + _mm_castsi128_ps(transpose1_2), + _MM_SHUFFLE(3, 3, 3, 2))); + // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx + // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx + // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx + // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx + } + + // Vertical pass (transpose3_x -> dst). 
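+    // Because the transpose turned each intermediate column into a register
+    // row, the vertical filter can reuse the same pair-gather-and-madd
+    // pattern; each block below produces the four row results of one output
+    // column (col0..col3), which are re-interleaved before the 4-byte stores.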
+ { + const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); + // get first two columns filter coefficients + __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); + __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); + __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); + __m128i col0, col1, col2, col3; + { + //load pixels + __m128i src = transpose3_0; + // extract the ones used for first column + __m128i src0123 = _mm_shuffle_epi8(src, mask0123); + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); + // multiply accumulate them + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); + __m128i mad23 = _mm_madd_epi16(src23_16, fil23); + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); + __m128i mad0123 = _mm_add_epi32(mad01, mad23); + __m128i mad4567 = _mm_add_epi32(mad45, mad67); + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); + mad_all = _mm_add_epi32(mad_all, rounding); + mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); + mad_all = _mm_packs_epi32(mad_all, mad_all); + col0 = _mm_packus_epi16(mad_all, mad_all); + } + { + //load pixels + __m128i src = transpose3_1; + // extract the ones used for first column + __m128i src0123 = _mm_shuffle_epi8(src, mask0123); + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); + // multiply accumulate them + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); + __m128i mad23 = _mm_madd_epi16(src23_16, fil23); + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); + __m128i mad0123 = _mm_add_epi32(mad01, mad23); + __m128i mad4567 = _mm_add_epi32(mad45, mad67); + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); + mad_all = _mm_add_epi32(mad_all, rounding); + mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); + mad_all = _mm_packs_epi32(mad_all, mad_all); + col1 = _mm_packus_epi16(mad_all, mad_all); + } + { + //load pixels + __m128i src = transpose3_2; + // extract the ones used for first column + __m128i src0123 = _mm_shuffle_epi8(src, mask0123); + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); + // multiply accumulate them + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); + __m128i mad23 = _mm_madd_epi16(src23_16, fil23); + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); + __m128i mad0123 = _mm_add_epi32(mad01, mad23); + __m128i mad4567 = _mm_add_epi32(mad45, mad67); + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); + mad_all = _mm_add_epi32(mad_all, rounding); + mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); + mad_all = _mm_packs_epi32(mad_all, mad_all); + col2 = _mm_packus_epi16(mad_all, mad_all); + } + { + //load pixels + __m128i src = transpose3_3; + // extract the ones used for first column + 
__m128i src0123 = _mm_shuffle_epi8(src, mask0123); + __m128i src4567 = _mm_shuffle_epi8(src, mask4567); + __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); + __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); + __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); + __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); + // multiply accumulate them + __m128i mad01 = _mm_madd_epi16(src01_16, fil01); + __m128i mad23 = _mm_madd_epi16(src23_16, fil23); + __m128i mad45 = _mm_madd_epi16(src45_16, fil45); + __m128i mad67 = _mm_madd_epi16(src67_16, fil67); + __m128i mad0123 = _mm_add_epi32(mad01, mad23); + __m128i mad4567 = _mm_add_epi32(mad45, mad67); + __m128i mad_all = _mm_add_epi32(mad0123, mad4567); + mad_all = _mm_add_epi32(mad_all, rounding); + mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); + mad_all = _mm_packs_epi32(mad_all, mad_all); + col3 = _mm_packus_epi16(mad_all, mad_all); + } + { + __m128i col01 = _mm_unpacklo_epi8(col0, col1); + __m128i col23 = _mm_unpacklo_epi8(col2, col3); + __m128i col0123 = _mm_unpacklo_epi16(col01, col23); + //TODO(cd): look into Ronald's comment: + // Future suggestion: I believe here, too, you can merge the + // packs_epi32() and pacus_epi16() for the 4 cols above, so that + // you get the data in a single register, and then use pshufb + // (shuffle_epi8()) instead of the unpacks here. Should be + // 2+3+2 instructions faster. + *((unsigned int *)&dst_ptr[dst_stride * 0]) = + _mm_extract_epi32(col0123, 0); + *((unsigned int *)&dst_ptr[dst_stride * 1]) = + _mm_extract_epi32(col0123, 1); + *((unsigned int *)&dst_ptr[dst_stride * 2]) = + _mm_extract_epi32(col0123, 2); + *((unsigned int *)&dst_ptr[dst_stride * 3]) = + _mm_extract_epi32(col0123, 3); + } + } + } +} + +void vp9_filter_block2d_8x4_8_sse4_1 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int j; + for (j=0; j<8; j+=4) { + vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j, dst_stride); + } +} + +void vp9_filter_block2d_8x8_8_sse4_1 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int i, j; + for (i=0; i<8; i+=4) { + for (j=0; j<8; j+=4) { + vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j + i*dst_stride, dst_stride); + } + } +} + +void vp9_filter_block2d_16x16_8_sse4_1 +( + const unsigned char *src_ptr, const unsigned int src_stride, + const short *HFilter_aligned16, const short *VFilter_aligned16, + unsigned char *dst_ptr, unsigned int dst_stride +) { + int i, j; + for (i=0; i<16; i+=4) { + for (j=0; j<16; j+=4) { + vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, + HFilter_aligned16, VFilter_aligned16, + dst_ptr + j + i*dst_stride, dst_stride); + } + } +} diff --git a/vp9/common/x86/vp9_idct_x86.h b/vp9/common/x86/vp9_idct_x86.h new file mode 100644 index 000000000..297ab0d33 --- /dev/null +++ b/vp9/common/x86/vp9_idct_x86.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef IDCT_X86_H +#define IDCT_X86_H + +/* Note: + * + * This platform is commonly built for runtime CPU detection. If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_idct(vp9_short_idct4x4llm_1_mmx); +extern prototype_idct(vp9_short_idct4x4llm_mmx); +extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx); + +extern prototype_second_order(vp9_short_inv_walsh4x4_mmx); +extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_idct_idct1 +#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx + +#undef vp9_idct_idct16 +#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx + +#undef vp9_idct_idct1_scalar_add +#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx + +#undef vp9_idct_iwalsh16 +#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx + +#undef vp9_idct_iwalsh1 +#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_mmx + +#endif +#endif + +#if HAVE_SSE2 + +extern prototype_second_order(vp9_short_inv_walsh4x4_sse2); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp9_idct_iwalsh16 +#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_sse2 + +#endif + +#endif + + + +#endif diff --git a/vp9/common/x86/vp9_idctllm_mmx.asm b/vp9/common/x86/vp9_idctllm_mmx.asm new file mode 100644 index 000000000..15e81addb --- /dev/null +++ b/vp9/common/x86/vp9_idctllm_mmx.asm @@ -0,0 +1,241 @@ +; +; Copyright (c) 2012 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +align 16 +x_s1sqr2: times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: times 4 dw 0x4E7B +align 16 +pw_16: times 4 dw 16 + +SECTION .text + + +; /**************************************************************************** +; * Notes: +; * +; * This implementation makes use of 16 bit fixed point version of two multiply +; * constants: +; * 1. sqrt(2) * cos (pi/8) +; * 2. sqrt(2) * sin (pi/8) +; * Because the first constant is bigger than 1, to maintain the same 16 bit +; * fixed point precision as the second one, we use a trick of +; * x * a = x + x*(a-1) +; * so +; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). +; * +; * For the second constant, because of the 16bit version is 35468, which +; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative +; * number. 
+; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x +; * +; **************************************************************************/ + +INIT_MMX + +;void short_idct4x4llm_mmx(short *input, short *output, int pitch) +cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit + mova m0, [inpq +0] + mova m1, [inpq +8] + + mova m2, [inpq+16] + mova m3, [inpq+24] + + psubw m0, m2 ; b1= 0-2 + paddw m2, m2 ; + + mova m5, m1 + paddw m2, m0 ; a1 =0+2 + + pmulhw m5, [x_s1sqr2] ; + paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) + + mova m7, m3 ; + pmulhw m7, [x_c1sqr2less1] ; + + paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) + psubw m7, m5 ; c1 + + mova m5, m1 + mova m4, m3 + + pmulhw m5, [x_c1sqr2less1] + paddw m5, m1 + + pmulhw m3, [x_s1sqr2] + paddw m3, m4 + + paddw m3, m5 ; d1 + mova m6, m2 ; a1 + + mova m4, m0 ; b1 + paddw m2, m3 ;0 + + paddw m4, m7 ;1 + psubw m0, m7 ;2 + + psubw m6, m3 ;3 + + mova m1, m2 ; 03 02 01 00 + mova m3, m4 ; 23 22 21 20 + + punpcklwd m1, m0 ; 11 01 10 00 + punpckhwd m2, m0 ; 13 03 12 02 + + punpcklwd m3, m6 ; 31 21 30 20 + punpckhwd m4, m6 ; 33 23 32 22 + + mova m0, m1 ; 11 01 10 00 + mova m5, m2 ; 13 03 12 02 + + punpckldq m0, m3 ; 30 20 10 00 + punpckhdq m1, m3 ; 31 21 11 01 + + punpckldq m2, m4 ; 32 22 12 02 + punpckhdq m5, m4 ; 33 23 13 03 + + mova m3, m5 ; 33 23 13 03 + + psubw m0, m2 ; b1= 0-2 + paddw m2, m2 ; + + mova m5, m1 + paddw m2, m0 ; a1 =0+2 + + pmulhw m5, [x_s1sqr2] ; + paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2) + + mova m7, m3 ; + pmulhw m7, [x_c1sqr2less1] ; + + paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2) + psubw m7, m5 ; c1 + + mova m5, m1 + mova m4, m3 + + pmulhw m5, [x_c1sqr2less1] + paddw m5, m1 + + pmulhw m3, [x_s1sqr2] + paddw m3, m4 + + paddw m3, m5 ; d1 + paddw m0, [pw_16] + + paddw m2, [pw_16] + mova m6, m2 ; a1 + + mova m4, m0 ; b1 + paddw m2, m3 ;0 + + paddw m4, m7 ;1 + psubw m0, m7 ;2 + + psubw m6, m3 ;3 + psraw m2, 5 + + psraw m0, 5 + psraw m4, 5 + + psraw m6, 5 + + mova m1, m2 ; 03 02 01 00 + mova m3, m4 ; 23 22 21 20 + + punpcklwd m1, m0 ; 11 01 10 00 + punpckhwd m2, m0 ; 13 03 12 02 + + punpcklwd m3, m6 ; 31 21 30 20 + punpckhwd m4, m6 ; 33 23 32 22 + + mova m0, m1 ; 11 01 10 00 + mova m5, m2 ; 13 03 12 02 + + punpckldq m0, m3 ; 30 20 10 00 + punpckhdq m1, m3 ; 31 21 11 01 + + punpckldq m2, m4 ; 32 22 12 02 + punpckhdq m5, m4 ; 33 23 13 03 + + mova [outq], m0 + + mova [outq+r2], m1 + mova [outq+pitq*2], m2 + + add outq, pitq + mova [outq+pitq*2], m5 + RET + +;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch) +cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit + movh m0, [inpq] + paddw m0, [pw_16] + psraw m0, 5 + punpcklwd m0, m0 + punpckldq m0, m0 + + mova [outq], m0 + mova [outq+pitq], m0 + + mova [outq+pitq*2], m0 + add r1, r2 + + mova [outq+pitq*2], m0 + RET + + +;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride) +cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride +%if ARCH_X86_64 + movsxd strideq, dword stridem +%else + mov strideq, stridem +%endif + pxor m0, m0 + + movh m5, in_dcq ; dc + paddw m5, [pw_16] + + psraw m5, 5 + + punpcklwd m5, m5 + punpckldq m5, m5 + + movh m1, [predq] + punpcklbw m1, m0 + paddsw m1, m5 + packuswb m1, m0 ; pack and unpack to saturate + movh [dstq], m1 + + movh m2, [predq+pitq] + punpcklbw m2, m0 + paddsw m2, m5 + packuswb m2, m0 ; pack and unpack to saturate + movh [dstq+strideq], m2 + + movh m3, [predq+2*pitq] + punpcklbw m3, m0 + paddsw m3, m5 + packuswb m3, m0 ; pack and unpack to saturate + movh [dstq+2*strideq], m3 + 
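+    ; advance both base pointers one row so the fourth row is still
+    ; reachable with the same 2*pitq / 2*strideq displacements used above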
+ add dstq, strideq + add predq, pitq + movh m4, [predq+2*pitq] + punpcklbw m4, m0 + paddsw m4, m5 + packuswb m4, m0 ; pack and unpack to saturate + movh [dstq+2*strideq], m4 + RET + diff --git a/vp9/common/x86/vp9_idctllm_sse2.asm b/vp9/common/x86/vp9_idctllm_sse2.asm new file mode 100644 index 000000000..daa572e01 --- /dev/null +++ b/vp9/common/x86/vp9_idctllm_sse2.asm @@ -0,0 +1,712 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp9_idct_dequant_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *pre - 2 +; unsigned char *dst - 3 +; int dst_stride - 4 +; int blk_stride - 5 +; ) + +global sym(vp9_idct_dequant_0_2x_sse2) +sym(vp9_idct_dequant_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + ; end prolog + + mov rdx, arg(1) ; dequant + mov rax, arg(0) ; qcoeff + + movd xmm4, [rax] + movd xmm5, [rdx] + + pinsrw xmm4, [rax+32], 4 + pinsrw xmm5, [rdx], 4 + + pmullw xmm4, xmm5 + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; clear coeffs + movd [rax], xmm5 + movd [rax+32], xmm5 +;pshufb + pshuflw xmm4, xmm4, 00000000b + pshufhw xmm4, xmm4, 00000000b + + mov rax, arg(2) ; pre + paddw xmm4, [GLOBAL(fours)] + + movsxd rcx, dword ptr arg(5) ; blk_stride + psraw xmm4, 3 + + movq xmm0, [rax] + movq xmm1, [rax+rcx] + movq xmm2, [rax+2*rcx] + lea rcx, [3*rcx] + movq xmm3, [rax+rcx] + + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + mov rax, arg(3) ; dst + movsxd rdx, dword ptr arg(4) ; dst_stride + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; store blocks back out + movq [rax], xmm0 + movq [rax + rdx], xmm1 + + lea rax, [rax + 2*rdx] + + movq [rax], xmm2 + movq [rax + rdx], xmm3 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_idct_dequant_full_2x_sse2) +sym(vp9_idct_dequant_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rsi, arg(2) ; pre + mov rdi, arg(3) ; dst + movsxd rcx, dword ptr arg(5) ; blk_stride + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + mov rdx, arg(1) ; dequant + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq 
xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, 
xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movq xmm4, [rsi] + movq xmm5, [rsi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rsi+2*rcx] + lea rcx, [3*rcx] + movq xmm5, [rsi+rcx] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(4) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_idct_dequant_dc_0_2x_sse2 +; ( +; short *qcoeff - 0 +; short *dequant - 1 +; unsigned char *pre - 2 +; unsigned char *dst - 3 +; int dst_stride - 4 +; short *dc - 5 +; ) +global sym(vp9_idct_dequant_dc_0_2x_sse2) +sym(vp9_idct_dequant_dc_0_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rsi, arg(2) ; pre + mov rdi, arg(3) ; dst + mov rdx, arg(5) ; dc + + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + + ; load up 2 dc words here == 2*16 = doubleword + movd xmm4, [rdx] + + ; Load up predict blocks + movq xmm0, [rsi] + movq xmm1, [rsi+16] + movq xmm2, [rsi+32] + movq xmm3, [rsi+48] + + ; Duplicate and expand dc across + punpcklwd xmm4, xmm4 + punpckldq xmm4, xmm4 + + ; Rounding to dequant and downshift + paddw xmm4, [GLOBAL(fours)] + psraw xmm4, 3 + + ; Predict buffer needs to be expanded from bytes to words + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 + + ; Add to predict buffer + paddw xmm0, xmm4 + paddw xmm1, xmm4 + paddw xmm2, xmm4 + paddw xmm3, xmm4 + + ; pack up before storing + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(4) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_idct_dequant_dc_full_2x_sse2) +sym(vp9_idct_dequant_dc_full_2x_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; special case when 2 blocks have 0 or 1 coeffs + ; dc is set as first coeff, so no need to load qcoeff + mov rax, arg(0) ; qcoeff + mov rsi, arg(2) ; pre + mov rdi, arg(3) ; dst + + ; Zero out xmm7, for use unpacking + pxor xmm7, xmm7 + + mov rdx, arg(1) ; dequant + + ; note the transpose of xmm1 and xmm2, necessary for shuffle + ; to spit out sensicle data + movdqa xmm0, [rax] + movdqa xmm2, [rax+16] + movdqa xmm1, [rax+32] + movdqa xmm3, [rax+48] + + ; Clear out 
coeffs + movdqa [rax], xmm7 + movdqa [rax+16], xmm7 + movdqa [rax+32], xmm7 + movdqa [rax+48], xmm7 + + ; dequantize qcoeff buffer + pmullw xmm0, [rdx] + pmullw xmm2, [rdx+16] + pmullw xmm1, [rdx] + pmullw xmm3, [rdx+16] + + ; DC component + mov rdx, arg(5) + + ; repack so block 0 row x and block 1 row x are together + movdqa xmm4, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm4, xmm1 + + pshufd xmm0, xmm0, 11011000b + pshufd xmm1, xmm4, 11011000b + + movdqa xmm4, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm4, xmm3 + + pshufd xmm2, xmm2, 11011000b + pshufd xmm3, xmm4, 11011000b + + ; insert DC component + pinsrw xmm0, [rdx], 0 + pinsrw xmm0, [rdx+2], 4 + + ; first pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 ; + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + + ; transpose for the second pass + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + ; second pass + psubw xmm0, xmm2 ; b1 = 0-2 + paddw xmm2, xmm2 + + movdqa xmm5, xmm1 + paddw xmm2, xmm0 ; a1 = 0+2 + + pmulhw xmm5, [GLOBAL(x_s1sqr2)] + paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) + + movdqa xmm7, xmm3 + pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] + + paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) + psubw xmm7, xmm5 ; c1 + + movdqa xmm5, xmm1 + movdqa xmm4, xmm3 + + pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] + paddw xmm5, xmm1 + + pmulhw xmm3, [GLOBAL(x_s1sqr2)] + paddw xmm3, xmm4 + + paddw xmm3, xmm5 ; d1 + paddw xmm0, [GLOBAL(fours)] + + paddw xmm2, [GLOBAL(fours)] + movdqa xmm6, xmm2 ; a1 + + movdqa xmm4, xmm0 ; b1 + paddw xmm2, xmm3 ;0 + + paddw xmm4, xmm7 ;1 + psubw xmm0, xmm7 ;2 + + psubw xmm6, xmm3 ;3 + psraw xmm2, 3 + + psraw xmm0, 3 + psraw xmm4, 3 + + psraw xmm6, 3 + + ; transpose to save + movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 + punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 + punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 + + movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 + 
punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 + punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 + + + movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 + punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 + punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 + + movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 + punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 + punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 + + + movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 + punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 + punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 + + movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 + punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 + punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 + + pshufd xmm0, xmm2, 11011000b + pshufd xmm2, xmm1, 11011000b + + pshufd xmm1, xmm5, 11011000b + pshufd xmm3, xmm7, 11011000b + + pxor xmm7, xmm7 + + ; Load up predict blocks + movq xmm4, [rsi] + movq xmm5, [rsi+16] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm0, xmm4 + paddw xmm1, xmm5 + + movq xmm4, [rsi+32] + movq xmm5, [rsi+48] + + punpcklbw xmm4, xmm7 + punpcklbw xmm5, xmm7 + + paddw xmm2, xmm4 + paddw xmm3, xmm5 + +.finish: + + ; pack up before storing + packuswb xmm0, xmm7 + packuswb xmm1, xmm7 + packuswb xmm2, xmm7 + packuswb xmm3, xmm7 + + ; Load destination stride before writing out, + ; doesn't need to persist + movsxd rdx, dword ptr arg(4) ; dst_stride + + ; store blocks back out + movq [rdi], xmm0 + movq [rdi + rdx], xmm1 + + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm2 + movq [rdi + rdx], xmm3 + + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +fours: + times 8 dw 0x0004 +align 16 +x_s1sqr2: + times 8 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 8 dw 0x4E7B diff --git a/vp9/common/x86/vp9_iwalsh_mmx.asm b/vp9/common/x86/vp9_iwalsh_mmx.asm new file mode 100644 index 000000000..6b276b95a --- /dev/null +++ b/vp9/common/x86/vp9_iwalsh_mmx.asm @@ -0,0 +1,173 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
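+;
+; Both routines below implement the 4x4 inverse Walsh-Hadamard transform.
+; In scalar form (mirroring the register comments in the code), each of
+; the two passes applies the butterflies
+;     a1 = ip[0] + ip[12]     b1 = ip[4] + ip[8]
+;     d1 = ip[0] - ip[12]     c1 = ip[4] - ip[8]
+;     op[0] = a1 + b1         op[4]  = d1 + c1
+;     op[8] = a1 - b1         op[12] = d1 - c1
+; and the second pass rounds the result by (x + 3) >> 3.  The _1 variant
+; handles the DC-only case by broadcasting (input[0] + 3) >> 3 to all
+; sixteen outputs.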
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output) +global sym(vp9_short_inv_walsh4x4_1_mmx) +sym(vp9_short_inv_walsh4x4_1_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rax, 3 + + mov rdi, arg(1) + add rax, [rsi] ;input[0] + 3 + + movd mm0, eax + + punpcklwd mm0, mm0 ;x x val val + + punpckldq mm0, mm0 ;val val val val + + psraw mm0, 3 ;(input[0] + 3) >> 3 + + movq [rdi + 0], mm0 + movq [rdi + 8], mm0 + movq [rdi + 16], mm0 + movq [rdi + 24], mm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_short_inv_walsh4x4_mmx(short *input, short *output) +global sym(vp9_short_inv_walsh4x4_mmx) +sym(vp9_short_inv_walsh4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rax, 3 + mov rsi, arg(0) + mov rdi, arg(1) + shl rax, 16 + + movq mm0, [rsi + 0] ;ip[0] + movq mm1, [rsi + 8] ;ip[4] + or rax, 3 ;00030003h + + movq mm2, [rsi + 16] ;ip[8] + movq mm3, [rsi + 24] ;ip[12] + + movq mm7, rax + movq mm4, mm0 + + punpcklwd mm7, mm7 ;0003000300030003h + movq mm5, mm1 + + paddw mm4, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm4 ;temp al + + paddw mm4, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm1, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + + paddw mm0, mm1 ;dl + cl + psubw mm5, mm1 ;dl - cl + + ; 03 02 01 00 + ; 13 12 11 10 + ; 23 22 21 20 + ; 33 32 31 30 + + movq mm3, mm4 ; 03 02 01 00 + punpcklwd mm4, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 + + movq mm1, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm1, mm5 ; 33 23 32 22 + + movq mm0, mm4 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 + + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] + + punpckldq mm2, mm1 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] +;~~~~~~~~~~~~~~~~~~~~~ + movq mm1, mm0 + movq mm5, mm4 + + paddw mm1, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm1 ;temp al + + paddw mm1, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm4, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + + paddw mm0, mm4 ;dl + cl + psubw mm5, mm4 ;dl - cl +;~~~~~~~~~~~~~~~~~~~~~ + movq mm3, mm1 ; 03 02 01 00 + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 + + movq mm4, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm4, mm5 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 + + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4] + + punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12] + + paddw mm0, mm7 + paddw mm1, mm7 + paddw mm2, mm7 + paddw mm3, mm7 + + psraw mm0, 3 + psraw mm1, 3 + psraw mm2, 3 + psraw mm3, 3 + + movq [rdi + 0], mm0 + movq [rdi + 8], mm1 + movq [rdi + 16], mm2 + movq [rdi + 24], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + diff --git a/vp9/common/x86/vp9_iwalsh_sse2.asm b/vp9/common/x86/vp9_iwalsh_sse2.asm new file mode 100644 index 000000000..143cce87d --- /dev/null +++ b/vp9/common/x86/vp9_iwalsh_sse2.asm @@ -0,0 +1,119 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp9_short_inv_walsh4x4_sse2(short *input, short *output) +global sym(vp9_short_inv_walsh4x4_sse2) +sym(vp9_short_inv_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + SAVE_XMM 6 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rdi, arg(1) + mov rax, 3 + + movdqa xmm0, [rsi + 0] ;ip[4] ip[0] + movdqa xmm1, [rsi + 16] ;ip[12] ip[8] + + shl rax, 16 + or rax, 3 ;00030003h + + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] + + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm3 ;d1 a1 + punpckhqdq xmm4, xmm3 ;c1 b1 + movd xmm6, eax + + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + +;;;temp output +;; movdqu [rdi + 0], xmm4 +;; movdqu [rdi + 16], xmm3 + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] + + pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 + + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm3 ;d1 a1 + punpckhqdq xmm5, xmm3 ;c1 b1 + + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm5, xmm4 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + paddw xmm5, xmm6 + paddw xmm1, xmm6 + + psraw xmm5, 3 + psraw xmm1, 3 + + movdqa [rdi + 0], xmm5 + movdqa [rdi + 16], xmm1 + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm new file mode 100644 index 000000000..ac3f74eda --- /dev/null +++ b/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -0,0 +1,969 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
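+;
+; The edge filters below share one decision rule.  In scalar terms
+; (matching the register comments in the code), with pixels p3..p0 on
+; one side of the edge and q0..q3 on the other:
+;     mask = (|p3-p2| <= limit) && (|p2-p1| <= limit)
+;         && (|p1-p0| <= limit) && (|q1-q0| <= limit)
+;         && (|q2-q1| <= limit) && (|q3-q2| <= limit)
+;         && (|p0-q0|*2 + |p1-q1|/2 <= blimit)
+;     hev  = (|p1-p0| > thresh) || (|q1-q0| > thresh)
+; Pixels are only adjusted where mask is set, and the outer pair p1/q1
+; is only adjusted where hev is clear.  The "simple" filters use just
+; the blimit test.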
+; + + +%include "vpx_ports/x86_abi_support.asm" + + +;void vp9_loop_filter_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp9_loop_filter_horizontal_edge_mmx) +sym(vp9_loop_filter_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + movsxd rcx, dword ptr arg(5) ;count +.next8_h: + mov rdx, arg(3) ;limit + movq mm7, [rdx] + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + ; calculate breakout conditions + movq mm2, [rdi+2*rax] ; q3 + movq mm1, [rsi+2*rax] ; q2 + movq mm6, mm1 ; q2 + psubusb mm1, mm2 ; q2-=q3 + psubusb mm2, mm6 ; q3-=q2 + por mm1, mm2 ; abs(q3-q2) + psubusb mm1, mm7 ; + + + movq mm4, [rsi+rax] ; q1 + movq mm3, mm4 ; q1 + psubusb mm4, mm6 ; q1-=q2 + psubusb mm6, mm3 ; q2-=q1 + por mm4, mm6 ; abs(q2-q1) + + psubusb mm4, mm7 + por mm1, mm4 + + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + psubusb mm4, mm3 ; q0-=q1 + psubusb mm3, mm0 ; q1-=q0 + por mm4, mm3 ; abs(q0-q1) + movq t0, mm4 ; save to t0 + psubusb mm4, mm7 + por mm1, mm4 + + + neg rax ; negate pitch to deal with above border + + movq mm2, [rsi+4*rax] ; p3 + movq mm4, [rdi+4*rax] ; p2 + movq mm5, mm4 ; p2 + psubusb mm4, mm2 ; p2-=p3 + psubusb mm2, mm5 ; p3-=p2 + por mm4, mm2 ; abs(p3 - p2) + psubusb mm4, mm7 + por mm1, mm4 + + + movq mm4, [rsi+2*rax] ; p1 + movq mm3, mm4 ; p1 + psubusb mm4, mm5 ; p1-=p2 + psubusb mm5, mm3 ; p2-=p1 + por mm4, mm5 ; abs(p2 - p1) + psubusb mm4, mm7 + por mm1, mm4 + + movq mm2, mm3 ; p1 + + movq mm4, [rsi+rax] ; p0 + movq mm5, mm4 ; p0 + psubusb mm4, mm3 ; p0-=p1 + psubusb mm3, mm5 ; p1-=p0 + por mm4, mm3 ; abs(p1 - p0) + movq t1, mm4 ; save to t1 + psubusb mm4, mm7 + por mm1, mm4 + + movq mm3, [rdi] ; q1 + movq mm4, mm3 ; q1 + psubusb mm3, mm2 ; q1-=p1 + psubusb mm2, mm4 ; p1-=q1 + por mm2, mm3 ; abs(p1-q1) + pand mm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm2, 1 ; abs(p1-q1)/2 + + movq mm6, mm5 ; p0 + movq mm3, [rsi] ; q0 + psubusb mm5, mm3 ; p0-=q0 + psubusb mm3, mm6 ; q0-=p0 + por mm5, mm3 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] ; blimit + + psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm5 + pxor mm5, mm5 + pcmpeqb mm1, mm5 ; mask mm1 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + paddb mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb mm4, mm5 + + pcmpeqb mm5, mm5 + pxor mm4, mm5 + + + ; start work on filters + movq mm2, [rsi+2*rax] ; p1 + movq mm7, [rdi] ; q1 + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; 1 * (q0 - 
p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + pxor mm0, mm0 ; + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + psraw mm5, 11 + packsswb mm0, mm5 + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+rax], mm6 ; write back + + movq mm6, [rsi+2*rax] ; p1 + pxor mm6, [GLOBAL(t80)] ; reoffset + paddsb mm6, mm4 ; p1+= p1 add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+2*rax], mm6 ; write back + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + movq [rsi], mm3 ; write back + + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + movq [rdi], mm7 ; write back + + add rsi,8 + neg rax + dec rcx + jnz .next8_h + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_loop_filter_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp9_loop_filter_vertical_edge_mmx) +sym(vp9_loop_filter_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 64 ; reserve 64 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[32]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
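+
+    ; Columns cannot be loaded directly, so each pass below reads an
+    ; 8x8 tile straddling the edge (4 pixels either side), transposes
+    ; it in MMX registers, applies the same mask/filter math as the
+    ; horizontal-edge routine above, then transposes the four modified
+    ; columns (p1 p0 q0 q1) back for the movd stores at the end.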
+ + lea rsi, [rsi + rax*4 - 4] + + movsxd rcx, dword ptr arg(5) ;count +.next8_v: + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + + + ;transpose + movq mm6, [rsi+2*rax] ; 67 66 65 64 63 62 61 60 + movq mm7, mm6 ; 77 76 75 74 73 72 71 70 + + punpckhbw mm7, [rdi+2*rax] ; 77 67 76 66 75 65 74 64 + punpcklbw mm6, [rdi+2*rax] ; 73 63 72 62 71 61 70 60 + + movq mm4, [rsi] ; 47 46 45 44 43 42 41 40 + movq mm5, mm4 ; 47 46 45 44 43 42 41 40 + + punpckhbw mm5, [rsi+rax] ; 57 47 56 46 55 45 54 44 + punpcklbw mm4, [rsi+rax] ; 53 43 52 42 51 41 50 40 + + movq mm3, mm5 ; 57 47 56 46 55 45 54 44 + punpckhwd mm5, mm7 ; 77 67 57 47 76 66 56 46 + + punpcklwd mm3, mm7 ; 75 65 55 45 74 64 54 44 + movq mm2, mm4 ; 53 43 52 42 51 41 50 40 + + punpckhwd mm4, mm6 ; 73 63 53 43 72 62 52 42 + punpcklwd mm2, mm6 ; 71 61 51 41 70 60 50 40 + + neg rax + movq mm6, [rsi+rax*2] ; 27 26 25 24 23 22 21 20 + + movq mm1, mm6 ; 27 26 25 24 23 22 21 20 + punpckhbw mm6, [rsi+rax] ; 37 27 36 36 35 25 34 24 + + punpcklbw mm1, [rsi+rax] ; 33 23 32 22 31 21 30 20 + movq mm7, [rsi+rax*4]; ; 07 06 05 04 03 02 01 00 + + punpckhbw mm7, [rdi+rax*4] ; 17 07 16 06 15 05 14 04 + movq mm0, mm7 ; 17 07 16 06 15 05 14 04 + + punpckhwd mm7, mm6 ; 37 27 17 07 36 26 16 06 + punpcklwd mm0, mm6 ; 35 25 15 05 34 24 14 04 + + movq mm6, mm7 ; 37 27 17 07 36 26 16 06 + punpckhdq mm7, mm5 ; 77 67 57 47 37 27 17 07 = q3 + + punpckldq mm6, mm5 ; 76 66 56 46 36 26 16 06 = q2 + + movq mm5, mm6 ; 76 66 56 46 36 26 16 06 + psubusb mm5, mm7 ; q2-q3 + + psubusb mm7, mm6 ; q3-q2 + por mm7, mm5; ; mm7=abs (q3-q2) + + movq mm5, mm0 ; 35 25 15 05 34 24 14 04 + punpckhdq mm5, mm3 ; 75 65 55 45 35 25 15 05 = q1 + + punpckldq mm0, mm3 ; 74 64 54 44 34 24 15 04 = q0 + movq mm3, mm5 ; 75 65 55 45 35 25 15 05 = q1 + + psubusb mm3, mm6 ; q1-q2 + psubusb mm6, mm5 ; q2-q1 + + por mm6, mm3 ; mm6=abs(q2-q1) + lea rdx, srct + + movq [rdx+24], mm5 ; save q1 + movq [rdx+16], mm0 ; save q0 + + movq mm3, [rsi+rax*4] ; 07 06 05 04 03 02 01 00 + punpcklbw mm3, [rdi+rax*4] ; 13 03 12 02 11 01 10 00 + + movq mm0, mm3 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm1 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm3, mm1 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 31 21 11 01 30 20 10 00 + + punpckldq mm0, mm2 ; 70 60 50 40 30 20 10 00 =p3 + punpckhdq mm1, mm2 ; 71 61 51 41 31 21 11 01 =p2 + + movq mm2, mm1 ; 71 61 51 41 31 21 11 01 =p2 + psubusb mm2, mm0 ; p2-p3 + + psubusb mm0, mm1 ; p3-p2 + por mm0, mm2 ; mm0=abs(p3-p2) + + movq mm2, mm3 ; 33 23 13 03 32 22 12 02 + punpckldq mm2, mm4 ; 72 62 52 42 32 22 12 02 = p1 + + punpckhdq mm3, mm4 ; 73 63 53 43 33 23 13 03 = p0 + movq [rdx+8], mm3 ; save p0 + + movq [rdx], mm2 ; save p1 + movq mm5, mm2 ; mm5 = p1 + + psubusb mm2, mm1 ; p1-p2 + psubusb mm1, mm5 ; p2-p1 + + por mm1, mm2 ; mm1=abs(p2-p1) + mov rdx, arg(3) ;limit + + movq mm4, [rdx] ; mm4 = limit + psubusb mm7, mm4 + + psubusb mm0, mm4 + psubusb mm1, mm4 + + psubusb mm6, mm4 + por mm7, mm6 + + por mm0, mm1 + por mm0, mm7 ; abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit + + movq mm1, mm5 ; p1 + + movq mm7, mm3 ; mm3=mm7=p0 + psubusb mm7, mm5 ; p0 - p1 + + psubusb mm5, mm3 ; p1 - p0 + por mm5, mm7 ; abs(p1-p0) + + movq t0, mm5 ; save abs(p1-p0) + lea rdx, srct + + psubusb mm5, mm4 + por mm0, mm5 ; mm0=mask + + movq mm5, [rdx+16] ; mm5=q0 + movq mm7, [rdx+24] ; mm7=q1 + + movq mm6, mm5 ; mm6=q0 + movq mm2, mm7 ; q1 + psubusb mm5, mm7 ; q0-q1 + + psubusb mm7, mm6 ; q1-q0 + por mm7, mm5 ; abs(q1-q0) + + movq t1, mm7 ; save 
abs(q1-q0) + psubusb mm7, mm4 + + por mm0, mm7 ; mask + + movq mm5, mm2 ; q1 + psubusb mm5, mm1 ; q1-=p1 + psubusb mm1, mm2 ; p1-=q1 + por mm5, mm1 ; abs(p1-q1) + pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm5, 1 ; abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; + + movq mm4, [rdx] ;blimit + movq mm1, mm3 ; mm1=mm3=p0 + + movq mm7, mm6 ; mm7=mm6=q0 + psubusb mm1, mm7 ; p0-q0 + + psubusb mm7, mm3 ; q0-p0 + por mm1, mm7 ; abs(q0-p0) + paddusb mm1, mm1 ; abs(q0-p0)*2 + paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por mm1, mm0; ; mask + + pxor mm0, mm0 + pcmpeqb mm1, mm0 + + ; calculate high edge variance + mov rdx, arg(4) ;thresh ; get thresh + movq mm7, [rdx] + ; + movq mm4, t0 ; get abs (q1 - q0) + psubusb mm4, mm7 + + movq mm3, t1 ; get abs (p1 - p0) + psubusb mm3, mm7 + + por mm4, mm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + pcmpeqb mm4, mm0 + + pcmpeqb mm0, mm0 + pxor mm4, mm0 + + + + ; start work on filters + lea rdx, srct + + movq mm2, [rdx] ; p1 + movq mm7, [rdx+24] ; q1 + + movq mm6, [rdx+8] ; p0 + movq mm0, [rdx+16] ; q0 + + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb mm2, mm7 ; p1 - q1 + pand mm2, mm4 ; high var mask (hvm)(p1 - q1) + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + + paddsb mm2, mm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + paddsb mm2, mm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + + paddsb mm2, mm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + pand mm1, mm2 ; mask filter values we don't care about + + movq mm2, mm1 + paddsb mm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + + paddsb mm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + pxor mm0, mm0 ; + + pxor mm5, mm5 + punpcklbw mm0, mm2 ; + + punpckhbw mm5, mm2 ; + psraw mm0, 11 ; + + psraw mm5, 11 + packsswb mm0, mm5 + + movq mm2, mm0 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + + pxor mm0, mm0 ; 0 + movq mm5, mm1 ; abcdefgh + + punpcklbw mm0, mm1 ; e0f0g0h0 + psraw mm0, 11 ; sign extended shift right by 3 + + pxor mm1, mm1 ; 0 + punpckhbw mm1, mm5 ; a0b0c0d0 + + psraw mm1, 11 ; sign extended shift right by 3 + movq mm5, mm0 ; save results + + packsswb mm0, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw mm5, [GLOBAL(ones)] + + paddsw mm1, [GLOBAL(ones)] + psraw mm5, 1 ; partial shifted one more time for 2nd tap + + psraw mm1, 1 ; partial shifted one more time for 2nd tap + packsswb mm5, mm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + + pandn mm4, mm5 ; high edge variance additive + + paddsb mm6, mm2 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + + ; mm6=p0 ; + movq mm1, [rdx] ; p1 + pxor mm1, [GLOBAL(t80)] ; reoffset + + paddsb mm1, mm4 ; p1+= p1 add + pxor mm1, [GLOBAL(t80)] ; unoffset + ; mm6 = p0 mm1 = p1 + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + + ; mm3 = q0 + psubsb mm7, mm4 ; q1-= q1 add + pxor mm7, [GLOBAL(t80)] ; unoffset + ; mm7 = q1 + + ; tranpose and write back + ; mm1 = 72 62 52 42 32 22 12 02 + ; mm6 = 73 63 53 43 33 23 13 03 + ; mm3 = 74 64 54 44 34 24 14 04 + ; mm7 = 75 65 55 45 35 25 15 05 + + movq mm2, mm1 ; 72 62 52 42 32 22 12 02 + punpcklbw mm2, mm6 ; 33 32 23 22 13 12 03 02 + + movq mm4, mm3 ; 74 64 54 44 34 24 14 04 + punpckhbw mm1, mm6 ; 73 72 63 62 53 52 43 42 + + punpcklbw mm4, mm7 ; 35 34 25 24 15 14 05 04 + punpckhbw mm3, mm7 ; 75 74 65 64 55 54 45 44 + + 
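+    ; The word interleaves below gather each image row's four changed
+    ; pixels into one doubleword; the movd stores then write them back
+    ; at byte offset +2, i.e. over p1 p0 q0 q1 only.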
movq mm6, mm2 ; 33 32 23 22 13 12 03 02 + punpcklwd mm2, mm4 ; 15 14 13 12 05 04 03 02 + + punpckhwd mm6, mm4 ; 35 34 33 32 25 24 23 22 + movq mm5, mm1 ; 73 72 63 62 53 52 43 42 + + punpcklwd mm1, mm3 ; 55 54 53 52 45 44 43 42 + punpckhwd mm5, mm3 ; 75 74 73 72 65 64 63 62 + + + ; mm2 = 15 14 13 12 05 04 03 02 + ; mm6 = 35 34 33 32 25 24 23 22 + ; mm5 = 55 54 53 52 45 44 43 42 + ; mm1 = 75 74 73 72 65 64 63 62 + + + + movd [rsi+rax*4+2], mm2 + psrlq mm2, 32 + + movd [rdi+rax*4+2], mm2 + movd [rsi+rax*2+2], mm6 + + psrlq mm6, 32 + movd [rsi+rax+2],mm6 + + movd [rsi+2], mm1 + psrlq mm1, 32 + + movd [rdi+2], mm1 + neg rax + + movd [rdi+rax+2],mm5 + psrlq mm5, 32 + + movd [rdi+rax*2+2], mm5 + + lea rsi, [rsi+rax*8] + dec rcx + jnz .next8_v + + add rsp, 64 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_loop_filter_simple_horizontal_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit +;) +global sym(vp9_loop_filter_simple_horizontal_edge_mmx) +sym(vp9_loop_filter_simple_horizontal_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + mov rcx, 2 ; count +.nexts8_h: + mov rdx, arg(2) ;blimit ; get blimit + movq mm3, [rdx] ; + + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + neg rax + + ; calculate mask + movq mm1, [rsi+2*rax] ; p1 + movq mm0, [rdi] ; q1 + movq mm2, mm1 + movq mm7, mm0 + movq mm4, mm0 + psubusb mm0, mm1 ; q1-=p1 + psubusb mm1, mm4 ; p1-=q1 + por mm1, mm0 ; abs(p1-q1) + pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm1, 1 ; abs(p1-q1)/2 + + movq mm5, [rsi+rax] ; p0 + movq mm4, [rsi] ; q0 + movq mm0, mm4 ; q0 + movq mm6, mm5 ; p0 + psubusb mm5, mm4 ; p0-=q0 + psubusb mm4, mm6 ; q0-=p0 + por mm5, mm4 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor mm3, mm3 + pcmpeqb mm5, mm3 + + ; start work on filters + pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb mm2, mm7 ; p1 - q1 + + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm0 ; q0 + psubsb mm0, mm6 ; q0 - p0 + paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) + paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) + paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) + pand mm5, mm2 ; mask filter values we don't care about + + ; do + 4 side + paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + movq mm1, mm5 ; get a copy of filters + psraw mm1, 11 ; arithmetic shift right 11 + psllw mm1, 8 ; shift left 8 to put it back + + por mm0, mm1 ; put the two together to get result + + psubsb mm3, mm0 ; q0-= q0 add + pxor mm3, [GLOBAL(t80)] ; unoffset + movq [rsi], mm3 ; write back + + + ; now do +3 side + psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + psraw mm5, 11 ; arithmetic shift right 11 + psllw mm5, 8 ; shift left 8 to put it back + por mm0, mm5 ; put the two together to get result + + + paddsb mm6, mm0 ; p0+= p0 
add + pxor mm6, [GLOBAL(t80)] ; unoffset + movq [rsi+rax], mm6 ; write back + + add rsi,8 + neg rax + dec rcx + jnz .nexts8_h + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_loop_filter_simple_vertical_edge_mmx +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit +;) +global sym(vp9_loop_filter_simple_vertical_edge_mmx) +sym(vp9_loop_filter_simple_vertical_edge_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi + rax*4- 2]; ; + mov rcx, 2 ; count +.nexts8_v: + + lea rdi, [rsi + rax]; + movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 + + movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 + punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 + + movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 + movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 + + punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 + movq mm5, mm4 ; 53 43 52 42 51 41 50 40 + + punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 + punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 + + neg rax + + movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 + movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 + + punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 + movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 + + movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 + punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 + + movq mm2, mm0 ; 13 03 12 02 11 01 10 00 + punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 + + punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 + movq mm1, mm0 ; 13 03 12 02 11 01 10 00 + + punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 + movq mm3, mm2 ; 33 23 13 03 32 22 12 02 + + punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 + punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 + + punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 + + + ; calculate mask + movq mm6, mm0 ; p1 + movq mm7, mm3 ; q1 + psubusb mm7, mm6 ; q1-=p1 + psubusb mm6, mm3 ; p1-=q1 + por mm6, mm7 ; abs(p1-q1) + pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw mm6, 1 ; abs(p1-q1)/2 + + movq mm5, mm1 ; p0 + movq mm4, mm2 ; q0 + + psubusb mm5, mm2 ; p0-=q0 + psubusb mm4, mm1 ; q0-=p0 + + por mm5, mm4 ; abs(p0 - q0) + paddusb mm5, mm5 ; abs(p0-q0)*2 + paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit ; get blimit + movq mm7, [rdx] + + psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor mm7, mm7 + pcmpeqb mm5, mm7 ; mm5 = mask + + ; start work on filters + movq t0, mm0 + movq t1, mm3 + + pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb mm0, mm3 ; p1 - q1 + movq mm6, mm1 ; p0 + + movq mm7, mm2 ; q0 + pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values + + pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values + movq mm3, mm7 ; offseted ; q0 + + psubsb mm7, mm6 ; q0 - p0 + paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) + + paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) + paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) + + pand mm5, mm0 ; mask filter values we don't care about + + paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift 
right 11 + psrlw mm0, 8 + + movq mm7, mm5 ; get a copy of filters + psraw mm7, 11 ; arithmetic shift right 11 + psllw mm7, 8 ; shift left 8 to put it back + + por mm0, mm7 ; put the two together to get result + + psubsb mm3, mm0 ; q0-= q0sz add + pxor mm3, [GLOBAL(t80)] ; unoffset + + ; now do +3 side + psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 + + movq mm0, mm5 ; get a copy of filters + psllw mm0, 8 ; shift left 8 + psraw mm0, 3 ; arithmetic shift right 11 + psrlw mm0, 8 + + psraw mm5, 11 ; arithmetic shift right 11 + psllw mm5, 8 ; shift left 8 to put it back + por mm0, mm5 ; put the two together to get result + + paddsb mm6, mm0 ; p0+= p0 add + pxor mm6, [GLOBAL(t80)] ; unoffset + + + movq mm0, t0 + movq mm4, t1 + + ; mm0 = 70 60 50 40 30 20 10 00 + ; mm6 = 71 61 51 41 31 21 11 01 + ; mm3 = 72 62 52 42 32 22 12 02 + ; mm4 = 73 63 53 43 33 23 13 03 + ; transpose back to write out + + movq mm1, mm0 ; + punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 + + punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 + movq mm2, mm3 ; + + punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 + movq mm5, mm1 ; 71 70 61 60 51 50 41 40 + + punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 + movq mm6, mm0 ; 31 30 21 20 11 10 01 00 + + punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00 + punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 + + movd [rsi+rax*4], mm0 ; write 03 02 01 00 + punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 + + psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 + punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 + + movd [rdi+rax*4], mm0 ; write 13 12 11 10 + movd [rsi+rax*2], mm6 ; write 23 22 21 20 + + psrlq mm6, 32 ; 33 32 31 30 + movd [rsi], mm1 ; write 43 42 41 40 + + movd [rsi + rax], mm6 ; write 33 32 31 30 + neg rax + + movd [rsi + rax*2], mm5 ; write 63 62 61 60 + psrlq mm1, 32 ; 53 52 51 50 + + movd [rdi], mm1 ; write out 53 52 51 50 + psrlq mm5, 32 ; 73 72 71 70 + + movd [rdi + rax*2], mm5 ; write 73 72 71 70 + + lea rsi, [rsi+rax*8] ; next 8 + + dec rcx + jnz .nexts8_v + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, +; int y_stride, +; loop_filter_info *lfi) +;{ +; +; +; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); +;} + +SECTION_RODATA +align 16 +tfe: + times 8 db 0xfe +align 16 +t80: + times 8 db 0x80 +align 16 +t1s: + times 8 db 0x01 +align 16 +t3: + times 8 db 0x03 +align 16 +t4: + times 8 db 0x04 +align 16 +ones: + times 4 dw 0x0001 +align 16 +s27: + times 4 dw 0x1b00 +align 16 +s18: + times 4 dw 0x1200 +align 16 +s9: + times 4 dw 0x0900 +align 16 +s63: + times 4 dw 0x003f diff --git a/vp9/common/x86/vp9_loopfilter_sse2.asm b/vp9/common/x86/vp9_loopfilter_sse2.asm new file mode 100644 index 000000000..9c0c4b000 --- /dev/null +++ b/vp9/common/x86/vp9_loopfilter_sse2.asm @@ -0,0 +1,1238 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
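+;
+; These SSE2 variants filter 16 pixels per pass (two 8-pixel runs, or a
+; u row paired with a v row) using the same update as the MMX code; as
+; a scalar sketch reconstructed from the register comments:
+;     f       = clamp(3*(q0 - p0) + (hev ? clamp(p1 - q1) : 0)) & mask
+;     Filter1 = clamp(f + 4) >> 3,    q0 -= Filter1
+;     Filter2 = clamp(f + 3) >> 3,    p0 += Filter2
+;     u       = (Filter1 + 1) >> 1,   p1 += u, q1 -= u    (only if !hev)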
+; + + +%include "vpx_ports/x86_abi_support.asm" + +; Use of pmaxub instead of psubusb to compute filter mask was seen +; in ffvp8 + +%macro LFH_FILTER_AND_HEV_MASK 1 +%if %1 + movdqa xmm2, [rdi+2*rax] ; q3 + movdqa xmm1, [rsi+2*rax] ; q2 + movdqa xmm4, [rsi+rax] ; q1 + movdqa xmm5, [rsi] ; q0 + neg rax ; negate pitch to deal with above border +%else + movlps xmm2, [rsi + rcx*2] ; q3 + movlps xmm1, [rsi + rcx] ; q2 + movlps xmm4, [rsi] ; q1 + movlps xmm5, [rsi + rax] ; q0 + + movhps xmm2, [rdi + rcx*2] + movhps xmm1, [rdi + rcx] + movhps xmm4, [rdi] + movhps xmm5, [rdi + rax] + + lea rsi, [rsi + rax*4] + lea rdi, [rdi + rax*4] + + movdqa XMMWORD PTR [rsp], xmm1 ; store q2 + movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1 +%endif + + movdqa xmm6, xmm1 ; q2 + movdqa xmm3, xmm4 ; q1 + + psubusb xmm1, xmm2 ; q2-=q3 + psubusb xmm2, xmm6 ; q3-=q2 + + psubusb xmm4, xmm6 ; q1-=q2 + psubusb xmm6, xmm3 ; q2-=q1 + + por xmm4, xmm6 ; abs(q2-q1) + por xmm1, xmm2 ; abs(q3-q2) + + movdqa xmm0, xmm5 ; q0 + pmaxub xmm1, xmm4 + + psubusb xmm5, xmm3 ; q0-=q1 + psubusb xmm3, xmm0 ; q1-=q0 + + por xmm5, xmm3 ; abs(q0-q1) + movdqa t0, xmm5 ; save to t0 + + pmaxub xmm1, xmm5 + +%if %1 + movdqa xmm2, [rsi+4*rax] ; p3 + movdqa xmm4, [rdi+4*rax] ; p2 + movdqa xmm6, [rsi+2*rax] ; p1 +%else + movlps xmm2, [rsi + rax] ; p3 + movlps xmm4, [rsi] ; p2 + movlps xmm6, [rsi + rcx] ; p1 + + movhps xmm2, [rdi + rax] + movhps xmm4, [rdi] + movhps xmm6, [rdi + rcx] + + movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2 + movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1 +%endif + + movdqa xmm5, xmm4 ; p2 + movdqa xmm3, xmm6 ; p1 + + psubusb xmm4, xmm2 ; p2-=p3 + psubusb xmm2, xmm5 ; p3-=p2 + + psubusb xmm3, xmm5 ; p1-=p2 + pmaxub xmm1, xmm4 ; abs(p3 - p2) + + psubusb xmm5, xmm6 ; p2-=p1 + pmaxub xmm1, xmm2 ; abs(p3 - p2) + + pmaxub xmm1, xmm5 ; abs(p2 - p1) + movdqa xmm2, xmm6 ; p1 + + pmaxub xmm1, xmm3 ; abs(p2 - p1) +%if %1 + movdqa xmm4, [rsi+rax] ; p0 + movdqa xmm3, [rdi] ; q1 +%else + movlps xmm4, [rsi + rcx*2] ; p0 + movhps xmm4, [rdi + rcx*2] + movdqa xmm3, q1 ; q1 +%endif + + movdqa xmm5, xmm4 ; p0 + psubusb xmm4, xmm6 ; p0-=p1 + + psubusb xmm6, xmm5 ; p1-=p0 + + por xmm6, xmm4 ; abs(p1 - p0) + mov rdx, arg(2) ; get blimit + + movdqa t1, xmm6 ; save to t1 + + movdqa xmm4, xmm3 ; q1 + pmaxub xmm1, xmm6 + + psubusb xmm3, xmm2 ; q1-=p1 + psubusb xmm2, xmm4 ; p1-=q1 + + psubusb xmm1, xmm7 + por xmm2, xmm3 ; abs(p1-q1) + + movdqa xmm7, XMMWORD PTR [rdx] ; blimit + + movdqa xmm3, xmm0 ; q0 + pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero + + mov rdx, arg(4) ; hev get thresh + + movdqa xmm6, xmm5 ; p0 + psrlw xmm2, 1 ; abs(p1-q1)/2 + + psubusb xmm5, xmm3 ; p0-=q0 + + psubusb xmm3, xmm6 ; q0-=p0 + por xmm5, xmm3 ; abs(p0 - q0) + + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + + movdqa xmm4, t0 ; hev get abs (q1 - q0) + + movdqa xmm3, t1 ; get abs (p1 - p0) + + paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + movdqa xmm2, XMMWORD PTR [rdx] ; hev + + psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + psubusb xmm4, xmm2 ; hev + + psubusb xmm3, xmm2 ; hev + por xmm1, xmm5 + + pxor xmm7, xmm7 + paddb xmm4, xmm3 ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + pcmpeqb xmm4, xmm5 ; hev + pcmpeqb xmm3, xmm3 ; hev + + pcmpeqb xmm1, xmm7 ; mask xmm1 + pxor xmm4, xmm3 ; hev +%endmacro + +%macro B_FILTER 1 +%if %1 == 0 + movdqa xmm2, p1 ; p1 + movdqa xmm7, q1 ; q1 +%elif %1 == 1 + movdqa xmm2, [rsi+2*rax] ; p1 + movdqa xmm7, [rdi] ; q1 +%elif %1 == 2 + lea rdx, srct + + movdqa xmm2, [rdx] ; p1 + movdqa xmm7, [rdx+48] ; 
q1 + movdqa xmm6, [rdx+16] ; p0 + movdqa xmm0, [rdx+32] ; q0 +%endif + + pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb xmm2, xmm7 ; p1 - q1 + pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values + + pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1) + pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values + + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + + paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1) + + paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1) + + paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1) + + pand xmm1, xmm2 ; mask filter values we don't care about + + movdqa xmm2, xmm1 + + paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4 + paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3 + + punpckhbw xmm5, xmm2 ; axbxcxdx + punpcklbw xmm2, xmm2 ; exfxgxhx + + punpcklbw xmm0, xmm1 ; exfxgxhx + psraw xmm5, 11 ; sign extended shift right by 3 + + punpckhbw xmm1, xmm1 ; axbxcxdx + psraw xmm2, 11 ; sign extended shift right by 3 + + packsswb xmm2, xmm5 ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3; + psraw xmm0, 11 ; sign extended shift right by 3 + + psraw xmm1, 11 ; sign extended shift right by 3 + movdqa xmm5, xmm0 ; save results + + packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3 + paddsw xmm5, [GLOBAL(ones)] + + paddsw xmm1, [GLOBAL(ones)] + psraw xmm5, 1 ; partial shifted one more time for 2nd tap + + psraw xmm1, 1 ; partial shifted one more time for 2nd tap + + paddsb xmm6, xmm2 ; p0+= p0 add + packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4 + +%if %1 == 0 + movdqa xmm1, p1 ; p1 +%elif %1 == 1 + movdqa xmm1, [rsi+2*rax] ; p1 +%elif %1 == 2 + movdqa xmm1, [rdx] ; p1 +%endif + pandn xmm4, xmm5 ; high edge variance additive + pxor xmm6, [GLOBAL(t80)] ; unoffset + + pxor xmm1, [GLOBAL(t80)] ; reoffset + psubsb xmm3, xmm0 ; q0-= q0 add + + paddsb xmm1, xmm4 ; p1+= p1 add + pxor xmm3, [GLOBAL(t80)] ; unoffset + + pxor xmm1, [GLOBAL(t80)] ; unoffset + psubsb xmm7, xmm4 ; q1-= q1 add + + pxor xmm7, [GLOBAL(t80)] ; unoffset +%if %1 == 0 + lea rsi, [rsi + rcx*2] + lea rdi, [rdi + rcx*2] + movq MMWORD PTR [rsi], xmm6 ; p0 + movhps MMWORD PTR [rdi], xmm6 + movq MMWORD PTR [rsi + rax], xmm1 ; p1 + movhps MMWORD PTR [rdi + rax], xmm1 + movq MMWORD PTR [rsi + rcx], xmm3 ; q0 + movhps MMWORD PTR [rdi + rcx], xmm3 + movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1 + movhps MMWORD PTR [rdi + rcx*2],xmm7 +%elif %1 == 1 + movdqa [rsi+rax], xmm6 ; write back + movdqa [rsi+2*rax], xmm1 ; write back + movdqa [rsi], xmm3 ; write back + movdqa [rdi], xmm7 ; write back +%endif + +%endmacro + + +;void vp9_loop_filter_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp9_loop_filter_horizontal_edge_sse2) +sym(vp9_loop_filter_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step + + mov rdx, arg(3) ;limit + movdqa xmm7, XMMWORD PTR [rdx] + + lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 1 + ; filter and write 
back the result + B_FILTER 1 + + add rsp, 32 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_loop_filter_horizontal_edge_uv_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp9_loop_filter_horizontal_edge_uv_sse2) +sym(vp9_loop_filter_horizontal_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 96 ; reserve 96 bytes + %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16]; + %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16]; + %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16]; + %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16]; + %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16]; + + mov rsi, arg(0) ; u + mov rdi, arg(5) ; v + movsxd rax, dword ptr arg(1) ; src_pixel_step + mov rcx, rax + neg rax ; negate pitch to deal with above border + + mov rdx, arg(3) ;limit + movdqa xmm7, XMMWORD PTR [rdx] + + lea rsi, [rsi + rcx] + lea rdi, [rdi + rcx] + + ; calculate breakout conditions and high edge variance + LFH_FILTER_AND_HEV_MASK 0 + ; filter and write back the result + B_FILTER 0 + + add rsp, 96 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +%macro TRANSPOSE_16X8 2 + movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00 + movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10 + movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20 + movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30 + movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40 + movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50 + + punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + + movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70 + + movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00 + punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20 + + movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60 + + punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 +%if %1 + lea rsi, [rsi+rax*8] +%else + mov rsi, arg(5) ; v_ptr +%endif + + movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40 + punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60 + + punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + + punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44 +%if %1 + lea rdi, [rdi+rax*8] +%else + lea rsi, [rsi - 4] +%endif + + punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 +%if %1 + lea rdx, srct +%else + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing +%endif + + movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + + movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 
+ + punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + + movdqa t0, xmm2 ; save to free XMM2 + movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 + movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 + movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 + + punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 + + punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 + + movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 + + punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 + + movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 + + punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 + + movdqa xmm6, xmm1 ; + punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 + + punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + + punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + movdqa xmm0, xmm5 + punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + + punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 + + punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 + movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + + punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 +%if %2 + movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + movdqa [rdx], xmm2 ; save 2 + + movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + + movdqa [rdx+16], xmm3 ; save 3 + + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + + movdqa [rdx+32], xmm4 ; save 4 + movdqa [rdx+48], xmm5 ; save 5 + movdqa xmm1, t0 ; get + + movdqa xmm2, xmm1 ; + punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 +%else + movdqa [rdx+112], xmm7 ; save 7 + + movdqa [rdx+96], xmm6 ; save 6 + + movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + movdqa [rdx+32], xmm2 ; save 2 + + movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + + movdqa [rdx+48], xmm3 ; save 3 + + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + + movdqa [rdx+64], xmm4 ; save 4 + movdqa [rdx+80], xmm5 ; save 5 + movdqa xmm1, t0 ; 
get + + movdqa xmm2, xmm1 + punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + + movdqa [rdx+16], xmm1 + + movdqa [rdx], xmm2 +%endif +%endmacro + +%macro LFV_FILTER_MASK_HEV_MASK 1 + movdqa xmm0, xmm6 ; q2 + psubusb xmm0, xmm7 ; q2-q3 + + psubusb xmm7, xmm6 ; q3-q2 + movdqa xmm4, xmm5 ; q1 + + por xmm7, xmm0 ; abs (q3-q2) + psubusb xmm4, xmm6 ; q1-q2 + + movdqa xmm0, xmm1 + psubusb xmm6, xmm5 ; q2-q1 + + por xmm6, xmm4 ; abs (q2-q1) + psubusb xmm0, xmm2 ; p2 - p3; + + psubusb xmm2, xmm1 ; p3 - p2; + por xmm0, xmm2 ; abs(p2-p3) +%if %1 + movdqa xmm2, [rdx] ; p1 +%else + movdqa xmm2, [rdx+32] ; p1 +%endif + movdqa xmm5, xmm2 ; p1 + pmaxub xmm0, xmm7 + + psubusb xmm5, xmm1 ; p1-p2 + psubusb xmm1, xmm2 ; p2-p1 + + movdqa xmm7, xmm3 ; p0 + psubusb xmm7, xmm2 ; p0-p1 + + por xmm1, xmm5 ; abs(p2-p1) + pmaxub xmm0, xmm6 + + pmaxub xmm0, xmm1 + movdqa xmm1, xmm2 ; p1 + + psubusb xmm2, xmm3 ; p1-p0 + lea rdx, srct + + por xmm2, xmm7 ; abs(p1-p0) + + movdqa t0, xmm2 ; save abs(p1-p0) + + pmaxub xmm0, xmm2 + +%if %1 + movdqa xmm5, [rdx+32] ; q0 + movdqa xmm7, [rdx+48] ; q1 +%else + movdqa xmm5, [rdx+64] ; q0 + movdqa xmm7, [rdx+80] ; q1 +%endif + mov rdx, arg(3) ; limit + + movdqa xmm6, xmm5 ; q0 + movdqa xmm2, xmm7 ; q1 + + psubusb xmm5, xmm7 ; q0-q1 + psubusb xmm7, xmm6 ; q1-q0 + + por xmm7, xmm5 ; abs(q1-q0) + + movdqa t1, xmm7 ; save abs(q1-q0) + + movdqa xmm4, XMMWORD PTR [rdx]; limit + + pmaxub xmm0, xmm7 + mov rdx, arg(2) ; blimit + + psubusb xmm0, xmm4 + movdqa xmm5, xmm2 ; q1 + + psubusb xmm5, xmm1 ; q1-=p1 + psubusb xmm1, xmm2 ; p1-=q1 + + por xmm5, xmm1 ; abs(p1-q1) + movdqa xmm1, xmm3 ; p0 + + pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero + psubusb xmm1, xmm6 ; p0-q0 + + psrlw xmm5, 1 ; abs(p1-q1)/2 + psubusb xmm6, xmm3 ; q0-p0 + + movdqa xmm4, XMMWORD PTR [rdx]; blimit + + mov rdx, arg(4) ; get thresh + + por xmm1, xmm6 ; abs(q0-p0) + + movdqa xmm6, t0 ; get abs (q1 - q0) + + paddusb xmm1, xmm1 ; abs(q0-p0)*2 + + movdqa xmm3, t1 ; get abs (p1 - p0) + + movdqa xmm7, XMMWORD PTR [rdx] + + paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh + + psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh + + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + por xmm1, xmm0 ; mask + pcmpeqb xmm6, xmm0 + + pxor xmm0, xmm0 + pcmpeqb xmm4, xmm4 + + pcmpeqb xmm1, xmm0 + pxor xmm4, xmm6 +%endmacro + +%macro BV_TRANSPOSE 0 + ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + + movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + + movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + + punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklwd 
xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + + punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 + ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 +%endmacro + +%macro BV_WRITEBACK 2 + movd [rsi+2], %1 + psrldq %1, 4 + + movd [rdi+2], %1 + psrldq %1, 4 + + movd [rsi+2*rax+2], %1 + psrldq %1, 4 + + movd [rdi+2*rax+2], %1 + + movd [rsi+4*rax+2], %2 + psrldq %2, 4 + + movd [rdi+4*rax+2], %2 + psrldq %2, 4 + + movd [rsi+2*rcx+2], %2 + psrldq %2, 4 + + movd [rdi+2*rcx+2], %2 +%endmacro + + +;void vp9_loop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp9_loop_filter_vertical_edge_sse2) +sym(vp9_loop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 96 ; reserve 96 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. + TRANSPOSE_16X8 1, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK 1 + + ; start work on filters + B_FILTER 2 + + ; tranpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + ; store 16-line result + + lea rdx, [rax] + neg rdx + + BV_WRITEBACK xmm1, xmm5 + + lea rsi, [rsi+rdx*8] + lea rdi, [rdi+rdx*8] + BV_WRITEBACK xmm2, xmm6 + + add rsp, 96 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_loop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp9_loop_filter_vertical_edge_uv_sse2) +sym(vp9_loop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 96 ; reserve 96 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + lea rdx, srct + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
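+    ; TRANSPOSE_16X8 %1, %2: %1 = 1 reads all 16 rows from src_ptr,
+    ; %1 = 0 reads 8 rows each from the u (arg 0) and v (arg 5) planes.
+    ; %2 = 1 stages only the middle rows p1..q1 in srct and keeps the
+    ; outer rows in registers; %2 = 0 spills all eight rows to srct.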
+ TRANSPOSE_16X8 0, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK 1 + + ; start work on filters + B_FILTER 2 + + ; tranpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + ; store 16-line result + BV_WRITEBACK xmm1, xmm5 + + mov rsi, arg(0) ; u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + BV_WRITEBACK xmm2, xmm6 + + add rsp, 96 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_loop_filter_simple_horizontal_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +;) +global sym(vp9_loop_filter_simple_horizontal_edge_sse2) +sym(vp9_loop_filter_simple_horizontal_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + mov rdx, arg(2) ;blimit + movdqa xmm3, XMMWORD PTR [rdx] + + mov rdi, rsi ; rdi points to row +1 for indirect addressing + add rdi, rax + neg rax + + ; calculate mask + movdqa xmm1, [rsi+2*rax] ; p1 + movdqa xmm0, [rdi] ; q1 + movdqa xmm2, xmm1 + movdqa xmm7, xmm0 + movdqa xmm4, xmm0 + psubusb xmm0, xmm1 ; q1-=p1 + psubusb xmm1, xmm4 ; p1-=q1 + por xmm1, xmm0 ; abs(p1-q1) + pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw xmm1, 1 ; abs(p1-q1)/2 + + movdqa xmm5, [rsi+rax] ; p0 + movdqa xmm4, [rsi] ; q0 + movdqa xmm0, xmm4 ; q0 + movdqa xmm6, xmm5 ; p0 + psubusb xmm5, xmm4 ; p0-=q0 + psubusb xmm4, xmm6 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm3, xmm3 + pcmpeqb xmm5, xmm3 + + ; start work on filters + pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values + psubsb xmm2, xmm7 ; p1 - q1 + + pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values + pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values + movdqa xmm3, xmm0 ; q0 + psubsb xmm0, xmm6 ; q0 - p0 + paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) + pand xmm5, xmm2 ; mask filter values we don't care about + + ; do + 4 side + paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movdqa xmm0, xmm5 ; get a copy of filters + psllw xmm0, 8 ; shift left 8 + psraw xmm0, 3 ; arithmetic shift right 11 + psrlw xmm0, 8 + movdqa xmm1, xmm5 ; get a copy of filters + psraw xmm1, 11 ; arithmetic shift right 11 + psllw xmm1, 8 ; shift left 8 to put it back + + por xmm0, xmm1 ; put the two together to get result + + psubsb xmm3, xmm0 ; q0-= q0 add + pxor xmm3, [GLOBAL(t80)] ; unoffset + movdqa [rsi], xmm3 ; write back + + ; now do +3 side + psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 + + movdqa xmm0, xmm5 ; get a copy of filters + psllw xmm0, 8 ; shift left 8 + psraw xmm0, 3 ; arithmetic shift right 11 + psrlw xmm0, 8 + psraw xmm5, 11 ; arithmetic shift right 11 + psllw xmm5, 8 ; shift left 8 to put it back + por xmm0, xmm5 ; put the two together to get result + + + paddsb xmm6, xmm0 ; p0+= p0 add + pxor xmm6, [GLOBAL(t80)] ; unoffset + movdqa [rsi+rax], xmm6 ; write back + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + 
UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_loop_filter_simple_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +;) +global sym(vp9_loop_filter_simple_vertical_edge_sse2) +sym(vp9_loop_filter_simple_vertical_edge_sse2): + push rbp ; save old base pointer value. + mov rbp, rsp ; set new base pointer value. + SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 7 + GET_GOT rbx ; save callee-saved reg + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 32 ; reserve 32 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + + mov rsi, arg(0) ;src_ptr + movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? + + lea rsi, [rsi - 2 ] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 + movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 + movd xmm2, [rdi] ; 13 12 11 10 + movd xmm3, [rcx] ; 53 52 51 50 + punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 + punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 + + movd xmm4, [rsi + rax*2] ; 23 22 21 20 + movd xmm5, [rdx + rax*2] ; 63 62 61 60 + movd xmm6, [rdi + rax*2] ; 33 32 31 30 + movd xmm7, [rcx + rax*2] ; 73 72 71 70 + punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 + punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 + + punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 + punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 + + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 + + movdqa xmm2, xmm0 + punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + movdqa t0, xmm0 ; save to t0 + movdqa t1, xmm2 ; save to t1 + + lea rsi, [rsi + rax*8] + lea rdi, [rsi + rax] + lea rdx, [rsi + rax*4] + lea rcx, [rdx + rax] + + movd xmm4, [rsi] ; 83 82 81 80 + movd xmm1, [rdx] ; c3 c2 c1 c0 + movd xmm6, [rdi] ; 93 92 91 90 + movd xmm3, [rcx] ; d3 d2 d1 d0 + punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 + punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 + + movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 + movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 + movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 + movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 + punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 + punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 + + punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 + punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 + + movdqa xmm1, xmm4 + punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + + movdqa xmm6, xmm4 + punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + + movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + + punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + ; calculate mask + movdqa xmm6, xmm0 ; 
p1 + movdqa xmm7, xmm3 ; q1 + psubusb xmm7, xmm0 ; q1-=p1 + psubusb xmm6, xmm3 ; p1-=q1 + por xmm6, xmm7 ; abs(p1-q1) + pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero + psrlw xmm6, 1 ; abs(p1-q1)/2 + + movdqa xmm5, xmm1 ; p0 + movdqa xmm4, xmm2 ; q0 + psubusb xmm5, xmm2 ; p0-=q0 + psubusb xmm4, xmm1 ; q0-=p0 + por xmm5, xmm4 ; abs(p0 - q0) + paddusb xmm5, xmm5 ; abs(p0-q0)*2 + paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + + mov rdx, arg(2) ;blimit + movdqa xmm7, XMMWORD PTR [rdx] + + psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit + pxor xmm7, xmm7 + pcmpeqb xmm5, xmm7 ; mm5 = mask + + ; start work on filters + movdqa t0, xmm0 + movdqa t1, xmm3 + + pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values + pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values + + psubsb xmm0, xmm3 ; p1 - q1 + movdqa xmm6, xmm1 ; p0 + + movdqa xmm7, xmm2 ; q0 + pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values + + pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values + movdqa xmm3, xmm7 ; offseted ; q0 + + psubsb xmm7, xmm6 ; q0 - p0 + paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) + + paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0) + paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) + + pand xmm5, xmm0 ; mask filter values we don't care about + + + paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 + + movdqa xmm0, xmm5 ; get a copy of filters + psllw xmm0, 8 ; shift left 8 + + psraw xmm0, 3 ; arithmetic shift right 11 + psrlw xmm0, 8 + + movdqa xmm7, xmm5 ; get a copy of filters + psraw xmm7, 11 ; arithmetic shift right 11 + + psllw xmm7, 8 ; shift left 8 to put it back + por xmm0, xmm7 ; put the two together to get result + + psubsb xmm3, xmm0 ; q0-= q0sz add + pxor xmm3, [GLOBAL(t80)] ; unoffset q0 + + ; now do +3 side + psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 + movdqa xmm0, xmm5 ; get a copy of filters + + psllw xmm0, 8 ; shift left 8 + psraw xmm0, 3 ; arithmetic shift right 11 + + psrlw xmm0, 8 + psraw xmm5, 11 ; arithmetic shift right 11 + + psllw xmm5, 8 ; shift left 8 to put it back + por xmm0, xmm5 ; put the two together to get result + + paddsb xmm6, xmm0 ; p0+= p0 add + pxor xmm6, [GLOBAL(t80)] ; unoffset p0 + + movdqa xmm0, t0 ; p1 + movdqa xmm4, t1 ; q1 + + ; transpose back to write out + ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 + punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 + + movdqa xmm5, xmm3 + punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + movdqa xmm2, xmm0 + punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 + punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 + + movdqa xmm3, xmm1 + punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 + punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 + + ; write out order: xmm0 xmm2 xmm1 xmm3 + lea rdx, [rsi + rax*4] + + movd [rsi], xmm1 ; write the second 8-line result + psrldq xmm1, 4 + movd [rdi], xmm1 + psrldq xmm1, 4 + movd [rsi + rax*2], xmm1 + psrldq xmm1, 4 + movd [rdi + rax*2], xmm1 + + movd [rdx], xmm3 + psrldq xmm3, 4 + movd [rcx], xmm3 + psrldq xmm3, 4 + movd [rdx + rax*2], xmm3 + psrldq xmm3, 4 + movd 
[rcx + rax*2], xmm3
+
+        neg             rax
+        lea             rsi,        [rsi + rax*8]
+        neg             rax
+        lea             rdi,        [rsi + rax]
+        lea             rdx,        [rsi + rax*4]
+        lea             rcx,        [rdx + rax]
+
+        movd            [rsi],      xmm0    ; write the first 8-line result
+        psrldq          xmm0,       4
+        movd            [rdi],      xmm0
+        psrldq          xmm0,       4
+        movd            [rsi + rax*2], xmm0
+        psrldq          xmm0,       4
+        movd            [rdi + rax*2], xmm0
+
+        movd            [rdx],      xmm2
+        psrldq          xmm2,       4
+        movd            [rcx],      xmm2
+        psrldq          xmm2,       4
+        movd            [rdx + rax*2], xmm2
+        psrldq          xmm2,       4
+        movd            [rcx + rax*2], xmm2
+
+        add rsp, 32
+        pop rsp
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop rbp
+    ret
+
+SECTION_RODATA
+align 16
+tfe:
+    times 16 db 0xfe
+align 16
+t80:
+    times 16 db 0x80
+align 16
+t1s:
+    times 16 db 0x01
+align 16
+t3:
+    times 16 db 0x03
+align 16
+t4:
+    times 16 db 0x04
+align 16
+ones:
+    times 8 dw 0x0001
+align 16
+s9:
+    times 8 dw 0x0900
+align 16
+s63:
+    times 8 dw 0x003f
diff --git a/vp9/common/x86/vp9_loopfilter_x86.c b/vp9/common/x86/vp9_loopfilter_x86.c
new file mode 100644
index 000000000..af8bb6922
--- /dev/null
+++ b/vp9/common/x86/vp9_loopfilter_x86.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+#include "vpx_config.h"
+#include "vp9/common/vp9_loopfilter.h"
+
+prototype_loopfilter(vp9_loop_filter_vertical_edge_mmx);
+prototype_loopfilter(vp9_loop_filter_horizontal_edge_mmx);
+
+prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2);
+prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2);
+
+extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2;
+extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2;
+
+#if HAVE_MMX
+/* Horizontal MB filtering */
+void vp9_loop_filter_mbh_mmx(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+}
+
+/* Vertical MB Filtering */
+void vp9_loop_filter_mbv_mmx(unsigned char *y_ptr,
+                             unsigned char *u_ptr, unsigned char *v_ptr,
+                             int y_stride, int uv_stride,
+                             struct loop_filter_info *lfi) {
+}
+
+/* Horizontal B Filtering */
+void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
+                            unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride,
+                            struct loop_filter_info *lfi) {
+
+}
+
+void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
+                             const unsigned char *blimit) {
+  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
+                                             y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
+                                             y_stride, blimit);
+  vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
+                                             y_stride, blimit);
+}
+
+/* Vertical B Filtering */
+void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
+                            unsigned char *u_ptr, unsigned char *v_ptr,
+                            int y_stride, int uv_stride,
+                            struct loop_filter_info *lfi) {
+  vp9_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+  vp9_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride,
+                                    lfi->blim, lfi->lim, lfi->hev_thr, 2);
+
+  if (u_ptr)
+    vp9_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride,
+                                      lfi->blim, lfi->lim, lfi->hev_thr, 1);
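+  /* chroma planes are half size, so u and v each get a single inner edge
+   * at x = 4, with count = 1 rather than the luma's 2; v follows below */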
+ + if (v_ptr) + vp9_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); + vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); + vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); +} +#endif + +#if HAVE_SSE2 +void vp9_mbloop_filter_horizontal_edge_c_sse2(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, + int count) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + __m128i mask, hev, flat; + const __m128i zero = _mm_set1_epi16(0); + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + const unsigned int extended_thresh = _thresh[0] * 0x01010101u; + const unsigned int extended_limit = _limit[0] * 0x01010101u; + const unsigned int extended_blimit = _blimit[0] * 0x01010101u; + const __m128i thresh = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); + const __m128i limit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); + const __m128i blimit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); + + p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + work = 
_mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), + _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), + _mm_subs_epu8(q0, q4))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + int i = 0; + do { + __m128i workp_a, workp_b, workp_shft; + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + src += 8; + } while (++i < count); + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = 
_mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_load_si128((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_load_si128((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_load_si128((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_load_si128((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_load_si128((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_load_si128((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + if (count == 1) { + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + } else { + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } + } +} + +static __inline void transpose(unsigned char *src[], int in_p, + unsigned char *dst[], int out_p, + int num_8x8_to_transpose) { + int idx8x8 = 
0; + __m128i x0, x1, x2, x3, x4, x5, x6, x7; + do { + unsigned char *in = src[idx8x8]; + unsigned char *out = dst[idx8x8]; + + x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07 + x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17 + x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27 + x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37 + x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47 + x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57 + x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67 + x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77 + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + x0 = _mm_unpacklo_epi8(x0, x1); + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + x1 = _mm_unpacklo_epi8(x2, x3); + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + x2 = _mm_unpacklo_epi8(x4, x5); + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + x3 = _mm_unpacklo_epi8(x6, x7); + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + x4 = _mm_unpacklo_epi16(x0, x1); + // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + x5 = _mm_unpacklo_epi16(x2, x3); + // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + x6 = _mm_unpacklo_epi32(x4, x5); + // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + x7 = _mm_unpackhi_epi32(x4, x5); + + _mm_storel_pd((double *)(out + 0*out_p), + _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 + _mm_storeh_pd((double *)(out + 1*out_p), + _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 + _mm_storel_pd((double *)(out + 2*out_p), + _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 + _mm_storeh_pd((double *)(out + 3*out_p), + _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 + + // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + x4 = _mm_unpackhi_epi16(x0, x1); + // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + x5 = _mm_unpackhi_epi16(x2, x3); + // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + x6 = _mm_unpacklo_epi32(x4, x5); + // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + x7 = _mm_unpackhi_epi32(x4, x5); + + _mm_storel_pd((double *)(out + 4*out_p), + _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 + _mm_storeh_pd((double *)(out + 5*out_p), + _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 + _mm_storel_pd((double *)(out + 6*out_p), + _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 + _mm_storeh_pd((double *)(out + 7*out_p), + _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 + } while (++idx8x8 < num_8x8_to_transpose); +} + +void vp9_mbloop_filter_vertical_edge_c_sse2(unsigned char *s, + int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { + DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 16]); + unsigned char *src[4]; + unsigned char *dst[4]; + + src[0] = s - 5; + src[1] = s - 5 + 8; + src[2] = s - 5 + p*8; + src[3] = s - 5 + p*8 + 8; + + dst[0] = t_dst; + dst[1] = t_dst + 16*8; + dst[2] = t_dst + 8; + dst[3] = t_dst + 16*8 + 8; + + // 16x16->16x16 or 16x8->8x16 + transpose(src, p, dst, 16, (1 << count)); + + vp9_mbloop_filter_horizontal_edge_c_sse2(t_dst + 5*16, 16, blimit, limit, + thresh, count); + + dst[0] = s - 5; + dst[1] = s - 5 + p*8; + + src[0] = t_dst; + src[1] = t_dst + 8; + + // 16x8->8x16 or 8x8->8x8 + transpose(src, 16, dst, p, (1 << (count - 1))); +} + +/* Horizontal MB filtering */ +void vp9_loop_filter_mbh_sse2(unsigned char *y_ptr, + unsigned char *u_ptr, unsigned char 
*v_ptr, + int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_mbloop_filter_horizontal_edge_c_sse2(y_ptr, y_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 2); + + /* TODO: write sse2 version with u,v interleaved */ + if (u_ptr) + vp9_mbloop_filter_horizontal_edge_c_sse2(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_mbloop_filter_horizontal_edge_c_sse2(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +void vp9_loop_filter_bh8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_mbloop_filter_horizontal_edge_c_sse2( + y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +} + +/* Vertical MB Filtering */ +void vp9_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_mbloop_filter_vertical_edge_c_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + /* TODO: write sse2 version with u,v interleaved */ + if (u_ptr) + vp9_mbloop_filter_vertical_edge_c_sse2(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp9_mbloop_filter_vertical_edge_c_sse2(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +void vp9_loop_filter_bv8x8_sse2(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_mbloop_filter_vertical_edge_c_sse2( + y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2); +} + +/* Horizontal B Filtering */ +void vp9_loop_filter_bh_sse2(unsigned char *y_ptr, + unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, + v_ptr + 4 * uv_stride); +} + +void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, + y_stride, blimit); + vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, + y_stride, blimit); + vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, + y_stride, blimit); +} + +/* Vertical B Filtering */ +void vp9_loop_filter_bv_sse2(unsigned char *y_ptr, + unsigned char *u_ptr, unsigned char *v_ptr, + int y_stride, int uv_stride, + struct loop_filter_info *lfi) { + vp9_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + vp9_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp9_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, + v_ptr + 4); +} + +void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); + vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); + 
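+  /* x = 12 completes the three inner vertical edges; the simple filter
+   * takes only blimit, with no limit or hev thresholds */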
vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); +} + +#endif diff --git a/vp9/common/x86/vp9_loopfilter_x86.h b/vp9/common/x86/vp9_loopfilter_x86.h new file mode 100644 index 000000000..25cf383c9 --- /dev/null +++ b/vp9/common/x86/vp9_loopfilter_x86.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef LOOPFILTER_X86_H +#define LOOPFILTER_X86_H + +/* Note: + * + * This platform is commonly built for runtime CPU detection. If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx); +extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx); +extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx); +extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx); +extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx); +extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx); +extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx); +extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx); +#endif + +#if HAVE_SSE2 +extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2); +extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2); +extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2); +extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2); +extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2); +extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2); +extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2); +extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2); +#endif + +#endif // LOOPFILTER_X86_H diff --git a/vp9/common/x86/vp9_mask_sse3.asm b/vp9/common/x86/vp9_mask_sse3.asm new file mode 100644 index 000000000..0d90cfa86 --- /dev/null +++ b/vp9/common/x86/vp9_mask_sse3.asm @@ -0,0 +1,484 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void int vp8_makemask_sse3( +; unsigned char *y, +; unsigned char *u, +; unsigned char *v, +; unsigned char *ym, +; unsigned char *uvm, +; int yp, +; int uvp, +; int ys, +; int us, +; int vs, +; int yt, +; int ut, +; int vt) +global sym(vp8_makemask_sse3) +sym(vp8_makemask_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 14 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;y + mov rdi, arg(1) ;u + mov rcx, arg(2) ;v + mov rax, arg(3) ;ym + movsxd rbx, dword arg(4) ;yp + movsxd rdx, dword arg(5) ;uvp + + pxor xmm0,xmm0 + + ;make 16 copies of the center y value + movd xmm1, arg(6) + pshufb xmm1, xmm0 + + ; make 16 copies of the center u value + movd xmm2, arg(7) + pshufb xmm2, xmm0 + + ; make 16 copies of the center v value + movd xmm3, arg(8) + pshufb xmm3, xmm0 + unpcklpd xmm2, xmm3 + + ;make 16 copies of the y tolerance + movd xmm3, arg(9) + pshufb xmm3, xmm0 + + ;make 16 copies of the u tolerance + movd xmm4, arg(10) + pshufb xmm4, xmm0 + + ;make 16 copies of the v tolerance + movd xmm5, arg(11) + pshufb xmm5, xmm0 + unpckhpd xmm4, xmm5 + + mov r8,8 + +NextPairOfRows: + + ;grab the y source values + movdqu xmm0, [rsi] + + ;compute abs difference between source and y target + movdqa xmm6, xmm1 + movdqa xmm7, xmm0 + psubusb xmm0, xmm1 + psubusb xmm6, xmm7 + por xmm0, xmm6 + + ;compute abs difference between + movdqa xmm6, xmm3 + pcmpgtb xmm6, xmm0 + + ;grab the y source values + add rsi, rbx + movdqu xmm0, [rsi] + + ;compute abs difference between source and y target + movdqa xmm11, xmm1 + movdqa xmm7, xmm0 + psubusb xmm0, xmm1 + psubusb xmm11, xmm7 + por xmm0, xmm11 + + ;compute abs difference between + movdqa xmm11, xmm3 + pcmpgtb xmm11, xmm0 + + + ;grab the u and v source values + movdqu xmm7, [rdi] + movdqu xmm8, [rcx] + unpcklpd xmm7, xmm8 + + ;compute abs difference between source and uv targets + movdqa xmm9, xmm2 + movdqa xmm10, xmm7 + psubusb xmm7, xmm2 + psubusb xmm9, xmm10 + por xmm7, xmm9 + + ;check whether the number is < tolerance + movdqa xmm0, xmm4 + pcmpgtb xmm0, xmm7 + + ;double u and v masks + movdqa xmm8, xmm0 + punpckhbw xmm0, xmm0 + punpcklbw xmm8, xmm8 + + ;mask row 0 and output + pand xmm6, xmm8 + pand xmm6, xmm0 + movdqa [rax],xmm6 + + ;mask row 1 and output + pand xmm11, xmm8 + pand xmm11, xmm0 + movdqa [rax+16],xmm11 + + + ; to the next row or set of rows + add rsi, rbx + add rdi, rdx + add rcx, rdx + add rax,32 + dec r8 + jnz NextPairOfRows + + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;GROW_HORIZ (register for result, source register or mem local) +; takes source and shifts left and ors with source +; then shifts right and ors with source +%macro GROW_HORIZ 2 + movdqa %1, %2 + movdqa xmm14, %1 + movdqa xmm15, %1 + pslldq xmm14, 1 + psrldq xmm15, 1 + por %1,xmm14 + por %1,xmm15 +%endmacro +;GROW_VERT (result, center row, above row, below row) +%macro GROW_VERT 4 + movdqa %1,%2 + por %1,%3 + por %1,%4 +%endmacro + +;GROW_NEXTLINE (new line to grow, new source, line to write) +%macro GROW_NEXTLINE 3 + GROW_HORIZ %1, %2 + GROW_VERT xmm3, xmm0, xmm1, xmm2 + movdqa %3,xmm3 +%endmacro + + +;void int vp8_growmaskmb_sse3( +; unsigned char *om, +; unsigned char *nm, +global sym(vp8_growmaskmb_sse3) +sym(vp8_growmaskmb_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src + mov rdi, arg(1) ;rst + + GROW_HORIZ xmm0, [rsi] + GROW_HORIZ xmm1, [rsi+16] + GROW_HORIZ xmm2, [rsi+32] + + GROW_VERT xmm3, xmm0, xmm1, 
xmm2 + por xmm0,xmm1 + movdqa [rdi], xmm0 + movdqa [rdi+16],xmm3 + + GROW_NEXTLINE xmm0,[rsi+48],[rdi+32] + GROW_NEXTLINE xmm1,[rsi+64],[rdi+48] + GROW_NEXTLINE xmm2,[rsi+80],[rdi+64] + GROW_NEXTLINE xmm0,[rsi+96],[rdi+80] + GROW_NEXTLINE xmm1,[rsi+112],[rdi+96] + GROW_NEXTLINE xmm2,[rsi+128],[rdi+112] + GROW_NEXTLINE xmm0,[rsi+144],[rdi+128] + GROW_NEXTLINE xmm1,[rsi+160],[rdi+144] + GROW_NEXTLINE xmm2,[rsi+176],[rdi+160] + GROW_NEXTLINE xmm0,[rsi+192],[rdi+176] + GROW_NEXTLINE xmm1,[rsi+208],[rdi+192] + GROW_NEXTLINE xmm2,[rsi+224],[rdi+208] + GROW_NEXTLINE xmm0,[rsi+240],[rdi+224] + + por xmm0,xmm2 + movdqa [rdi+240], xmm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + +;unsigned int vp8_sad16x16_masked_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; unsigned char *mask) +global sym(vp8_sad16x16_masked_wmt) +sym(vp8_sad16x16_masked_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rbx, arg(4) ;mask + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + mov rcx, 16 + + pxor xmm3, xmm3 + +NextSadRow: + movdqu xmm0, [rsi] + movdqu xmm1, [rdi] + movdqu xmm2, [rbx] + pand xmm0, xmm2 + pand xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm3, xmm0 + + add rsi, rax + add rdi, rdx + add rbx, 16 + + dec rcx + jnz NextSadRow + + movdqa xmm4 , xmm3 + psrldq xmm4, 8 + paddw xmm3, xmm4 + movq rax, xmm3 + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad16x16_unmasked_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; unsigned char *mask) +global sym(vp8_sad16x16_unmasked_wmt) +sym(vp8_sad16x16_unmasked_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rbx, arg(4) ;mask + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + mov rcx, 16 + + pxor xmm3, xmm3 + +next_vp8_sad16x16_unmasked_wmt: + movdqu xmm0, [rsi] + movdqu xmm1, [rdi] + movdqu xmm2, [rbx] + por xmm0, xmm2 + por xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm3, xmm0 + + add rsi, rax + add rdi, rdx + add rbx, 16 + + dec rcx + jnz next_vp8_sad16x16_unmasked_wmt + + movdqa xmm4 , xmm3 + psrldq xmm4, 8 + paddw xmm3, xmm4 + movq rax, xmm3 + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_masked_predictor_wmt( +; unsigned char *masked, +; unsigned char *unmasked, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; unsigned char *mask) +global sym(vp8_masked_predictor_wmt) +sym(vp8_masked_predictor_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;ref_ptr + + mov rbx, arg(5) ;mask + movsxd rax, dword ptr arg(2) ;src_stride + mov r11, arg(3) ; destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + mov rcx, 16 + + pxor xmm3, xmm3 + +next_vp8_masked_predictor_wmt: + movdqu xmm0, [rsi] + movdqu xmm1, [rdi] + movdqu xmm2, [rbx] + + pand xmm0, xmm2 + pandn xmm2, xmm1 + por xmm0, xmm2 + movdqu [r11], xmm0 + + add r11, rdx + add rsi, rax + add rdi, rdx + add rbx, 16 + + dec rcx + jnz next_vp8_masked_predictor_wmt + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_masked_predictor_uv_wmt( +; unsigned char *masked, +; unsigned 
char *unmasked, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; unsigned char *mask) +global sym(vp8_masked_predictor_uv_wmt) +sym(vp8_masked_predictor_uv_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;ref_ptr + + mov rbx, arg(5) ;mask + movsxd rax, dword ptr arg(2) ;src_stride + mov r11, arg(3) ; destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + mov rcx, 8 + + pxor xmm3, xmm3 + +next_vp8_masked_predictor_uv_wmt: + movq xmm0, [rsi] + movq xmm1, [rdi] + movq xmm2, [rbx] + + pand xmm0, xmm2 + pandn xmm2, xmm1 + por xmm0, xmm2 + movq [r11], xmm0 + + add r11, rdx + add rsi, rax + add rdi, rax + add rbx, 8 + + dec rcx + jnz next_vp8_masked_predictor_uv_wmt + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_uv_from_y_mask( +; unsigned char *ymask, +; unsigned char *uvmask) +global sym(vp8_uv_from_y_mask) +sym(vp8_uv_from_y_mask): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + + mov rcx, 8 + + pxor xmm3, xmm3 + +next_p8_uv_from_y_mask: + movdqu xmm0, [rsi] + pshufb xmm0, [shuf1b] ;[GLOBAL(shuf1b)] + movq [rdi],xmm0 + add rdi, 8 + add rsi,32 + + dec rcx + jnz next_p8_uv_from_y_mask + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +shuf1b: + db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 + diff --git a/vp9/common/x86/vp9_postproc_mmx.asm b/vp9/common/x86/vp9_postproc_mmx.asm new file mode 100644 index 000000000..fa2152bab --- /dev/null +++ b/vp9/common/x86/vp9_postproc_mmx.asm @@ -0,0 +1,534 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define VP9_FILTER_WEIGHT 128 +%define VP9_FILTER_SHIFT 7 + +;void vp9_post_proc_down_and_across_mmx +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int rows, +; int cols, +; int flimit +;) +global sym(vp9_post_proc_down_and_across_mmx) +sym(vp9_post_proc_down_and_across_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + ; move the global rd onto the stack, since we don't have enough registers + ; to do PIC addressing + movq mm0, [GLOBAL(rd)] + sub rsp, 8 + movq [rsp], mm0 +%define RD [rsp] +%else +%define RD [GLOBAL(rd)] +%endif + + push rbx + lea rbx, [GLOBAL(Blur)] + movd mm2, dword ptr arg(6) ;flimit + punpcklwd mm2, mm2 + punpckldq mm2, mm2 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rcx, DWORD PTR arg(4) ;rows + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
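+    ; Vertical pass: a 5-tap blur using the Blur table taps on rows
+    ; r-2..r+2; for each neighbour row the absolute difference from the
+    ; centre row is compared against flimit (broadcast in mm2), and any
+    ; pixel whose difference exceeds it keeps its source value instead
+    ; of the blurred one.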
+ pxor mm0, mm0 ; mm0 = 00000000 + +.nextrow: + + xor rdx, rdx ; clear out rdx for use as loop counter +.nextcol: + + pxor mm7, mm7 ; mm7 = 00000000 + movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps + movq mm3, [rsi] ; mm4 = r0 p0..p7 + punpcklbw mm3, mm0 ; mm3 = p0..p3 + movq mm1, mm3 ; mm1 = p0..p3 + pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers + + movq mm6, [rbx + 48] ; mm6 = kernel 3 taps + movq mm5, [rsi + rax] ; mm4 = r1 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 + pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers + paddusw mm3, mm6 ; mm3 += mm6 + + ; thresholding + movq mm7, mm1 ; mm7 = r0 p0..p3 + psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 + psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 + paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) + pcmpgtw mm7, mm2 + + movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers + movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 + pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 + psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3 + paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + neg rax + movq mm6, [rbx ] ; kernel 0 taps + movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 + pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + movq mm6, [rbx + 16] ; kernel 1 taps + movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 + punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 + pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. 
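+    ; kernel 1 (row r-1) is the last tap; once accumulated, the sum is
+    ; rounded with RD, shifted down by VP9_FILTER_SHIFT and blended with
+    ; the source pixels under the threshold mask collected in mm7.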
+ paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3 + paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + paddusw mm3, RD ; mm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 + + pand mm1, mm7 ; mm1 select vals > thresh from source + pandn mm7, mm3 ; mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; combination + + packuswb mm1, mm0 ; pack to bytes + + movd [rdi], mm1 ; + neg rax ; pitch is positive + + + add rsi, 4 + add rdi, 4 + add rdx, 4 + + cmp edx, dword ptr arg(5) ;cols + jl .nextcol + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + + push rax + xor rdx, rdx + mov rax, [rdi-4]; + +.acrossnextcol: + pxor mm7, mm7 ; mm7 = 00000000 + movq mm6, [rbx + 32 ] ; + movq mm4, [rdi+rdx] ; mm4 = p0..p7 + movq mm3, mm4 ; mm3 = p0..p7 + punpcklbw mm3, mm0 ; mm3 = p0..p3 + movq mm1, mm3 ; mm1 = p0..p3 + pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers + + movq mm6, [rbx + 48] + psrlq mm4, 8 ; mm4 = p1..p7 + movq mm5, mm4 ; mm5 = p1..p7 + punpcklbw mm5, mm0 ; mm5 = p1..p4 + pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers + paddusw mm3, mm6 ; mm3 += mm6 + + ; thresholding + movq mm7, mm1 ; mm7 = p0..p3 + psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) + pcmpgtw mm7, mm2 + + movq mm6, [rbx + 64 ] + psrlq mm4, 8 ; mm4 = p2..p7 + movq mm5, mm4 ; mm5 = p2..p7 + punpcklbw mm5, mm0 ; mm5 = p2..p5 + pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + movq mm6, [rbx ] + movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 + movq mm5, mm4 ; mm5 = p-2..p5 + punpcklbw mm5, mm0 ; mm5 = p-2..p1 + pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + movq mm6, [rbx + 16] + psrlq mm4, 8 ; mm4 = p-1..p5 + punpcklbw mm4, mm0 ; mm4 = p-1..p2 + pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4 + psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + paddusw mm3, RD ; mm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 + + pand mm1, mm7 ; mm1 select vals > thresh from source + pandn mm7, mm3 ; mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; combination + + packuswb mm1, mm0 ; pack to bytes + mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes + movd eax, mm1 + + add rdx, 4 + cmp edx, dword ptr arg(5) ;cols + jl .acrossnextcol; + + mov DWORD PTR [rdi+rdx-4], eax + pop rax + + ; done with this rwo + add rsi,rax ; next line + movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch? + add rdi,rax ; next destination + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? 
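+    ; one full row done (down pass, then the in-place across pass);
+    ; rsi/rdi now point at the next source/destination rows and rax
+    ; again holds the source pitch.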
+ + dec rcx ; decrement count + jnz .nextrow ; next row + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef RD + + +;void vp9_mbpost_proc_down_mmx(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp9_rv) +global sym(vp9_mbpost_proc_down_mmx) +sym(vp9_mbpost_proc_down_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 136 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax +%define flimit2 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp9_rv))] +%endif + + ;rows +=8; + add dword ptr arg(2), 8 + + ;for(c=0; c thresh from source + pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; combination + + packuswb xmm1, xmm0 ; pack to bytes + movq QWORD PTR [rdi], xmm1 ; + + neg rax ; pitch is positive + add rsi, 8 + add rdi, 8 + + add rdx, 8 + cmp edx, dword arg(5) ;cols + + jl .nextcol + + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + xor rdx, rdx + movq mm0, QWORD PTR [rdi-8]; + +.acrossnextcol: + movq xmm7, QWORD PTR [rdi +rdx -2] + movd xmm4, DWORD PTR [rdi +rdx +6] + + pslldq xmm4, 8 + por xmm4, xmm7 + + movdqa xmm3, xmm4 + psrldq xmm3, 2 + punpcklbw xmm3, xmm0 ; mm3 = p0..p3 + movdqa xmm1, xmm3 ; mm1 = p0..p3 + psllw xmm3, 2 + + + movdqa xmm5, xmm4 + psrldq xmm5, 3 + punpcklbw xmm5, xmm0 ; mm5 = p1..p4 + paddusw xmm3, xmm5 ; mm3 += mm6 + + ; thresholding + movdqa xmm7, xmm1 ; mm7 = p0..p3 + psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4) + pcmpgtw xmm7, xmm2 + + movdqa xmm5, xmm4 + psrldq xmm5, 4 + punpcklbw xmm5, xmm0 ; mm5 = p2..p5 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + movdqa xmm5, xmm4 ; mm5 = p-2..p5 + punpcklbw xmm5, xmm0 ; mm5 = p-2..p1 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + psrldq xmm4, 1 ; mm4 = p-1..p5 + punpcklbw xmm4, xmm0 ; mm4 = p-1..p2 + paddusw xmm3, xmm4 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4 + psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + paddusw xmm3, RD42 ; mm3 += round value + psraw xmm3, 3 ; mm3 /= 8 + + pand xmm1, xmm7 ; mm1 select vals > thresh from source + pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; combination + + packuswb xmm1, xmm0 ; pack to bytes + movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes + movdq2q mm0, xmm1 + + add rdx, 8 + cmp edx, dword arg(5) ;cols + jl .acrossnextcol; + + ; last 8 pixels + movq QWORD PTR [rdi+rdx-8], mm0 + + ; done with this rwo + add rsi,rax ; next line + mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch? 
+ add rdi,rax ; next destination + mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch? + + dec rcx ; decrement count + jnz .nextrow ; next row + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + add rsp,16 + pop rsp +%endif + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef RD42 + + +;void vp9_mbpost_proc_down_xmm(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp9_rv) +global sym(vp9_mbpost_proc_down_xmm) +sym(vp9_mbpost_proc_down_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 128+16 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax + mov [rsp+128+8], eax + mov [rsp+128+12], eax +%define flimit4 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp9_rv))] +%endif + + ;rows +=8; + add dword arg(2), 8 + + ;for(c=0; cmode_info_context->mbmi.uv_mode; + build_intra_pred_mbuv_fn_t fn; + int src_stride = xd->dst.uv_stride; + + switch (mode) { + case V_PRED: + fn = vp9_intra_pred_uv_ve_mmx; + break; + case H_PRED: + fn = ho_fn; + break; + case TM_PRED: + fn = tm_fn; + break; + case DC_PRED: + if (xd->up_available) { + if (xd->left_available) { + fn = vp9_intra_pred_uv_dc_mmx2; + break; + } else { + fn = vp9_intra_pred_uv_dctop_mmx2; + break; + } + } else if (xd->left_available) { + fn = vp9_intra_pred_uv_dcleft_mmx2; + break; + } else { + fn = vp9_intra_pred_uv_dc128_mmx; + break; + } + break; + default: + return; + } + + fn(dst_u, dst_stride, xd->dst.u_buffer, src_stride); + fn(dst_v, dst_stride, xd->dst.v_buffer, src_stride); +} + +void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) { + build_intra_predictors_mbuv_x86(xd, &xd->predictor[256], + &xd->predictor[320], 8, + vp9_intra_pred_uv_tm_sse2, + vp9_intra_pred_uv_ho_mmx2); +} + +void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) { + build_intra_predictors_mbuv_x86(xd, &xd->predictor[256], + &xd->predictor[320], 8, + vp9_intra_pred_uv_tm_ssse3, + vp9_intra_pred_uv_ho_ssse3); +} + +void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) { + build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.uv_stride, + vp9_intra_pred_uv_tm_sse2, + vp9_intra_pred_uv_ho_mmx2); +} + +void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) { + build_intra_predictors_mbuv_x86(xd, xd->dst.u_buffer, + xd->dst.v_buffer, xd->dst.uv_stride, + vp9_intra_pred_uv_tm_ssse3, + vp9_intra_pred_uv_ho_ssse3); +} diff --git a/vp9/common/x86/vp9_sadmxn_x86.c b/vp9/common/x86/vp9_sadmxn_x86.c new file mode 100644 index 000000000..0b783ccea --- /dev/null +++ b/vp9/common/x86/vp9_sadmxn_x86.c @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include // SSE2 +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" + +#if HAVE_SSE2 +unsigned int vp9_sad16x3_sse2( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + __m128i s0, s1, s2; + __m128i r0, r1, r2; + __m128i sad; + + s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride)); + s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride)); + s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride)); + + r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride)); + r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride)); + r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride)); + + sad = _mm_sad_epu8(s0, r0); + sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1)); + sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2)); + sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8)); + + return _mm_cvtsi128_si32(sad); +} + +unsigned int vp9_sad3x16_sse2( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + int r; + __m128i s0, s1, s2, s3; + __m128i r0, r1, r2, r3; + __m128i sad = _mm_setzero_si128(); + __m128i mask; + const int offset = (uintptr_t)src_ptr & 3; + + /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off. + * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd + * takes much less time. + */ + if (offset == 1) + src_ptr -= 1; + + /* mask = 0xffffffffffff0000ffffffffffff0000 */ + mask = _mm_cmpeq_epi32(sad, sad); + mask = _mm_slli_epi64(mask, 16); + + for (r = 0; r < 16; r += 4) { + s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride)); + s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride)); + s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride)); + s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride)); + r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride)); + r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride)); + r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride)); + r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride)); + + s0 = _mm_unpacklo_epi8(s0, s1); + r0 = _mm_unpacklo_epi8(r0, r1); + s2 = _mm_unpacklo_epi8(s2, s3); + r2 = _mm_unpacklo_epi8(r2, r3); + s0 = _mm_unpacklo_epi64(s0, s2); + r0 = _mm_unpacklo_epi64(r0, r2); + + // throw out extra byte + if (offset == 1) + s0 = _mm_and_si128(s0, mask); + else + s0 = _mm_slli_epi64(s0, 16); + r0 = _mm_slli_epi64(r0, 16); + + sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0)); + + src_ptr += src_stride*4; + ref_ptr += ref_stride*4; + } + + sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8)); + return _mm_cvtsi128_si32(sad); +} + +#endif diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm new file mode 100644 index 000000000..dd89710e8 --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -0,0 +1,550 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +; +; This is an implementation of some of the SSE optimizations first seen in ffvp8 +; +;*************************************************************************************/ + +;void vp9_filter_block1d8_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_v8_ssse3) +sym(vp9_filter_block1d8_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.vp9_filter_block1d8_v8_ssse3_loop: + movq xmm0, [rsi] ;A + movq xmm1, [rsi + rdx] ;B + movq xmm2, [rsi + rdx * 2] ;C + movq xmm3, [rax + rdx * 2] ;D + movq xmm4, [rsi + rdx * 4] ;E + movq xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movq xmm6, [rsi + rbx] ;G + movq xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm2 + paddsw xmm0, krd + paddsw xmm4, xmm6 + paddsw xmm0, xmm4 + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + add rax, rdx + + movq [rdi], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp9_filter_block1d8_v8_ssse3_loop + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_v8_ssse3) +sym(vp9_filter_block1d16_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + mov rdx, arg(5) 
;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.vp9_filter_block1d16_v8_ssse3_loop: + movq xmm0, [rsi] ;A + movq xmm1, [rsi + rdx] ;B + movq xmm2, [rsi + rdx * 2] ;C + movq xmm3, [rax + rdx * 2] ;D + movq xmm4, [rsi + rdx * 4] ;E + movq xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movq xmm6, [rsi + rbx] ;G + movq xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm2 + paddsw xmm0, krd + paddsw xmm4, xmm6 + paddsw xmm0, xmm4 + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + movq [rdi], xmm0 + + movq xmm0, [rsi + 8] ;A + movq xmm1, [rsi + rdx + 8] ;B + movq xmm2, [rsi + rdx * 2 + 8] ;C + movq xmm3, [rax + rdx * 2 + 8] ;D + movq xmm4, [rsi + rdx * 4 + 8] ;E + movq xmm5, [rax + rdx * 4 + 8] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + + movq xmm6, [rsi + rbx + 8] ;G + movq xmm7, [rax + rbx + 8] ;H + punpcklbw xmm6, xmm7 ;G H + + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm2 + paddsw xmm4, xmm6 + paddsw xmm0, krd + paddsw xmm0, xmm4 + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + add rax, rdx + + movq [rdi+8], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp9_filter_block1d16_v8_ssse3_loop + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_h8_ssse3) +sym(vp9_filter_block1d8_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 +; movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword 
ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + +.filter_block1d8_h8_rowloop_ssse3: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + +; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 +;note: if we create a k0_k7 filter, we can save a pshufb +; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm2 + paddsw xmm0, xmm5 + paddsw xmm0, xmm4 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + lea rsi, [rsi + rax] + movq [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d8_h8_rowloop_ssse3 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_h8_ssse3) +sym(vp9_filter_block1d16_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + +.filter_block1d16_h8_rowloop_ssse3: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + +; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 +;note: if we create a k0_k7 filter, we can save a pshufb +; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd + psraw xmm0, 7 + packuswb xmm0, xmm0 + + + movq xmm3, [rsi + 5] +; movq xmm7, [rsi + 12] + movq xmm7, [rsi + 13] +;note: same as above +; punpcklbw xmm3, xmm7 + punpcklqdq xmm3, xmm7 + + movdqa xmm1, xmm3 + pshufb xmm3, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm3, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm3, xmm1 
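+ ; a scalar sketch of the accumulation for pixels 8..15 below: with krd
+ ; holding 64 in each 16-bit lane, every output pixel is effectively
+ ;   out[x] = clip8((k0k1[x] + k2k3[x] + k4k5[x] + k6k7[x] + 64) >> 7)
+ ; where psraw performs the >> 7 and packuswb the clip to [0, 255]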
+ paddsw xmm3, xmm2 + paddsw xmm3, krd + paddsw xmm3, xmm4 + psraw xmm3, 7 + packuswb xmm3, xmm3 + punpcklqdq xmm0, xmm3 + + lea rsi, [rsi + rax] + movdqa [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d16_h8_rowloop_ssse3 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +shuf_t0t1: + db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +align 16 +shuf_t2t3: + db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +align 16 +shuf_t4t5: + db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +align 16 +shuf_t6t7: + db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm new file mode 100644 index 000000000..2f757fa80 --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_mmx.asm @@ -0,0 +1,727 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + + +%define BLOCK_HEIGHT_WIDTH 4 +%define vp9_filter_weight 128 +%define VP9_FILTER_SHIFT 7 + + +;void vp9_filter_block1d_h6_mmx +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp9_filter +;) +global sym(vp9_filter_block1d_h6_mmx) +sym(vp9_filter_block1d_h6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp9_filter + + movq mm1, [rdx + 16] ; do both the negative taps first!!! + movq mm2, [rdx + 32] ; + movq mm6, [rdx + 48] ; + movq mm7, [rdx + 64] ; + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + +.nextrow: + movq mm3, [rsi-2] ; mm3 = p-2..p5 + movq mm4, mm3 ; mm4 = p-2..p5 + psrlq mm3, 8 ; mm3 = p-1..p5 + punpcklbw mm3, mm0 ; mm3 = p-1..p2 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. 
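+ ; the remaining taps below follow the same idea: realign the packed row
+ ; to the next tap position (psrlq / punpckhbw), widen four pixels to
+ ; 16 bits against the zeroed mm0, multiply by that tap's replicated
+ ; weight, and accumulate with signed saturation (paddsw)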
+ + movq mm5, mm4 ; mm5 = p-2..p5 + punpckhbw mm4, mm0 ; mm5 = p2..p5 + pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + movq mm4, mm5 ; mm4 = p-2..p5; + psrlq mm5, 16 ; mm5 = p0..p5; + punpcklbw mm5, mm0 ; mm5 = p0..p3 + pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + movq mm5, mm4 ; mm5 = p-2..p5 + psrlq mm4, 24 ; mm4 = p1..p5 + punpcklbw mm4, mm0 ; mm4 = p1..p4 + pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + ; do outer positive taps + movd mm4, [rsi+3] + punpcklbw mm4, mm0 ; mm5 = p3..p6 + pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers + paddsw mm3, mm4 ; mm3 += mm5 + + punpcklbw mm5, mm0 ; mm5 = p-2..p1 + pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers + paddsw mm3, mm5 ; mm3 += mm5 + + paddsw mm3, [GLOBAL(rd)] ; mm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and unpack to saturate + punpcklbw mm3, mm0 ; + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line + add rdi, rax; +%else + movsxd r8, dword ptr arg(2) ;src_pixels_per_line + add rdi, rax; + + add rsi, r8 ; next line +%endif + + dec rcx ; decrement count + jnz .nextrow ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_filter_block1dc_v6_mmx +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int output_pitch, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp9_filter +;) +global sym(vp9_filter_block1dc_v6_mmx) +sym(vp9_filter_block1dc_v6_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movq mm5, [GLOBAL(rd)] + push rbx + mov rbx, arg(7) ;vp9_filter + movq mm1, [rbx + 16] ; do both the negative taps first!!! + movq mm2, [rbx + 32] ; + movq mm6, [rbx + 48] ; + movq mm7, [rbx + 64] ; + + movsxd rdx, dword ptr arg(3) ;pixels_per_line + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + sub rsi, rdx + sub rsi, rdx + movsxd rcx, DWORD PTR arg(5) ;output_height + movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + + +.nextrow_cv: + movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. + + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 + pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 + pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi] ; mm4 = p0..p3 = row -2 + pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + add rsi, rdx ; move source forward 1 line to avoid 3 * pitch + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 + pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 + pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. + paddsw mm3, mm4 ; mm3 += mm4 + + + paddsw mm3, mm5 ; mm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 + packuswb mm3, mm0 ; pack and saturate + + movd [rdi],mm3 ; store the results in the destination + ; the subsequent iterations repeat 3 out of 4 of these reads. Since the + ; recon block should be in cache this shouldn't cost much. Its obviously + ; avoidable!!!. 
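+ ; a scalar model of one output pixel of this vertical 6-tap pass (a
+ ; sketch; src is the 16-bit first-pass output and tap[] is the 6-entry
+ ; filter row the caller selects, e.g. from vp9_six_tap_mmx below):
+ ;   out[c] = clip8((sum_{i=0..5} tap[i] * src[c + (i-2)*pitch] + 64) >> 7)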
+ lea rdi, [rdi+rax] ; + dec rcx ; decrement count + jnz .nextrow_cv ; next row + + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x8_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp9_bilinear_predict8x8_mmx) +sym(vp9_bilinear_predict8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = bilinear_filters_mmx[xoffset]; + ;const short *VFilter = bilinear_filters_mmx[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + shl rax, 5 ; offset * 32 + lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))] + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + + shl rax, 5 ; offset*32 + add rax, rcx ; VFilter + + lea rcx, [rdi+rdx*8] ; + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP9_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +.next_row_8x8: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, [rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP9_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP9_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 ;dst_pitch +%endif + cmp rdi, rcx ; + jne .next_row_8x8 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict8x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp9_bilinear_predict8x4_mmx) +sym(vp9_bilinear_predict8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end 
prolog + + ;const short *HFilter = bilinear_filters_mmx[xoffset]; + ;const short *VFilter = bilinear_filters_mmx[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))] + shl rax, 5 + + mov rsi, arg(0) ;src_ptr ; + add rax, rcx + + movsxd rdx, dword ptr arg(5) ;dst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; get the first horizontal line done ; + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP9_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + add rsi, rdx ; next line +.next_row_8x4: + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movq mm4, mm3 ; make a copy of current line + + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + punpckhbw mm4, mm0 ; + + pmullw mm3, mm1 ; + pmullw mm4, mm1 ; + + movq mm5, [rsi+1] ; + movq mm6, mm5 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 ; + + pmullw mm5, mm2 ; + pmullw mm6, mm2 ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + movq mm5, mm7 ; + movq mm6, mm7 ; + + punpcklbw mm5, mm0 ; + punpckhbw mm6, mm0 + + pmullw mm5, [rax] ; + pmullw mm6, [rax] ; + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP9_FILTER_SHIFT ; + + movq mm7, mm3 ; + packuswb mm7, mm4 ; + + + pmullw mm3, [rax+16] ; + pmullw mm4, [rax+16] ; + + paddw mm3, mm5 ; + paddw mm4, mm6 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw mm4, [GLOBAL(rd)] ; + psraw mm4, VP9_FILTER_SHIFT ; + + packuswb mm3, mm4 + + movq [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch + add rsi, rdx ; next line + add rdi, r8 +%endif + cmp rdi, rcx ; + jne .next_row_8x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void bilinear_predict4x4_mmx +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp9_bilinear_predict4x4_mmx) +sym(vp9_bilinear_predict4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = bilinear_filters_mmx[xoffset]; + ;const short *VFilter = bilinear_filters_mmx[yoffset]; + + movsxd rax, dword ptr arg(2) ;xoffset + mov rdi, arg(4) ;dst_ptr ; + + lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))] + shl rax, 5 + + add rax, rcx ; HFilter + mov rsi, arg(0) ;src_ptr ; + + movsxd rdx, dword ptr arg(5) ;ldst_pitch + movq mm1, [rax] ; + + movq mm2, [rax+16] ; + movsxd rax, dword ptr arg(3) ;yoffset + + pxor mm0, mm0 ; + shl rax, 5 + + add rax, rcx + lea rcx, [rdi+rdx*4] ; + + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; + + ; 
get the first horizontal line done ; + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + movq mm7, mm3 ; + packuswb mm7, mm0 ; + + add rsi, rdx ; next line +.next_row_4x4: + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 + + pmullw mm3, mm1 ; + movd mm5, [rsi+1] ; + + punpcklbw mm5, mm0 ; + pmullw mm5, mm2 ; + + paddw mm3, mm5 ; + + movq mm5, mm7 ; + punpcklbw mm5, mm0 ; + + pmullw mm5, [rax] ; + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + movq mm7, mm3 ; + + packuswb mm7, mm0 ; + + pmullw mm3, [rax+16] ; + paddw mm3, mm5 ; + + + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + packuswb mm3, mm0 + movd [rdi], mm3 ; store the results in the destination + +%if ABI_IS_32BIT + add rsi, rdx ; next line + add rdi, dword ptr arg(5) ;dst_pitch ; +%else + movsxd r8, dword ptr arg(5) ;dst_pitch ; + add rsi, rdx ; next line + add rdi, r8 +%endif + + cmp rdi, rcx ; + jne .next_row_4x4 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + + +SECTION_RODATA +align 16 +rd: + times 4 dw 0x40 + +align 16 +global HIDDEN_DATA(sym(vp9_six_tap_mmx)) +sym(vp9_six_tap_mmx): + times 8 dw 0 + times 8 dw 0 + times 8 dw 128 + times 8 dw 0 + times 8 dw 0 + times 8 dw 0 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 123 + times 8 dw 12 + times 8 dw -1 + times 8 dw 0 + + times 8 dw 2 + times 8 dw -11 + times 8 dw 108 + times 8 dw 36 + times 8 dw -8 + times 8 dw 1 + + times 8 dw 0 + times 8 dw -9 + times 8 dw 93 + times 8 dw 50 + times 8 dw -6 + times 8 dw 0 + + times 8 dw 3 + times 8 dw -16 + times 8 dw 77 + times 8 dw 77 + times 8 dw -16 + times 8 dw 3 + + times 8 dw 0 + times 8 dw -6 + times 8 dw 50 + times 8 dw 93 + times 8 dw -9 + times 8 dw 0 + + times 8 dw 1 + times 8 dw -8 + times 8 dw 36 + times 8 dw 108 + times 8 dw -11 + times 8 dw 2 + + times 8 dw 0 + times 8 dw -1 + times 8 dw 12 + times 8 dw 123 + times 8 dw -6 + times 8 dw 0 + + +align 16 +global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx)) +sym(vp9_bilinear_filters_8x_mmx): + times 8 dw 128 + times 8 dw 0 + + times 8 dw 112 + times 8 dw 16 + + times 8 dw 96 + times 8 dw 32 + + times 8 dw 80 + times 8 dw 48 + + times 8 dw 64 + times 8 dw 64 + + times 8 dw 48 + times 8 dw 80 + + times 8 dw 32 + times 8 dw 96 + + times 8 dw 16 + times 8 dw 112 diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm new file mode 100644 index 000000000..f62587406 --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_sse2.asm @@ -0,0 +1,1372 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +%define BLOCK_HEIGHT_WIDTH 4 +%define VP9_FILTER_WEIGHT 128 +%define VP9_FILTER_SHIFT 7 + + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +;void vp9_filter_block1d8_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp9_filter +;) +global sym(vp9_filter_block1d8_h6_sse2) +sym(vp9_filter_block1d8_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp9_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_filter_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short *vp9_filter +;) 
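+; Note: this horizontal pass rounds, clips each result to 8 bits
+; (packuswb), then stores it zero-extended to 16 bits (output_ptr is
+; unsigned short *), so the vertical 6-tap pass that follows can multiply
+; the intermediate rows directly with pmullw.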
+;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +;*************************************************************************************/ +global sym(vp9_filter_block1d16_h6_sse2) +sym(vp9_filter_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(6) ;vp9_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;output_width +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm4 + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 
xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + punpcklbw xmm4, xmm0 + + movdqa XMMWORD Ptr [rdi+16], xmm4 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(5) ;[output_width] +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_filter_block1d8_v6_sse2 +;( +; short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; short * vp9_filter +;) +;/************************************************************************************ +; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. +;*************************************************************************************/ +global sym(vp9_filter_block1d8_v6_sse2) +sym(vp9_filter_block1d8_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp9_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp9_filter_block1d8_v6_sse2_loop: + movdqa xmm1, XMMWORD PTR [rsi] + pmullw xmm1, [rax] + + movdqa xmm2, XMMWORD PTR [rsi + rdx] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] + pmullw xmm3, [rax + 32] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] + pmullw xmm5, [rax + 64] + + add rsi, rdx + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] + + pmullw xmm4, [rax + 48] + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] + + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR [rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp9_filter_block1d8_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_filter_block1d16_v6_sse2 +;( +; unsigned short *src_ptr, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int pixels_per_line, +; unsigned int pixel_step, +; unsigned int output_height, +; unsigned int output_width, +; const short *vp9_filter +;) +;/************************************************************************************ +; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The +; input pixel array has output_height rows. 
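+; The adds in the loop are ordered 2 5 3 1 4 6 rather than 1..6,
+; presumably so the saturating paddsw accumulations pair positive and
+; negative tap products and keep the running totals inside the signed
+; 16-bit range (an assumption about intent, based on the tap magnitudes).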
+;*************************************************************************************/ +global sym(vp9_filter_block1d16_v6_sse2) +sym(vp9_filter_block1d16_v6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rax, arg(7) ;vp9_filter + movsxd rdx, dword ptr arg(3) ;pixels_per_line + + mov rdi, arg(1) ;output_ptr + mov rsi, arg(0) ;src_ptr + + sub rsi, rdx + sub rsi, rdx + + movsxd rcx, DWORD PTR arg(5) ;[output_height] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(2) ; dst_ptich +%endif + +.vp9_filter_block1d16_v6_sse2_loop: +; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. + movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 + movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] + pmullw xmm1, [rax + 16] + pmullw xmm2, [rax + 16] + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm3, [rax + 64] + pmullw xmm4, [rax + 64] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm5, [rax + 32] + pmullw xmm6, [rax + 32] + + movdqa xmm7, XMMWORD PTR [rsi] ; line 1 + movdqa xmm0, XMMWORD PTR [rsi + 16] + pmullw xmm7, [rax] + pmullw xmm0, [rax] + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + paddsw xmm1, xmm7 + paddsw xmm2, xmm0 + + add rsi, rdx + + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] + pmullw xmm3, [rax + 48] + pmullw xmm4, [rax + 48] + + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] + pmullw xmm5, [rax + 80] + pmullw xmm6, [rax + 80] + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] + pxor xmm0, xmm0 ; clear xmm0 + + paddsw xmm1, xmm3 + paddsw xmm2, xmm4 + paddsw xmm1, xmm5 + paddsw xmm2, xmm6 + + paddsw xmm1, xmm7 + paddsw xmm2, xmm7 + + psraw xmm1, 7 + psraw xmm2, 7 + + packuswb xmm1, xmm2 ; pack and saturate + movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(2) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp9_filter_block1d16_v6_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_filter_block1d8_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp9_filter +;) +; First-pass filter only when yoffset==0 +global sym(vp9_filter_block1d8_h6_only_sse2) +sym(vp9_filter_block1d8_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp9_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d8_h6_only_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + 
punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 + + movq QWORD PTR [rdi], xmm4 ; store the results in the destination + lea rsi, [rsi + rax] + +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + dec rcx + + jnz .filter_block1d8_h6_only_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_filter_block1d16_h6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp9_filter +;) +; First-pass filter only when yoffset==0 +global sym(vp9_filter_block1d16_h6_only_sse2) +sym(vp9_filter_block1d16_h6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rdx, arg(5) ;vp9_filter + mov rsi, arg(0) ;src_ptr + + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ;dst_ptich +%endif + + pxor xmm0, xmm0 ; clear xmm0 for unpack + +.filter_block1d16_h6_only_sse2_rowloop: + movq xmm3, MMWORD PTR [rsi - 2] + movq xmm1, MMWORD PTR [rsi + 6] + + movq xmm2, MMWORD PTR [rsi +14] + pslldq xmm2, 8 + + por xmm2, xmm1 + prefetcht2 [rsi+rax-2] + + pslldq xmm1, 8 + por xmm1, xmm3 + + movdqa xmm4, xmm1 + movdqa xmm5, xmm1 + + movdqa xmm6, xmm1 + movdqa xmm7, xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + 
paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm1 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; lower 8 bytes + + movq QWORD Ptr [rdi], xmm4 ; store the results in the destination + + movdqa xmm3, xmm2 + movdqa xmm4, xmm2 + + movdqa xmm5, xmm2 + movdqa xmm6, xmm2 + + movdqa xmm7, xmm2 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 + + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 + + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 + + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 + + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 + + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 + + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 + + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 + + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 + + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 + + paddsw xmm4, xmm7 + paddsw xmm4, xmm5 + + paddsw xmm4, xmm3 + paddsw xmm4, xmm6 + + paddsw xmm4, xmm2 + paddsw xmm4, [GLOBAL(rd)] + + psraw xmm4, 7 + + packuswb xmm4, xmm0 ; higher 8 bytes + + movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(3) ;dst_ptich +%else + add rdi, r8 +%endif + + dec rcx + jnz .filter_block1d16_h6_only_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_filter_block1d8_v6_only_sse2 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; int dst_ptich, +; unsigned int output_height, +; const short *vp9_filter +;) +; Second-pass filter only when xoffset==0 +global sym(vp9_filter_block1d8_v6_only_sse2) +sym(vp9_filter_block1d8_v6_only_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + mov rax, arg(5) ;vp9_filter + + pxor xmm0, xmm0 ; clear xmm0 + + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(3) ; dst_ptich +%endif + +.vp9_filter_block1d8_v6_only_sse2_loop: + movq xmm1, MMWORD PTR [rsi] + movq xmm2, MMWORD PTR [rsi + rdx] + movq xmm3, MMWORD PTR [rsi + rdx * 2] + movq xmm5, MMWORD PTR [rsi + rdx * 4] + add rsi, rdx + movq xmm4, MMWORD PTR [rsi + rdx * 2] + movq xmm6, MMWORD PTR [rsi + rdx * 4] + + punpcklbw xmm1, xmm0 + pmullw xmm1, [rax] + + punpcklbw xmm2, xmm0 + pmullw xmm2, [rax + 16] + + punpcklbw xmm3, xmm0 + pmullw xmm3, [rax + 32] + + punpcklbw xmm5, xmm0 + pmullw xmm5, [rax + 64] + + punpcklbw xmm4, xmm0 + pmullw xmm4, [rax + 48] + + punpcklbw xmm6, xmm0 + pmullw xmm6, [rax + 80] + + paddsw xmm2, xmm5 + paddsw xmm2, xmm3 + + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + + paddsw xmm2, xmm6 + paddsw xmm2, xmm7 + + psraw xmm2, 7 + packuswb xmm2, xmm0 ; pack and saturate + + movq QWORD PTR 
[rdi], xmm2 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[dst_ptich] +%else + add rdi, r8 +%endif + dec rcx ; decrement count + jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_unpack_block1d16_h6_sse2 +;( +; unsigned char *src_ptr, +; unsigned short *output_ptr, +; unsigned int src_pixels_per_line, +; unsigned int output_height, +; unsigned int output_width +;) +global sym(vp9_unpack_block1d16_h6_sse2) +sym(vp9_unpack_block1d16_h6_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;output_ptr + + movsxd rcx, dword ptr arg(3) ;output_height + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source + + pxor xmm0, xmm0 ; clear xmm0 for unpack +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source +%endif + +.unpack_block1d16_h6_sse2_rowloop: + movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 + movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 + + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 + punpcklbw xmm1, xmm0 + + movdqa XMMWORD Ptr [rdi], xmm1 + movdqa XMMWORD Ptr [rdi + 16], xmm3 + + lea rsi, [rsi + rax] +%if ABI_IS_32BIT + add rdi, DWORD Ptr arg(4) ;[output_width] +%else + add rdi, r8 +%endif + dec rcx + jnz .unpack_block1d16_h6_sse2_rowloop ; next row + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_bilinear_predict16x16_sse2 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +extern sym(vp9_bilinear_filters_mmx) +global sym(vp9_bilinear_predict16x16_sse2) +sym(vp9_bilinear_predict16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ;const short *HFilter = bilinear_filters_mmx[xoffset] + ;const short *VFilter = bilinear_filters_mmx[yoffset] + + lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] + movsxd rax, dword ptr arg(2) ;xoffset + + cmp rax, 0 ;skip first_pass filter if xoffset=0 + je .b16x16_sp_only + + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + + cmp rax, 0 ;skip second_pass filter if yoffset=0 + je .b16x16_fp_only + + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ;dst_pitch +%endif + ; get the first horizontal line done + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP9_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + add rsi, rdx ; next line +.next_row: + movdqu xmm3, [rsi] ; 
xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, [rax] + pmullw xmm6, [rax] + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP9_FILTER_SHIFT + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 + + pmullw xmm3, [rax+16] + pmullw xmm4, [rax+16] + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP9_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rdx ; next line +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ;dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne .next_row + + jmp .done + +.b16x16_sp_only: + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + mov rdi, arg(4) ;dst_ptr + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + + pxor xmm0, xmm0 + + ; get the first horizontal line done + movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + add rsi, rax ; next line +.next_row_spo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + + movdqa xmm5, xmm7 + movdqa xmm6, xmm7 + + movdqa xmm4, xmm3 ; make a copy of current line + movdqa xmm7, xmm3 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm5, xmm1 + pmullw xmm6, xmm1 + pmullw xmm3, xmm2 + pmullw xmm4, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP9_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ;dst_pitch + cmp rdi, rcx + jne .next_row_spo + + jmp .done + +.b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + pxor xmm0, xmm0 + +.next_row_fpo: + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 + movdqa xmm4, xmm3 ; make a copy of current line + + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 + punpckhbw xmm4, xmm0 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm1 + + movdqu xmm5, [rsi+1] + movdqa xmm6, xmm5 + + punpcklbw xmm5, xmm0 + punpckhbw xmm6, xmm0 + + pmullw xmm5, xmm2 + pmullw xmm6, xmm2 + + paddw xmm3, xmm5 + paddw xmm4, xmm6 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] + psraw xmm4, VP9_FILTER_SHIFT + + packuswb xmm3, xmm4 + movdqa [rdi], xmm3 ; store the results in the destination + + add rsi, rax ; next line + add rdi, rdx ; dst_pitch + cmp rdi, rcx + jne .next_row_fpo + +.done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_bilinear_predict8x8_sse2 +;( 
+; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +extern sym(vp9_bilinear_filters_mmx) +global sym(vp9_bilinear_predict8x8_sse2) +sym(vp9_bilinear_predict8x8_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + ;const short *HFilter = bilinear_filters_mmx[xoffset] + ;const short *VFilter = bilinear_filters_mmx[yoffset] + lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. + movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ;xoffset + shl rax, 5 + add rax, rcx ;HFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ;dst_pitch + + movdqa xmm1, [rax] + movdqa xmm2, [rax+16] + + movsxd rax, dword ptr arg(3) ;yoffset + shl rax, 5 + add rax, rcx ;VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm5, [rax] + movdqa xmm6, [rax+16] + + pxor xmm0, xmm0 + + ; get the first horizontal line done + movdqa xmm3, XMMWORD PTR [rsp] + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + add rsp, 16 ; next line +.next_row8x8: + movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm4, xmm3 ; make a copy of current line + psrldq xmm4, 1 + + punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 + punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 + + pmullw xmm3, xmm1 + pmullw xmm4, xmm2 + + paddw xmm3, xmm4 + pmullw xmm7, xmm5 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm4, xmm3 + + pmullw xmm3, xmm6 + paddw xmm3, xmm7 + + movdqa xmm7, xmm4 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + packuswb xmm3, xmm0 + movq [rdi], xmm3 ; store the results in the destination + + add rsp, 16 ; next line + add rdi, rdx + + cmp rdi, rcx + jne .next_row8x8 + + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd: + times 8 dw 0x40 diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm new file mode 100644 index 000000000..4a16f1928 --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_ssse3.asm @@ -0,0 +1,1515 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define BLOCK_HEIGHT_WIDTH 4
+%define VP9_FILTER_WEIGHT 128
+%define VP9_FILTER_SHIFT  7
+
+
+;/************************************************************************************
+; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
+; input pixel array has output_height rows. This routine assumes that output_height is an
+; even number. This function handles 8 pixels in the horizontal direction, calculating one
+; row per iteration to take advantage of the 128-bit operations.
+;
+; This is an implementation of some of the SSE optimizations first seen in ffvp8
+;
+;*************************************************************************************/
+;void vp9_filter_block1d8_h6_ssse3
+;(
+;    unsigned char  *src_ptr,
+;    unsigned int    src_pixels_per_line,
+;    unsigned char  *output_ptr,
+;    unsigned int    output_pitch,
+;    unsigned int    output_height,
+;    unsigned int    vp9_filter_index
+;)
+global sym(vp9_filter_block1d8_h6_ssse3)
+sym(vp9_filter_block1d8_h6_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movsxd      rdx, DWORD PTR arg(5)   ;table index
+    xor         rsi, rsi
+    shl         rdx, 4
+
+    movdqa      xmm7, [GLOBAL(rd)]
+
+    lea         rax, [GLOBAL(k0_k5)]
+    add         rax, rdx
+    mov         rdi, arg(2)             ;output_ptr
+
+    cmp         esi, DWORD PTR [rax]
+    je          vp9_filter_block1d8_h4_ssse3
+
+    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    mov         rsi, arg(0)             ;src_ptr
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx
+;xmm3 free
+.filter_block1d8_h6_rowloop_ssse3:
+    movq        xmm0, MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm2, MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0, xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm1, xmm0
+    pmaddubsw   xmm0, xmm4
+
+    movdqa      xmm2, xmm1
+    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
+
+    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
+    pmaddubsw   xmm1, xmm5
+
+    lea         rdi, [rdi + rdx]
+    pmaddubsw   xmm2, xmm6
+
+    lea         rsi, [rsi + rax]
+    dec         rcx
+
+    paddsw      xmm0, xmm1
+    paddsw      xmm2, xmm7
+
+    paddsw      xmm0, xmm2
+
+    psraw       xmm0, 7
+
+    packuswb    xmm0, xmm0
+
+    movq        MMWORD Ptr [rdi], xmm0
+    jnz         .filter_block1d8_h6_rowloop_ssse3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+vp9_filter_block1d8_h4_ssse3:
+    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
+    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
+
+    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
+    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
+
+    mov         rsi, arg(0)             ;src_ptr
+
+    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
+    movsxd      rcx, dword ptr arg(4)   ;output_height
+
+    movsxd      rdx, dword ptr arg(3)   ;output_pitch
+
+    sub         rdi, rdx
+
+.filter_block1d8_h4_rowloop_ssse3:
+    movq        xmm0, MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
+
+    movq        xmm1, MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
+
+    punpcklbw   xmm0, xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
+
+    movdqa      xmm2, xmm0
+    pshufb      xmm0, xmm3
+
+    pshufb      xmm2, xmm4
+    pmaddubsw   xmm0, xmm5
+
+    lea         rdi, [rdi + rdx]
+    pmaddubsw
xmm2, xmm6 + + lea rsi, [rsi + rax] + dec rcx + + paddsw xmm0, xmm7 + + paddsw xmm0, xmm2 + + psraw xmm0, 7 + + packuswb xmm0, xmm0 + + movq MMWORD Ptr [rdi], xmm0 + + jnz .filter_block1d8_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp9_filter_block1d16_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp9_filter_index +;) +global sym(vp9_filter_block1d16_h6_ssse3) +sym(vp9_filter_block1d16_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + mov rdi, arg(2) ;output_ptr + + mov rsi, arg(0) ;src_ptr + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d16_h6_rowloop_ssse3: + movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 + + movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 + + punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 + + movdqa xmm1, xmm0 + pmaddubsw xmm0, xmm4 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + movq xmm3, MMWORD PTR [rsi + 6] + + pmaddubsw xmm1, xmm5 + movq xmm7, MMWORD PTR [rsi + 11] + + pmaddubsw xmm2, xmm6 + punpcklbw xmm3, xmm7 + + paddsw xmm0, xmm1 + movdqa xmm1, xmm3 + + pmaddubsw xmm3, xmm4 + paddsw xmm0, xmm2 + + movdqa xmm2, xmm1 + paddsw xmm0, [GLOBAL(rd)] + + pshufb xmm1, [GLOBAL(shuf2bfrom1)] + pshufb xmm2, [GLOBAL(shuf3bfrom1)] + + psraw xmm0, 7 + pmaddubsw xmm1, xmm5 + + pmaddubsw xmm2, xmm6 + packuswb xmm0, xmm0 + + lea rsi, [rsi + rax] + paddsw xmm3, xmm1 + + paddsw xmm3, xmm2 + + paddsw xmm3, [GLOBAL(rd)] + + psraw xmm3, 7 + + packuswb xmm3, xmm3 + + punpcklqdq xmm0, xmm3 + + movdqa XMMWORD Ptr [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d16_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d4_h6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; unsigned int vp9_filter_index +;) +global sym(vp9_filter_block1d4_h6_ssse3) +sym(vp9_filter_block1d4_h6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + movdqa xmm7, [GLOBAL(rd)] + + cmp esi, DWORD PTR [rax] + je .vp9_filter_block1d4_h4_ssse3 + + movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +;xmm3 free +.filter_block1d4_h6_rowloop_ssse3: + movdqu xmm0, XMMWORD PTR [rsi - 2] + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf1b)] + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf2b)] + pmaddubsw xmm0, xmm4 + 
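    ; note: the shuf1b/shuf2b/shuf3b patterns pair each output pixel's
+    ; source bytes as (x-2, x+3), (x, x+2) and (x-1, x+1), so the three
+    ; pmaddubsw ops against the packed (k0,k5), (k2,k4), (k1,k3) tables
+    ; each add two taps of the 6-tap sum per 16-bit lane; per output pixel
+    ; this computes, roughly,
+    ;   out[x] = sat8((k0*s[x-2] + k1*s[x-1] + k2*s[x] + k3*s[x+1] +
+    ;                  k4*s[x+2] + k5*s[x+3] + 64) >> 7)
+    ; (an informal sketch read off the k0_k5/k1_k3/k2_k4 tables at the end
+    ; of this file, with sat8 the 0..255 saturation done by packuswb)
+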
pshufb xmm2, [GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm0, xmm1 + paddsw xmm0, xmm7 + pxor xmm1, xmm1 + paddsw xmm0, xmm2 + psraw xmm0, 7 + packuswb xmm0, xmm0 + + movd DWORD PTR [rdi], xmm0 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h6_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +.vp9_filter_block1d4_h4_ssse3: + movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] + movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rcx, dword ptr arg(4) ;output_height + + movsxd rdx, dword ptr arg(3) ;output_pitch + +.filter_block1d4_h4_rowloop_ssse3: + movdqu xmm1, XMMWORD PTR [rsi - 2] + + movdqa xmm2, xmm1 + pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] + pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] + pmaddubsw xmm1, xmm5 + +;-- + pmaddubsw xmm2, xmm6 + + lea rsi, [rsi + rax] +;-- + paddsw xmm1, xmm7 + paddsw xmm1, xmm2 + psraw xmm1, 7 + packuswb xmm1, xmm1 + + movd DWORD PTR [rdi], xmm1 + + add rdi, rdx + dec rcx + jnz .filter_block1d4_h4_rowloop_ssse3 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + + +;void vp9_filter_block1d16_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp9_filter_index +;) +global sym(vp9_filter_block1d16_v6_ssse3) +sym(vp9_filter_block1d16_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + cmp esi, DWORD PTR [rax] + je .vp9_filter_block1d16_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + +.vp9_filter_block1d16_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 ;store the results + + movq xmm1, MMWORD PTR [rsi + 8] ;A + movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, [GLOBAL(rd)] + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR 
[rdi+8], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp9_filter_block1d16_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp9_filter_block1d16_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + +.vp9_filter_block1d16_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B + movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E + + paddsw xmm2, [GLOBAL(rd)] + paddsw xmm2, xmm3 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + punpcklbw xmm5, xmm4 ;B D + punpcklbw xmm1, xmm0 ;C E + + pmaddubsw xmm1, xmm6 + pmaddubsw xmm5, xmm7 + + movdqa xmm4, [GLOBAL(rd)] + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm5, xmm1 + paddsw xmm5, xmm4 + psraw xmm5, 7 + packuswb xmm5, xmm5 + + punpcklqdq xmm2, xmm5 + + movdqa XMMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp9_filter_block1d16_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp9_filter_index +;) +global sym(vp9_filter_block1d8_v6_ssse3) +sym(vp9_filter_block1d8_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp9_filter_block1d8_v4_ssse3 + + movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp9_filter_block1d8_v6_ssse3_loop: + movq xmm1, MMWORD PTR [rsi] ;A + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + movq xmm0, MMWORD PTR [rax + rdx * 4] ;F + movdqa xmm4, [GLOBAL(rd)] + + pmaddubsw xmm3, xmm6 + punpcklbw xmm1, xmm0 ;A F + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm1 + paddsw xmm2, xmm4 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp9_filter_block1d8_v6_ssse3_loop + + ; begin 
epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +.vp9_filter_block1d8_v4_ssse3: + movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 + movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 + movdqa xmm5, [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp9_filter_block1d8_v4_ssse3_loop: + movq xmm2, MMWORD PTR [rsi + rdx] ;B + movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C + movq xmm4, MMWORD PTR [rax + rdx * 2] ;D + movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E + + punpcklbw xmm2, xmm4 ;B D + punpcklbw xmm3, xmm0 ;C E + + pmaddubsw xmm3, xmm6 + pmaddubsw xmm2, xmm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw xmm2, xmm3 + paddsw xmm2, xmm5 + psraw xmm2, 7 + packuswb xmm2, xmm2 + + movq MMWORD PTR [rdi], xmm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp9_filter_block1d8_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +;void vp9_filter_block1d4_v6_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; unsigned int vp9_filter_index +;) +global sym(vp9_filter_block1d4_v6_ssse3) +sym(vp9_filter_block1d4_v6_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + movsxd rdx, DWORD PTR arg(5) ;table index + xor rsi, rsi + shl rdx, 4 ; + + lea rax, [GLOBAL(k0_k5)] + add rax, rdx + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + mov rdi, arg(2) ;output_ptr +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ; out_pitch +%endif + movsxd rcx, DWORD PTR arg(4) ;[output_height] + + cmp esi, DWORD PTR [rax] + je .vp9_filter_block1d4_v4_ssse3 + + movq mm5, MMWORD PTR [rax] ;k0_k5 + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp9_filter_block1d4_v6_ssse3_loop: + movd mm1, DWORD PTR [rsi] ;A + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + movd mm0, DWORD PTR [rax + rdx * 4] ;F + + movq mm4, [GLOBAL(rd)] + + pmaddubsw mm3, mm6 + punpcklbw mm1, mm0 ;A F + pmaddubsw mm2, mm7 + pmaddubsw mm1, mm5 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm1 + paddsw mm2, mm4 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz .vp9_filter_block1d4_v6_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +.vp9_filter_block1d4_v4_ssse3: + movq mm6, MMWORD PTR [rax+256] ;k2_k4 + movq mm7, MMWORD PTR [rax+128] ;k1_k3 + movq mm5, MMWORD PTR [GLOBAL(rd)] + + mov rsi, arg(0) ;src_ptr + + mov rax, rsi + add rax, rdx + +.vp9_filter_block1d4_v4_ssse3_loop: + movd mm2, DWORD PTR [rsi + rdx] ;B + movd mm3, DWORD PTR [rsi + rdx * 2] ;C + movd mm4, DWORD PTR [rax + rdx * 2] ;D + movd mm0, DWORD PTR [rsi + rdx * 4] ;E + + punpcklbw mm2, mm4 ;B D + punpcklbw mm3, mm0 ;C E + + pmaddubsw mm3, mm6 + pmaddubsw mm2, mm7 + add rsi, rdx + add rax, rdx +;-- +;-- + paddsw mm2, mm3 + paddsw mm2, mm5 + psraw mm2, 7 + packuswb mm2, mm2 + + movd DWORD PTR [rdi], mm2 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;[out_pitch] +%else + add rdi, r8 +%endif + dec rcx + jnz 
.vp9_filter_block1d4_v4_ssse3_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_bilinear_predict16x16_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp9_bilinear_predict16x16_ssse3) +sym(vp9_bilinear_predict16x16_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + lea rcx, [GLOBAL(bilinear_filters_ssse3)] + movsxd rax, dword ptr arg(2) ; xoffset + + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .b16x16_sp_only + + shl rax, 4 + lea rax, [rax + rcx] ; HFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b16x16_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rdx, dword ptr arg(1) ; src_pixels_per_line + + movdqa xmm2, [rax] + +%if ABI_IS_32BIT=0 + movsxd r8, dword ptr arg(5) ; dst_pitch +%endif + movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 + + punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 + pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm6, xmm5 + movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 + + movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 + lea rsi, [rsi + rdx] ; next line + + pmaddubsw xmm6, xmm1 + + punpcklbw xmm4, xmm5 + pmaddubsw xmm4, xmm1 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 + + paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value + psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 + + packuswb xmm6, xmm4 + movdqa xmm5, xmm7 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm2 + + punpckhbw xmm7, xmm6 + pmaddubsw xmm7, xmm2 + + paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value + psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm5, xmm7 + movdqa xmm7, xmm6 + + movdqa [rdi], xmm5 ; store the results in the destination +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(5) ; dst_pitch +%else + add rdi, r8 +%endif + + cmp rdi, rcx + jne .next_row + + jmp .done + +.b16x16_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ; dst_ptr + mov rsi, arg(0) ; src_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm1, [rax] ; VFilter + + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + + ; get the first horizontal line done + movq xmm4, [rsi] ; load row 0 + movq xmm2, [rsi + 8] ; load row 0 + + lea rsi, [rsi + rax] ; next line 
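+; note: in this second-pass-only path (xoffset == 0) every output pixel is a
+; vertical blend: dst[r][c] = (v0 * src[r][c] + v1 * src[r+1][c] + 64) >> 7,
+; with (v0, v1) taken from bilinear_filters_ssse3[yoffset]. The loop below
+; interleaves row r with row r+1 so one pmaddubsw applies both taps at once,
+; and it produces two output rows per iteration (informal reading of the code).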
+.next_row_sp: + movq xmm3, [rsi] ; load row + 1 + movq xmm5, [rsi + 8] ; load row + 1 + + punpcklbw xmm4, xmm3 + punpcklbw xmm2, xmm5 + + pmaddubsw xmm4, xmm1 + movq xmm7, [rsi + rax] ; load row + 2 + + pmaddubsw xmm2, xmm1 + movq xmm6, [rsi + rax + 8] ; load row + 2 + + punpcklbw xmm3, xmm7 + punpcklbw xmm5, xmm6 + + pmaddubsw xmm3, xmm1 + paddw xmm4, [GLOBAL(rd)] + + pmaddubsw xmm5, xmm1 + paddw xmm2, [GLOBAL(rd)] + + psraw xmm4, VP9_FILTER_SHIFT + psraw xmm2, VP9_FILTER_SHIFT + + packuswb xmm4, xmm2 + paddw xmm3, [GLOBAL(rd)] + + movdqa [rdi], xmm4 ; store row 0 + paddw xmm5, [GLOBAL(rd)] + + psraw xmm3, VP9_FILTER_SHIFT + psraw xmm5, VP9_FILTER_SHIFT + + packuswb xmm3, xmm5 + movdqa xmm4, xmm7 + + movdqa [rdi + rdx],xmm3 ; store row 1 + lea rsi, [rsi + 2*rax] + + movdqa xmm2, xmm6 + lea rdi, [rdi + 2*rdx] + + cmp rdi, rcx + jne .next_row_sp + + jmp .done + +.b16x16_fp_only: + lea rcx, [rdi+rdx*8] + lea rcx, [rcx+rdx*8] + movsxd rax, dword ptr arg(1) ; src_pixels_per_line + +.next_row_fp: + movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 + movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 + + punpcklbw xmm2, xmm4 + movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 + + pmaddubsw xmm2, xmm1 + movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 + + lea rsi, [rsi + rax] ; next line + punpcklbw xmm3, xmm4 + + pmaddubsw xmm3, xmm1 + movq xmm5, [rsi] + + paddw xmm2, [GLOBAL(rd)] + movq xmm7, [rsi+1] + + movq xmm6, [rsi+8] + psraw xmm2, VP9_FILTER_SHIFT + + punpcklbw xmm5, xmm7 + movq xmm7, [rsi+9] + + paddw xmm3, [GLOBAL(rd)] + pmaddubsw xmm5, xmm1 + + psraw xmm3, VP9_FILTER_SHIFT + punpcklbw xmm6, xmm7 + + packuswb xmm2, xmm3 + pmaddubsw xmm6, xmm1 + + movdqa [rdi], xmm2 ; store the results in the destination + paddw xmm5, [GLOBAL(rd)] + + lea rdi, [rdi + rdx] ; dst_pitch + psraw xmm5, VP9_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm6, VP9_FILTER_SHIFT + + packuswb xmm5, xmm6 + lea rsi, [rsi + rax] ; next line + + movdqa [rdi], xmm5 ; store the results in the destination + lea rdi, [rdi + rdx] ; dst_pitch + + cmp rdi, rcx + + jne .next_row_fp + +.done: + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_bilinear_predict8x8_ssse3 +;( +; unsigned char *src_ptr, +; int src_pixels_per_line, +; int xoffset, +; int yoffset, +; unsigned char *dst_ptr, +; int dst_pitch +;) +global sym(vp9_bilinear_predict8x8_ssse3) +sym(vp9_bilinear_predict8x8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 144 ; reserve 144 bytes + + lea rcx, [GLOBAL(bilinear_filters_ssse3)] + + mov rsi, arg(0) ;src_ptr + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line + + ;Read 9-line unaligned data in and put them on stack. This gives a big + ;performance boost. 
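+; note: the 8x8 bilinear case reads height + 1 = 9 source rows; staging them
+; once here with unaligned movdqu into this aligned 144-byte (9 x 16) stack
+; block appears to be what lets the filter loops below re-read every row with
+; aligned movdqa accesses -- the performance gain referred to above.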
+ movdqu xmm0, [rsi] + lea rax, [rdx + rdx*2] + movdqu xmm1, [rsi+rdx] + movdqu xmm2, [rsi+rdx*2] + add rsi, rax + movdqu xmm3, [rsi] + movdqu xmm4, [rsi+rdx] + movdqu xmm5, [rsi+rdx*2] + add rsi, rax + movdqu xmm6, [rsi] + movdqu xmm7, [rsi+rdx] + + movdqa XMMWORD PTR [rsp], xmm0 + + movdqu xmm0, [rsi+rdx*2] + + movdqa XMMWORD PTR [rsp+16], xmm1 + movdqa XMMWORD PTR [rsp+32], xmm2 + movdqa XMMWORD PTR [rsp+48], xmm3 + movdqa XMMWORD PTR [rsp+64], xmm4 + movdqa XMMWORD PTR [rsp+80], xmm5 + movdqa XMMWORD PTR [rsp+96], xmm6 + movdqa XMMWORD PTR [rsp+112], xmm7 + movdqa XMMWORD PTR [rsp+128], xmm0 + + movsxd rax, dword ptr arg(2) ; xoffset + cmp rax, 0 ; skip first_pass filter if xoffset=0 + je .b8x8_sp_only + + shl rax, 4 + add rax, rcx ; HFilter + + mov rdi, arg(4) ; dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] + + movsxd rax, dword ptr arg(3) ; yoffset + cmp rax, 0 ; skip second_pass filter if yoffset=0 + je .b8x8_fp_only + + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + lea rcx, [rdi+rdx*8] + + movdqa xmm1, [rax] + + ; get the first horizontal line done + movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx + + psrldq xmm5, 1 + lea rsp, [rsp + 16] ; next line + + punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 + pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 + + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value + psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 + + movdqa xmm7, xmm3 + packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + +.next_row: + movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 + lea rsp, [rsp + 16] ; next line + + movdqa xmm5, xmm6 + + psrldq xmm5, 1 + + punpcklbw xmm6, xmm5 + pmaddubsw xmm6, xmm0 + + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value + psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 + + packuswb xmm6, xmm6 + + punpcklbw xmm7, xmm6 + pmaddubsw xmm7, xmm1 + + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value + psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 + + packuswb xmm7, xmm7 + + movq [rdi], xmm7 ; store the results in the destination + lea rdi, [rdi + rdx] + + movdqa xmm7, xmm6 + + cmp rdi, rcx + jne .next_row + + jmp .done8x8 + +.b8x8_sp_only: + movsxd rax, dword ptr arg(3) ; yoffset + shl rax, 4 + lea rax, [rax + rcx] ; VFilter + + mov rdi, arg(4) ;dst_ptr + movsxd rdx, dword ptr arg(5) ; dst_pitch + + movdqa xmm0, [rax] ; VFilter + + movq xmm1, XMMWORD PTR [rsp] + movq xmm2, XMMWORD PTR [rsp+16] + + movq xmm3, XMMWORD PTR [rsp+32] + punpcklbw xmm1, xmm2 + + movq xmm4, XMMWORD PTR [rsp+48] + punpcklbw xmm2, xmm3 + + movq xmm5, XMMWORD PTR [rsp+64] + punpcklbw xmm3, xmm4 + + movq xmm6, XMMWORD PTR [rsp+80] + punpcklbw xmm4, xmm5 + + movq xmm7, XMMWORD PTR [rsp+96] + punpcklbw xmm5, xmm6 + + pmaddubsw xmm1, xmm0 + pmaddubsw xmm2, xmm0 + + pmaddubsw xmm3, xmm0 + pmaddubsw xmm4, xmm0 + + pmaddubsw xmm5, xmm0 + punpcklbw xmm6, xmm7 + + pmaddubsw xmm6, xmm0 + paddw xmm1, [GLOBAL(rd)] + + paddw xmm2, [GLOBAL(rd)] + psraw xmm1, VP9_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm2, VP9_FILTER_SHIFT + + paddw xmm4, [GLOBAL(rd)] + psraw xmm3, VP9_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm4, VP9_FILTER_SHIFT + + paddw xmm6, [GLOBAL(rd)] + psraw xmm5, VP9_FILTER_SHIFT + + psraw xmm6, VP9_FILTER_SHIFT + packuswb xmm1, xmm1 + + packuswb xmm2, xmm2 + movq [rdi], xmm1 + + packuswb xmm3, xmm3 + movq [rdi+rdx], xmm2 + + packuswb xmm4, xmm4 + movq xmm1, XMMWORD PTR [rsp+112] + + lea rdi, [rdi + 
2*rdx] + movq xmm2, XMMWORD PTR [rsp+128] + + packuswb xmm5, xmm5 + movq [rdi], xmm3 + + packuswb xmm6, xmm6 + movq [rdi+rdx], xmm4 + + lea rdi, [rdi + 2*rdx] + punpcklbw xmm7, xmm1 + + movq [rdi], xmm5 + pmaddubsw xmm7, xmm0 + + movq [rdi+rdx], xmm6 + punpcklbw xmm1, xmm2 + + pmaddubsw xmm1, xmm0 + paddw xmm7, [GLOBAL(rd)] + + psraw xmm7, VP9_FILTER_SHIFT + paddw xmm1, [GLOBAL(rd)] + + psraw xmm1, VP9_FILTER_SHIFT + packuswb xmm7, xmm7 + + packuswb xmm1, xmm1 + lea rdi, [rdi + 2*rdx] + + movq [rdi], xmm7 + + movq [rdi+rdx], xmm1 + lea rsp, [rsp + 144] + + jmp .done8x8 + +.b8x8_fp_only: + lea rcx, [rdi+rdx*8] + +.next_row_fp: + movdqa xmm1, XMMWORD PTR [rsp] + movdqa xmm3, XMMWORD PTR [rsp+16] + + movdqa xmm2, xmm1 + movdqa xmm5, XMMWORD PTR [rsp+32] + + psrldq xmm2, 1 + movdqa xmm7, XMMWORD PTR [rsp+48] + + movdqa xmm4, xmm3 + psrldq xmm4, 1 + + movdqa xmm6, xmm5 + psrldq xmm6, 1 + + punpcklbw xmm1, xmm2 + pmaddubsw xmm1, xmm0 + + punpcklbw xmm3, xmm4 + pmaddubsw xmm3, xmm0 + + punpcklbw xmm5, xmm6 + pmaddubsw xmm5, xmm0 + + movdqa xmm2, xmm7 + psrldq xmm2, 1 + + punpcklbw xmm7, xmm2 + pmaddubsw xmm7, xmm0 + + paddw xmm1, [GLOBAL(rd)] + psraw xmm1, VP9_FILTER_SHIFT + + paddw xmm3, [GLOBAL(rd)] + psraw xmm3, VP9_FILTER_SHIFT + + paddw xmm5, [GLOBAL(rd)] + psraw xmm5, VP9_FILTER_SHIFT + + paddw xmm7, [GLOBAL(rd)] + psraw xmm7, VP9_FILTER_SHIFT + + packuswb xmm1, xmm1 + packuswb xmm3, xmm3 + + packuswb xmm5, xmm5 + movq [rdi], xmm1 + + packuswb xmm7, xmm7 + movq [rdi+rdx], xmm3 + + lea rdi, [rdi + 2*rdx] + movq [rdi], xmm5 + + lea rsp, [rsp + 4*16] + movq [rdi+rdx], xmm7 + + lea rdi, [rdi + 2*rdx] + cmp rdi, rcx + + jne .next_row_fp + + lea rsp, [rsp + 16] + +.done8x8: + ;add rsp, 144 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +shuf1b: + db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 +shuf2b: + db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 +shuf3b: + db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 + +align 16 +shuf2bfrom1: + db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 +align 16 +shuf3bfrom1: + db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 + +align 16 +rd: + times 8 dw 0x40 + +align 16 +k0_k5: + times 8 db 0, 0 ;placeholder + times 8 db 0, 0 + times 8 db 2, 1 + times 8 db 0, 0 + times 8 db 3, 3 + times 8 db 0, 0 + times 8 db 1, 2 + times 8 db 0, 0 +k1_k3: + times 8 db 0, 0 ;placeholder + times 8 db -6, 12 + times 8 db -11, 36 + times 8 db -9, 50 + times 8 db -16, 77 + times 8 db -6, 93 + times 8 db -8, 108 + times 8 db -1, 123 +k2_k4: + times 8 db 128, 0 ;placeholder + times 8 db 123, -1 + times 8 db 108, -8 + times 8 db 93, -6 + times 8 db 77, -16 + times 8 db 50, -9 + times 8 db 36, -11 + times 8 db 12, -6 +align 16 +bilinear_filters_ssse3: + times 8 db 128, 0 + times 8 db 120, 8 + times 8 db 112, 16 + times 8 db 104, 24 + times 8 db 96, 32 + times 8 db 88, 40 + times 8 db 80, 48 + times 8 db 72, 56 + times 8 db 64, 64 + times 8 db 56, 72 + times 8 db 48, 80 + times 8 db 40, 88 + times 8 db 32, 96 + times 8 db 24, 104 + times 8 db 16, 112 + times 8 db 8, 120 + diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h new file mode 100644 index 000000000..4c224da3b --- /dev/null +++ b/vp9/common/x86/vp9_subpixel_x86.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef SUBPIXEL_X86_H +#define SUBPIXEL_X86_H + +/* Note: + * + * This platform is commonly built for runtime CPU detection. If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx); +extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx); +extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx); +extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx); +extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx); +extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx); +extern prototype_subpixel_predict(vp9_bilinear_predict8x4_mmx); +extern prototype_subpixel_predict(vp9_bilinear_predict4x4_mmx); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_subpix_sixtap16x16 +#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx + +#undef vp9_subpix_sixtap8x8 +#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx + +#undef vp9_subpix_sixtap8x4 +#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx + +#undef vp9_subpix_sixtap4x4 +#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx + +#undef vp9_subpix_bilinear16x16 +#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx + +#undef vp9_subpix_bilinear8x8 +#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_mmx + +#undef vp9_subpix_bilinear8x4 +#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_mmx + +#undef vp9_subpix_bilinear4x4 +#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2); +extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2); +extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2); +extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2); +extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2); + + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_subpix_sixtap16x16 +#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2 + +#undef vp9_subpix_sixtap8x8 +#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2 + +#undef vp9_subpix_sixtap8x4 +#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2 + +#undef vp9_subpix_bilinear16x16 +#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2 + +#undef vp9_subpix_bilinear8x8 +#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2 + +#endif +#endif + +#if HAVE_SSSE3 +extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3); +extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3); +extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3); +extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3); +extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3); +extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_subpix_sixtap16x16 +#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3 + +#undef vp9_subpix_sixtap8x8 +#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3 + +#undef vp9_subpix_sixtap8x4 +#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3 + +#undef vp9_subpix_sixtap4x4 +#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3 + + +#undef vp9_subpix_bilinear16x16 +#define vp9_subpix_bilinear16x16 
vp9_bilinear_predict16x16_ssse3
+
+#undef  vp9_subpix_bilinear8x8
+#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
+
+#endif
+#endif
+
+
+
+#endif
diff --git a/vp9/common/x86/vp9_vp8_asm_stubs.c b/vp9/common/x86/vp9_vp8_asm_stubs.c
new file mode 100644
index 000000000..f804af698
--- /dev/null
+++ b/vp9/common/x86/vp9_vp8_asm_stubs.c
@@ -0,0 +1,602 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/mem.h"
+#include "vp9/common/vp9_subpixel.h"
+#ifdef ANNOUNCE_FUNCTION
+#include <stdio.h>  /* for the debug printf calls below */
+#endif
+
+extern const short vp9_six_tap_mmx[16][6 * 8];
+
+extern const short vp9_bilinear_filters_8x_mmx[16][2 * 8];
+
+extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr,
+                                      unsigned short *output_ptr,
+                                      unsigned int src_pixels_per_line,
+                                      unsigned int pixel_step,
+                                      unsigned int output_height,
+                                      unsigned int output_width,
+                                      const short *vp9_filter);
+
+extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
+                                       unsigned char *output_ptr,
+                                       int output_pitch,
+                                       unsigned int pixels_per_line,
+                                       unsigned int pixel_step,
+                                       unsigned int output_height,
+                                       unsigned int output_width,
+                                       const short *vp9_filter);
+
+extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr,
+                                        unsigned short *output_ptr,
+                                        unsigned int src_pixels_per_line,
+                                        unsigned int pixel_step,
+                                        unsigned int output_height,
+                                        unsigned int output_width,
+                                        const short *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr,
+                                         unsigned short *output_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned int pixel_step,
+                                         unsigned int output_height,
+                                         unsigned int output_width,
+                                         const short *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
+                                        unsigned char *output_ptr,
+                                        int dst_pitch,
+                                        unsigned int pixels_per_line,
+                                        unsigned int pixel_step,
+                                        unsigned int output_height,
+                                        unsigned int output_width,
+                                        const short *vp9_filter);
+
+extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
+                                         unsigned char *output_ptr,
+                                         int dst_pitch,
+                                         unsigned int pixels_per_line,
+                                         unsigned int pixel_step,
+                                         unsigned int output_height,
+                                         unsigned int output_width,
+                                         const short *vp9_filter);
+
+extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
+                                         unsigned short *output_ptr,
+                                         unsigned int src_pixels_per_line,
+                                         unsigned int output_height,
+                                         unsigned int output_width);
+
+extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned char *output_ptr,
+                                             int dst_pitch,
+                                             unsigned int output_height,
+                                             const short *vp9_filter);
+
+extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
+                                              unsigned int src_pixels_per_line,
+                                              unsigned char *output_ptr,
+                                              int dst_pitch,
+                                              unsigned int output_height,
+                                              const short *vp9_filter);
+
+extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
+                                             unsigned int src_pixels_per_line,
+                                             unsigned char *output_ptr,
+                                             int dst_pitch,
+                                             unsigned int output_height,
+                                             const short *vp9_filter);
+
+extern prototype_subpixel_predict(vp9_bilinear_predict8x8_mmx);
+
+#if HAVE_MMX
+void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               unsigned
char *dst_ptr,
+                               int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict4x4_mmx\n");
+#endif
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
+  const short *hfilter, *vfilter;
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
+                            src_pixels_per_line, 1, 9, 8, hfilter);
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
+                             8, 4, 4, 4, vfilter);
+}
+
+void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr,
+                                 int src_pixels_per_line,
+                                 int xoffset,
+                                 int yoffset,
+                                 unsigned char *dst_ptr,
+                                 int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict16x16_mmx\n");
+#endif
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+  const short *hfilter, *vfilter;
+
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+                            fdata2, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
+                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
+                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,
+                            hfilter);
+
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch,
+                             32, 16, 16, 16, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch,
+                             32, 16, 16, 16, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch,
+                             32, 16, 16, 16, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
+                             32, 16, 16, 16, vfilter);
+}
+
+void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               unsigned char *dst_ptr,
+                               int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x8_mmx\n");
+#endif
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+                            fdata2, src_pixels_per_line, 1, 13, 16,
+                            hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,
+                            hfilter);
+
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
+                             16, 8, 8, 8, vfilter);
+  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+                             16, 8, 8, 8, vfilter);
+}
+
+void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr,
+                               int src_pixels_per_line,
+                               int xoffset,
+                               int yoffset,
+                               unsigned char *dst_ptr,
+                               int dst_pitch) {
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x4_mmx\n");
+#endif
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+
+  hfilter = vp9_six_tap_mmx[xoffset];
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
+                            fdata2, src_pixels_per_line, 1, 9, 16, hfilter);
+  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
+                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
+
+  vfilter = vp9_six_tap_mmx[yoffset];
+  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
+                             16, 8, 4, 8, vfilter);
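+  /* note: same two-pass arrangement as the MMX functions above -- the
+   * horizontal 6-tap pass has already written 16-bit intermediate rows into
+   * fdata2 (16 shorts per row, with lead-in rows for the vertical taps), and
+   * each vp9_filter_block1dc_v6_mmx call filters a 4-pixel-wide strip of
+   * columns; the call below covers columns 4..7 (fdata2 + 20, dst_ptr + 4). */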
+  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
+                             16, 8, 4, 8, vfilter);
+}
+
+void vp9_bilinear_predict16x16_mmx(unsigned char *src_ptr,
+                                   int src_pixels_per_line,
+                                   int xoffset,
+                                   int yoffset,
+                                   unsigned char *dst_ptr,
+                                   int dst_pitch) {
+  vp9_bilinear_predict8x8_mmx(src_ptr,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr, dst_pitch);
+  vp9_bilinear_predict8x8_mmx(src_ptr + 8,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr + 8, dst_pitch);
+  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr + dst_pitch * 8, dst_pitch);
+  vp9_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8,
+                              src_pixels_per_line, xoffset, yoffset,
+                              dst_ptr + dst_pitch * 8 + 8, dst_pitch);
+}
+#endif
+
+#if HAVE_SSE2
+void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr,
+                                  int src_pixels_per_line,
+                                  int xoffset,
+                                  int yoffset,
+                                  unsigned char *dst_ptr,
+                                  int dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict16x16_sse2\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                   src_pixels_per_line, 1, 21, 32, hfilter);
+      vfilter = vp9_six_tap_mmx[yoffset];
+      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+                                   32, 16, 16, dst_pitch, vfilter);
+    } else {
+      /* First-pass only */
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
+                                        dst_ptr, dst_pitch, 16, hfilter);
+    }
+  } else {
+    /* Second-pass only */
+    vfilter = vp9_six_tap_mmx[yoffset];
+    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                 src_pixels_per_line, 21, 32);
+    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
+                                 32, 16, 16, dst_pitch, vfilter);
+  }
+}
+
+void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr,
+                                int src_pixels_per_line,
+                                int xoffset,
+                                int yoffset,
+                                unsigned char *dst_ptr,
+                                int dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x8_sse2\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
+                                  src_pixels_per_line, 1, 13, 16, hfilter);
+      vfilter = vp9_six_tap_mmx[yoffset];
+      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
+                                  16, 8, 8, dst_pitch, vfilter);
+    } else {
+      /* First-pass only */
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
+                                       dst_ptr, dst_pitch, 8, hfilter);
+    }
+  } else {
+    /* Second-pass only */
+    vfilter = vp9_six_tap_mmx[yoffset];
+    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
+                                     src_pixels_per_line,
+                                     dst_ptr, dst_pitch, 8, vfilter);
+  }
+}
+
+void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr,
+                                int src_pixels_per_line,
+                                int xoffset,
+                                int yoffset,
+                                unsigned char *dst_ptr,
+                                int dst_pitch) {
+  /* Temp data buffer used in filtering */
+  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
+  const short *hfilter, *vfilter;
+#ifdef ANNOUNCE_FUNCTION
+  printf("vp9_sixtap_predict8x4_sse2\n");
+#endif
+
+  if (xoffset) {
+    if (yoffset) {
+      hfilter = vp9_six_tap_mmx[xoffset];
+      vp9_filter_block1d8_h6_sse2(src_ptr - (2 *
src_pixels_per_line), fdata2, + src_pixels_per_line, 1, 9, 16, hfilter); + vfilter = vp9_six_tap_mmx[yoffset]; + vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, + 16, 8, 4, dst_pitch, vfilter); + } else { + /* First-pass only */ + hfilter = vp9_six_tap_mmx[xoffset]; + vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 4, hfilter); + } + } else { + /* Second-pass only */ + vfilter = vp9_six_tap_mmx[yoffset]; + vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 4, vfilter); + } +} +#endif + +#if HAVE_SSSE3 +extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp9_filter_index); + +extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp9_filter_index); + +extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp9_filter_index); + +extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp9_filter_index); + +extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr, + unsigned int src_pixels_per_line, + unsigned char *output_ptr, + unsigned int output_pitch, + unsigned int output_height, + unsigned int vp9_filter_index); + +extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr, + unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + unsigned int vp9_filter_index); + +void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24); +#ifdef ANNOUNCE_FUNCTION + printf("vp9_sixtap_predict16x16_ssse3\n"); +#endif + + if (xoffset) { + if (yoffset) { + vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + fdata2, 16, 21, xoffset); + vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch, + 16, yoffset); + } else { + /* First-pass only */ + vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 16, xoffset); + } + } else { + /* Second-pass only */ + vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 16, yoffset); + } +} + +void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); +#ifdef ANNOUNCE_FUNCTION + printf("vp9_sixtap_predict8x8_ssse3\n"); +#endif + + if (xoffset) { + if (yoffset) { + vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, fdata2, 8, 13, xoffset); + vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset); + } else { + vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 8, xoffset); + } + } else { + /* Second-pass only */ + vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + 
dst_ptr, dst_pitch, 8, yoffset); + } +} + +void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); +#ifdef ANNOUNCE_FUNCTION + printf("vp9_sixtap_predict8x4_ssse3\n"); +#endif + + if (xoffset) { + if (yoffset) { + vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, fdata2, 8, 9, xoffset); + vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset); + } else { + /* First-pass only */ + vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 4, xoffset); + } + } else { + /* Second-pass only */ + vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 4, yoffset); + } +} + +void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9); +#ifdef ANNOUNCE_FUNCTION + printf("vp9_sixtap_predict4x4_ssse3\n"); +#endif + + if (xoffset) { + if (yoffset) { + vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, fdata2, 4, 9, xoffset); + vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset); + } else { + vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, + dst_ptr, dst_pitch, 4, xoffset); + } + } else { + vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), + src_pixels_per_line, + dst_ptr, dst_pitch, 4, yoffset); + } +} + +void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr, + const unsigned int src_stride, + const short *hfilter_aligned16, + const short *vfilter_aligned16, + unsigned char *dst_ptr, + unsigned int dst_stride) { + if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); + + vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride, + fdata2, 16, 23, hfilter_aligned16); + vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16, + vfilter_aligned16); + } else { + if (hfilter_aligned16[3] != 128) { + vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, + 16, hfilter_aligned16); + } else { + vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride, + dst_ptr, dst_stride, 16, vfilter_aligned16); + } + } +} + +void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr, + const unsigned int src_stride, + const short *hfilter_aligned16, + const short *vfilter_aligned16, + unsigned char *dst_ptr, + unsigned int dst_stride) { + 
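  /* note: element 3 of the 8-tap filter is its center tap; a value of 128
+   * (the filter weight) appears to mark the identity kernel for that
+   * direction, so the branches below skip the corresponding pass entirely.
+   * Only the fully two-dimensional case stages 8 + 7 context rows through
+   * the fdata2 temp block before filtering vertically. */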
+  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                 fdata2, 16, 15, hfilter_aligned16);
+    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
+                                 vfilter_aligned16);
+  } else {
+    if (hfilter_aligned16[3] != 128) {
+      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
+                                   hfilter_aligned16);
+    } else {
+      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                   dst_ptr, dst_stride, 8, vfilter_aligned16);
+    }
+  }
+}
+
+void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
+                                    const unsigned int src_stride,
+                                    const short *hfilter_aligned16,
+                                    const short *vfilter_aligned16,
+                                    unsigned char *dst_ptr,
+                                    unsigned int dst_stride) {
+  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
+    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+
+    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                 fdata2, 16, 11, hfilter_aligned16);
+    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
+                                 vfilter_aligned16);
+  } else {
+    if (hfilter_aligned16[3] != 128) {
+      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
+                                   hfilter_aligned16);
+    } else {
+      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
+                                   dst_ptr, dst_stride, 4, vfilter_aligned16);
+    }
+  }
+}
+#endif
diff --git a/vp9/common/x86/vp9_x86_systemdependent.c b/vp9/common/x86/vp9_x86_systemdependent.c
new file mode 100644
index 000000000..8a9f04c9a
--- /dev/null
+++ b/vp9/common/x86/vp9_x86_systemdependent.c
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vpx_ports/x86.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_pragmas.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+void vp9_arch_x86_common_init(VP9_COMMON *ctx) {
+#if CONFIG_RUNTIME_CPU_DETECT
+  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
+  int flags = x86_simd_caps();
+
+  /* Note:
+   *
+   * This platform can be built without runtime CPU detection as well. If
+   * you modify any of the function mappings present in this file, be sure
+   * to also update them in static mappings (/filename_.h)
+   */
+
+  /* Override default functions with fastest ones for this CPU. */
+#if HAVE_MMX
+// The commented functions need to be re-written for vpx.
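+  /* note: x86_simd_caps() returns a bitmask of HAS_MMX/HAS_SSE2/HAS_SSSE3
+   * style flags; the blocks below run in ascending order of capability and
+   * overwrite the default C entries in the rtcd table, so the fastest
+   * variant the running CPU supports ends up installed. */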
+  if (flags & HAS_MMX) {
+
+#if CONFIG_POSTPROC
+    rtcd->postproc.down = vp9_mbpost_proc_down_mmx;
+    /*rtcd->postproc.across = vp9_mbpost_proc_across_ip_c;*/
+    rtcd->postproc.downacross = vp9_post_proc_down_and_across_mmx;
+    rtcd->postproc.addnoise = vp9_plane_add_noise_mmx;
+#endif
+  }
+
+#endif
+#if HAVE_SSE2
+
+  if (flags & HAS_SSE2) {
+
+
+    // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_sse2;
+
+#if CONFIG_POSTPROC
+    rtcd->postproc.down = vp9_mbpost_proc_down_xmm;
+    rtcd->postproc.across = vp9_mbpost_proc_across_ip_xmm;
+    rtcd->postproc.downacross = vp9_post_proc_down_and_across_xmm;
+    rtcd->postproc.addnoise = vp9_plane_add_noise_wmt;
+#endif
+  }
+
+#endif
+
+#if HAVE_SSSE3
+
+  if (flags & HAS_SSSE3) {
+
+    /* these are disabled because of unsupported diagonal pred modes
+    rtcd->recon.build_intra_predictors_mbuv =
+      vp9_build_intra_predictors_mbuv_ssse3;
+    rtcd->recon.build_intra_predictors_mbuv_s =
+      vp9_build_intra_predictors_mbuv_s_ssse3;
+    */
+  }
+#endif
+
+#endif
+}
diff --git a/vp9/common/x86/x86_systemdependent.c b/vp9/common/x86/x86_systemdependent.c
deleted file mode 100644
index 62e75ffde..000000000
--- a/vp9/common/x86/x86_systemdependent.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp9/common/loopfilter.h"
-#include "vp9/common/pragmas.h"
-#include "vp9/common/onyxc_int.h"
-
-void vp9_arch_x86_common_init(VP9_COMMON *ctx) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
-  int flags = x86_simd_caps();
-
-  /* Note:
-   *
-   * This platform can be built without runtime CPU detection as well. If
-   * you modify any of the function mappings present in this file, be sure
-   * to also update them in static mapings (/filename_.h)
-   */
-
-  /* Override default functions with fastest ones for this CPU. */
-#if HAVE_MMX
-// The commented functions need to be re-written for vpx.
-  if (flags & HAS_MMX) {
-
-#if CONFIG_POSTPROC
-    rtcd->postproc.down = vp9_mbpost_proc_down_mmx;
-    /*rtcd->postproc.across = vp9_mbpost_proc_across_ip_c;*/
-    rtcd->postproc.downacross = vp9_post_proc_down_and_across_mmx;
-    rtcd->postproc.addnoise = vp9_plane_add_noise_mmx;
-#endif
-  }
-
-#endif
-#if HAVE_SSE2
-
-  if (flags & HAS_SSE2) {
-
-
-    // rtcd->idct.iwalsh16 = vp9_short_inv_walsh4x4_sse2;
-
-#if CONFIG_POSTPROC
-    rtcd->postproc.down = vp9_mbpost_proc_down_xmm;
-    rtcd->postproc.across = vp9_mbpost_proc_across_ip_xmm;
-    rtcd->postproc.downacross = vp9_post_proc_down_and_across_xmm;
-    rtcd->postproc.addnoise = vp9_plane_add_noise_wmt;
-#endif
-  }
-
-#endif
-
-#if HAVE_SSSE3
-
-  if (flags & HAS_SSSE3) {
-
-    /* these are disable because of unsupported diagonal pred modes
-    rtcd->recon.build_intra_predictors_mbuv =
-      vp9_build_intra_predictors_mbuv_ssse3;
-    rtcd->recon.build_intra_predictors_mbuv_s =
-      vp9_build_intra_predictors_mbuv_s_ssse3;
-    */
-  }
-#endif
-
-#endif
-}
-- 
cgit v1.2.3