48 files changed, 313 insertions, 6033 deletions
diff --git a/.mailmap b/.mailmap
@@ -1,18 +1,26 @@
 Adrian Grange <agrange@google.com>
+Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Deb Mukherjee <debargha@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
 Hangyu Kuang <hkuang@google.com>
 Jim Bankoski <jimbankoski@google.com>
-John Koleszar <jkoleszar@google.com>
 Johann Koenig <johannkoenig@google.com>
 Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+John Koleszar <jkoleszar@google.com>
+Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Marco Paniconi <marpan@google.com>
+Marco Paniconi <marpan@google.com> <marpan@chromium.org>
 Pascal Massimino <pascal.massimino@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Sami Pietilä <samipietila@google.com>
+Tamar Levy <tamar.levy@intel.com>
+Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
 Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
-Ralph Giles <giles@xiph.org> <giles@entropywave.com>
-Ralph Giles <giles@xiph.org> <giles@mozilla.com>
-Alpha Lam <hclam@google.com> <hclam@chromium.org>
-Deb Mukherjee <debargha@google.com>
 Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
diff --git a/AUTHORS b/AUTHORS
@@ -3,10 +3,11 @@
 Aaron Watry <awatry@gmail.com>
 Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
 Adrian Grange <agrange@google.com>
 Ahmad Sharif <asharif@google.com>
 Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alex Converse <alex.converse@gmail.com>
+Alex Converse <aconverse@google.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
@@ -14,44 +15,58 @@
 A.Mahfoodh <ab.mahfoodh@gmail.com>
 Ami Fischman <fischman@chromium.org>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
 changjun.yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
 chm <chm@rock-chips.com>
 Christian Duvivier <cduvivier@google.com>
 Daniel Kang <ddkang@google.com>
 Deb Mukherjee <debargha@google.com>
+Dim Temp <dimtemp0@gmail.com>
 Dmitry Kovalev <dkovalev@google.com>
 Dragan Mrdjan <dmrdjan@mips.com>
-Erik Niemeyer <erik.a.niemeyer@gmail.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
 Guillaume Martres <gmartres@google.com>
 Guillermo Ballester Valor <gbvalor@gmail.com>
 Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
 Henrik Lundin <hlundin@google.com>
 Hui Su <huisu@google.com>
 Ivan Maltz <ivanmaltz@google.com>
+Jacek Caban <cjacek@gmail.com>
+JackyChen <jackychen@google.com>
 James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
 James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
 Jan Kratochvil <jan.kratochvil@redhat.com>
 Janne Salonen <jsalonen@google.com>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
 Jeff Petkau <jpet@chromium.org>
+Jia Jia <jia.jia@linaro.org>
 Jim Bankoski <jimbankoski@google.com>
 Jingning Han <jingning@google.com>
+Joey Parrish <joeyparrish@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
+John Stark <jhnstrk@gmail.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
 Joshua Litt <joshualitt@google.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 KO Myung-Hun <komh@chollian.net>
+Lawrence Velázquez <larryv@macports.org>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
 Makoto Kato <makoto.kt@gmail.com>
@@ -65,6 +80,7 @@
 Michael Kohler <michaelkohler@live.com>
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
 Morton Jonuschat <yabawock@gmail.com>
 Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
@@ -72,6 +88,8 @@
 Patrik Westin <patrik.westin@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
+Pengchong Jin <pengchong@google.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
@@ -79,22 +97,29 @@
 Rafaël Carré <funman@videolan.org>
 Ralph Giles <giles@xiph.org>
 Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rbultje@google.com>
+Rui Ueyama <ruiu@google.com>
 Sami Pietilä <samipietila@google.com>
 Scott Graham <scottmg@chromium.org>
 Scott LaVarnway <slavarnway@google.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
 Tero Rintaluoma <teror@google.com>
 Thijs Vermeir <thijsvermeir@gmail.com>
+Tim Kopp <tkopp@google.com>
 Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
 Yaowu Xu <yaowu@google.com>
+Yongzhe Wang <yongzhe@google.com>
 Yunqing Wang <yunqingwang@google.com>
+Zoe Liu <zoeliu@google.com>
 Google Inc.
 The Mozilla Foundation
 The Xiph.Org Foundation
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,3 +1,26 @@
+2015-04-03 v1.4.0 "Indian Runner Duck"
+  This release includes significant improvements to the VP9 codec.
+
+  - Upgrading:
+    This release is ABI incompatible with 1.3.0. It drops the compatibility
+    layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
+    controls for VP9.
+
+  - Enhancements:
+      Faster VP9 encoding and decoding
+      Multithreaded VP9 decoding (tile and frame-based)
+      Multithreaded VP9 encoding - on by default
+      YUV 4:2:2 and 4:4:4 support in VP9
+      10 and 12bit support in VP9
+      64bit ARM support by replacing ARM assembly with intrinsics
+
+  - Bug Fixes:
+    Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
+    files.
+
+  - Known Issues:
+    Frame Parallel decoding fails for segmented and non-420 files.
+
 2013-11-15 v1.3.0 "Forest"
   This release introduces the VP9 codec in a backward-compatible way.
   All existing users of VP8 can continue to use the library without
diff --git a/README b/README
@@ -1,4 +1,4 @@
-README - 30 May 2014
+README - 23 March 2015
 
 Welcome to the WebM VP8/VP9 Codec SDK!
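[Editor's note] The Upgrading note in the CHANGELOG hunk above is the main consumer-visible change: the 1.3.0 compatibility layer is gone, so the old unprefixed image-format names no longer compile. A minimal before/after sketch of the rename (the helper function, dimensions, and alignment below are illustrative, not code from this tree):

    #include "vpx/vpx_image.h"

    /* Illustrative helper: allocate a 640x480 I420 frame. */
    static vpx_image_t *alloc_frame(void) {
      /* libvpx <= 1.3.0 accepted the unprefixed alias via its compat layer:
       *   return vpx_img_alloc(NULL, IMG_FMT_I420, 640, 480, 16);
       * 1.4.0 removes that layer, so the VPX_ prefix is required: */
      return vpx_img_alloc(NULL, VPX_IMG_FMT_I420, 640, 480, 16);
    }

The same ABI break is why the libs.mk hunk below introduces an independent SO_VERSION (2.0.0): the soname becomes libvpx.so.2 instead of tracking the 1.4.0 release number, so the dynamic linker will not pair binaries built against 1.3.0 with the new library.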
@@ -62,12 +62,6 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7s-darwin-gcc mips32-linux-gcc mips64-linux-gcc - ppc32-darwin8-gcc - ppc32-darwin9-gcc - ppc32-linux-gcc - ppc64-darwin8-gcc - ppc64-darwin9-gcc - ppc64-linux-gcc sparc-solaris-gcc x86-android-gcc x86-darwin8-gcc @@ -78,6 +72,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-darwin11-gcc x86-darwin12-gcc x86-darwin13-gcc + x86-darwin14-gcc x86-iphonesimulator-gcc x86-linux-gcc x86-linux-icc @@ -95,6 +90,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin11-gcc x86_64-darwin12-gcc x86_64-darwin13-gcc + x86_64-darwin14-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc @@ -111,6 +107,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: universal-darwin11-gcc universal-darwin12-gcc universal-darwin13-gcc + universal-darwin14-gcc generic-gnu The generic-gnu target, in conjunction with the CROSS environment variable, diff --git a/build/make/Makefile b/build/make/Makefile index e048e9cc8..fc7749a55 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -383,8 +383,8 @@ LIBS=$(call enabled,LIBS) .libs: $(LIBS) @touch $@ $(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib)))) -$(foreach lib,$(filter %so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib)))) -$(foreach lib,$(filter %$(VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib)))) +$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib)))) +$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib)))) INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS) ifeq ($(MAKECMDGOALS),dist) diff --git a/build/make/configure.sh b/build/make/configure.sh index e54de21ca..84ca4b9d3 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -640,12 +640,6 @@ process_common_toolchain() { *i[3456]86*) tgt_isa=x86 ;; - *powerpc64*) - tgt_isa=ppc64 - ;; - *powerpc*) - tgt_isa=ppc32 - ;; *sparc*) tgt_isa=sparc ;; @@ -1070,29 +1064,6 @@ EOF check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; - ppc*) - enable_feature ppc - bits=${tgt_isa##ppc} - link_with_cc=gcc - setup_gnu_toolchain - add_asflags -force_cpusubtype_ALL -I"\$(dir \$<)darwin" - soft_enable altivec - enabled altivec && add_cflags -maltivec - - case "$tgt_os" in - linux*) - add_asflags -maltivec -mregnames -I"\$(dir \$<)linux" - ;; - darwin*) - darwin_arch="-arch ppc" - enabled ppc64 && darwin_arch="${darwin_arch}64" - add_cflags ${darwin_arch} -m${bits} -fasm-blocks - add_asflags ${darwin_arch} -force_cpusubtype_ALL -I"\$(dir \$<)darwin" - add_ldflags ${darwin_arch} -m${bits} - enabled altivec && add_cflags -faltivec - ;; - esac - ;; x86*) case ${tgt_os} in win*) @@ -112,12 +112,6 @@ all_platforms="${all_platforms} armv7-win32-vs12" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" -all_platforms="${all_platforms} ppc32-darwin8-gcc" -all_platforms="${all_platforms} ppc32-darwin9-gcc" -all_platforms="${all_platforms} ppc32-linux-gcc" -all_platforms="${all_platforms} ppc64-darwin8-gcc" -all_platforms="${all_platforms} ppc64-darwin9-gcc" -all_platforms="${all_platforms} ppc64-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" all_platforms="${all_platforms} x86-darwin8-gcc" @@ -247,8 +241,6 @@ ARCH_LIST=" mips x86 x86_64 - ppc32 - ppc64 " ARCH_EXT_LIST=" 
edsp @@ -269,8 +261,6 @@ ARCH_EXT_LIST=" sse4_1 avx avx2 - - altivec " HAVE_LIST=" ${ARCH_EXT_LIST} @@ -621,12 +611,6 @@ process_toolchain() { universal-darwin*) darwin_ver=${tgt_os##darwin} - # Snow Leopard (10.6/darwin10) dropped support for PPC - # Include PPC support for all prior versions - if [ $darwin_ver -lt 10 ]; then - fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc" - fi - # Tiger (10.4/darwin8) brought support for x86 if [ $darwin_ver -ge 8 ]; then fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}" @@ -727,7 +711,7 @@ process_toolchain() { esac # Other toolchain specific defaults - case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac + case $toolchain in x86*|universal*) soft_enable postproc;; esac if enabled postproc_visualizer; then enabled postproc || die "postproc_visualizer requires postproc to be enabled" @@ -230,25 +230,27 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) BUILD_LIBVPX_SO := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED)) +SO_VERSION_MAJOR := 2 +SO_VERSION_MINOR := 0 +SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) -LIBVPX_SO := libvpx.$(VERSION_MAJOR).dylib +LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib EXPORT_FILE := libvpx.syms LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \ libvpx.dylib ) else -LIBVPX_SO := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH) +LIBVPX_SO := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH) EXPORT_FILE := libvpx.ver -SYM_LINK := libvpx.so LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \ - libvpx.so libvpx.so.$(VERSION_MAJOR) \ - libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR)) + libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \ + libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR)) endif LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\ $(notdir $(LIBVPX_SO_SYMLINKS)) $(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE) $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm -$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR) +$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR) $(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE) libvpx.ver: $(call enabled,CODEC_EXPORTS) diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index ba82da4e1..ba73f8612 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -230,7 +230,7 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct4x4_1_add_c, TX_4X4, 1))); -#if HAVE_NEON +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, PartialIDctTest, ::testing::Values( @@ -258,7 +258,7 @@ INSTANTIATE_TEST_CASE_P( &vp9_idct4x4_16_add_c, &vp9_idct4x4_1_add_neon, TX_4X4, 1))); -#endif // HAVE_NEON +#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( diff --git a/test/tools_common.sh b/test/tools_common.sh index 34c15168a..0ae011e9e 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -402,6 +402,7 @@ VP9_IVF_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-09-subpixel-00.ivf" VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm" VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm" +VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm" YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" YUV_RAW_INPUT_WIDTH=352 diff --git a/test/vp9_frame_parallel_test.cc b/test/vp9_frame_parallel_test.cc index 0594d752b..f0df88afa 100644 --- a/test/vp9_frame_parallel_test.cc +++ 
b/test/vp9_frame_parallel_test.cc @@ -29,7 +29,7 @@ using std::string; #if CONFIG_WEBM_IO -struct FileList { +struct PauseFileList { const char *name; // md5 sum for decoded frames which does not include skipped frames. const char *expected_md5; @@ -39,7 +39,8 @@ struct FileList { // Decodes |filename| with |num_threads|. Pause at the specified frame_num, // seek to next key frame and then continue decoding until the end. Return // the md5 of the decoded frames which does not include skipped frames. -string DecodeFile(const string &filename, int num_threads, int pause_num) { +string DecodeFileWithPause(const string &filename, int num_threads, + int pause_num) { libvpx_test::WebMVideoSource video(filename); video.Init(); int in_frames = 0; @@ -92,12 +93,12 @@ string DecodeFile(const string &filename, int num_threads, int pause_num) { return string(md5.Get()); } -void DecodeFiles(const FileList files[]) { - for (const FileList *iter = files; iter->name != NULL; ++iter) { +void DecodeFilesWithPause(const PauseFileList files[]) { + for (const PauseFileList *iter = files; iter->name != NULL; ++iter) { SCOPED_TRACE(iter->name); for (int t = 2; t <= 8; ++t) { EXPECT_EQ(iter->expected_md5, - DecodeFile(iter->name, t, iter->pause_frame_num)) + DecodeFileWithPause(iter->name, t, iter->pause_frame_num)) << "threads = " << t; } } @@ -106,19 +107,19 @@ void DecodeFiles(const FileList files[]) { TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) { // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with // one key frame for every ten frames. - static const FileList files[] = { + static const PauseFileList files[] = { { "vp90-2-07-frame_parallel-1.webm", - "6ea7c3875d67252e7caf2bc6e75b36b1", 6}, + "6ea7c3875d67252e7caf2bc6e75b36b1", 6 }, { "vp90-2-07-frame_parallel-1.webm", - "4bb634160c7356a8d7d4299b6dc83a45", 12}, + "4bb634160c7356a8d7d4299b6dc83a45", 12 }, { "vp90-2-07-frame_parallel-1.webm", - "89772591e6ef461f9fa754f916c78ed8", 26}, - { NULL, NULL, 0}, + "89772591e6ef461f9fa754f916c78ed8", 26 }, + { NULL, NULL, 0 }, }; - DecodeFiles(files); + DecodeFilesWithPause(files); } -struct InvalidFileList { +struct FileList { const char *name; // md5 sum for decoded frames which does not include corrupted frames. const char *expected_md5; @@ -128,8 +129,8 @@ struct InvalidFileList { // Decodes |filename| with |num_threads|. Return the md5 of the decoded // frames which does not include corrupted frames. -string DecodeInvalidFile(const string &filename, int num_threads, - int expected_frame_count) { +string DecodeFile(const string &filename, int num_threads, + int expected_frame_count) { libvpx_test::WebMVideoSource video(filename); video.Init(); @@ -173,37 +174,47 @@ string DecodeInvalidFile(const string &filename, int num_threads, return string(md5.Get()); } -void DecodeInvalidFiles(const InvalidFileList files[]) { - for (const InvalidFileList *iter = files; iter->name != NULL; ++iter) { +void DecodeFiles(const FileList files[]) { + for (const FileList *iter = files; iter->name != NULL; ++iter) { SCOPED_TRACE(iter->name); for (int t = 2; t <= 8; ++t) { EXPECT_EQ(iter->expected_md5, - DecodeInvalidFile(iter->name, t, iter->expected_frame_count)) + DecodeFile(iter->name, t, iter->expected_frame_count)) << "threads = " << t; } } } TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) { - static const InvalidFileList files[] = { + static const FileList files[] = { // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with // one key frame for every ten frames. 
The 11th frame has corrupted data. { "invalid-vp90-2-07-frame_parallel-1.webm", - "0549d0f45f60deaef8eb708e6c0eb6cb", 30}, + "0549d0f45f60deaef8eb708e6c0eb6cb", 30 }, // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with // one key frame for every ten frames. The 1st and 31st frames have // corrupted data. { "invalid-vp90-2-07-frame_parallel-2.webm", - "6a1f3cf6f9e7a364212fadb9580d525e", 20}, + "6a1f3cf6f9e7a364212fadb9580d525e", 20 }, // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with // one key frame for every ten frames. The 5th and 13th frames have // corrupted data. { "invalid-vp90-2-07-frame_parallel-3.webm", - "8256544308de926b0681e04685b98677", 27}, - { NULL, NULL, 0}, + "8256544308de926b0681e04685b98677", 27 }, + { NULL, NULL, 0 }, }; - DecodeInvalidFiles(files); + DecodeFiles(files); } +TEST(VP9MultiThreadedFrameParallel, ValidFileTest) { + static const FileList files[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { "vp92-2-20-10bit-yuv420.webm", + "a16b99df180c584e8db2ffeda987d293", 10 }, +#endif + { NULL, NULL, 0 }, + }; + DecodeFiles(files); +} #endif // CONFIG_WEBM_IO } // namespace diff --git a/test/vpxdec.sh b/test/vpxdec.sh index d73a447ee..2f23ca7c1 100755 --- a/test/vpxdec.sh +++ b/test/vpxdec.sh @@ -17,7 +17,8 @@ # Environment check: Make sure input is available. vpxdec_verify_environment() { if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \ - [ ! -e "${VP9_FPM_WEBM_FILE}" ] ; then + [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \ + [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." return 1 fi @@ -87,12 +88,24 @@ vpxdec_vp9_webm_frame_parallel() { --frame-parallel done fi +} +vpxdec_vp9_webm_less_than_50_frames() { + # ensure that reaching eof in webm_guess_framerate doesn't result in invalid + # frames in actual webm_read_frame calls. 
+ if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \ + [ "$(webm_io_available)" = "yes" ]; then + local readonly decoder="$(vpx_tool_path vpxdec)" + eval "${VPX_TEST_PREFIX}" "${decoder}" "${VP9_LT_50_FRAMES_WEBM_FILE}" \ + --summary --noblit 2>&1 \ + | awk '{ exit $1 != 10; }' + fi } vpxdec_tests="vpxdec_vp8_ivf vpxdec_vp8_ivf_pipe_input vpxdec_vp9_webm - vpxdec_vp9_webm_frame_parallel" + vpxdec_vp9_webm_frame_parallel + vpxdec_vp9_webm_less_than_50_frames" run_tests vpxdec_verify_environment "${vpxdec_tests}" diff --git a/vp8/common/debugmodes.c b/vp8/common/debugmodes.c index 46064e61d..159fddc6a 100644 --- a/vp8/common/debugmodes.c +++ b/vp8/common/debugmodes.c @@ -81,7 +81,6 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f fprintf(mvs, "\n"); /* print out the block modes */ - mb_index = 0; fprintf(mvs, "Mbs for Frame %d\n", frame); { int b_row; @@ -129,7 +128,6 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f /* print out the block modes */ - mb_index = 0; fprintf(mvs, "MVs for Frame %d\n", frame); { int b_row; diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c index 069332660..edd6ca09b 100644 --- a/vp8/common/mfqe.c +++ b/vp8/common/mfqe.c @@ -153,11 +153,11 @@ static void multiframe_quality_enhance_block actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8; act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8; #ifdef USE_SSD - sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse)); + vp8_variance16x16(y, y_stride, yd, yd_stride, &sse); sad = (sse + 128)>>8; - usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse)); + vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 32)>>6; - vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse)); + vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 32)>>6; #else sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8; @@ -170,11 +170,11 @@ static void multiframe_quality_enhance_block actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6; act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6; #ifdef USE_SSD - sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse)); + vp8_variance8x8(y, y_stride, yd, yd_stride, &sse); sad = (sse + 32)>>6; - usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse)); + vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse); usad = (sse + 8)>>4; - vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse)); + vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse); vsad = (sse + 8)>>4; #else sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6; diff --git a/vp8/common/ppc/copy_altivec.asm b/vp8/common/ppc/copy_altivec.asm deleted file mode 100644 index a4ce91583..000000000 --- a/vp8/common/ppc/copy_altivec.asm +++ /dev/null @@ -1,47 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl copy_mem16x16_ppc - -;# r3 unsigned char *src -;# r4 int src_stride -;# r5 unsigned char *dst -;# r6 int dst_stride - -;# Make the assumption that input will not be aligned, -;# but the output will be. So two reads and a perm -;# for the input, but only one store for the output. 
-copy_mem16x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xe000 - mtspr 256, r12 ;# set VRSAVE - - li r10, 16 - mtctr r10 - -cp_16x16_loop: - lvsl v0, 0, r3 ;# permutate value for alignment - - lvx v1, 0, r3 - lvx v2, r10, r3 - - vperm v1, v1, v2, v0 - - stvx v1, 0, r5 - - add r3, r3, r4 ;# increment source pointer - add r5, r5, r6 ;# increment destination pointer - - bdnz cp_16x16_loop - - mtspr 256, r11 ;# reset old VRSAVE - - blr diff --git a/vp8/common/ppc/filter_altivec.asm b/vp8/common/ppc/filter_altivec.asm deleted file mode 100644 index 4da2e94f9..000000000 --- a/vp8/common/ppc/filter_altivec.asm +++ /dev/null @@ -1,1013 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl sixtap_predict_ppc - .globl sixtap_predict8x4_ppc - .globl sixtap_predict8x8_ppc - .globl sixtap_predict16x16_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -.macro load_hfilter V0, V1 - load_c \V0, HFilter, r5, r9, r10 - - addi r5, r5, 16 - lvx \V1, r5, r10 -.endm - -;# Vertical filtering -.macro Vprolog - load_c v0, VFilter, r6, r3, r10 - - vspltish v5, 8 - vspltish v6, 3 - vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v1, v0, 1 - vspltb v2, v0, 2 - vspltb v3, v0, 3 - vspltb v4, v0, 4 - vspltb v5, v0, 5 - vspltb v0, v0, 0 -.endm - -.macro vpre_load - Vprolog - li r10, 16 - lvx v10, 0, r9 ;# v10..v14 = first 5 rows - lvx v11, r10, r9 - addi r9, r9, 32 - lvx v12, 0, r9 - lvx v13, r10, r9 - addi r9, r9, 32 - lvx v14, 0, r9 -.endm - -.macro Msum Re, Ro, V, T, TMP - ;# (Re,Ro) += (V*T) - vmuleub \TMP, \V, \T ;# trashes v8 - vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary - vmuloub \TMP, \V, \T - vadduhm \Ro, \Ro, \TMP ;# Ro = odds -.endm - -.macro vinterp_no_store P0 P1 P2 P3 P4 P5 - vmuleub v8, \P0, v0 ;# 64 + 4 positive taps - vadduhm v16, v6, v8 - vmuloub v8, \P0, v0 - vadduhm v17, v6, v8 - Msum v16, v17, \P2, v2, v8 - Msum v16, v17, \P3, v3, v8 - Msum v16, v17, \P5, v5, v8 - - vmuleub v18, \P1, v1 ;# 2 negative taps - vmuloub v19, \P1, v1 - Msum v18, v19, \P4, v4, v8 - - vsubuhs v16, v16, v18 ;# subtract neg from pos - vsubuhs v17, v17, v19 - vsrh v16, v16, v7 ;# divide by 128 - vsrh v17, v17, v7 ;# v16 v17 = evens, odds - vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order - vmrglh v19, v16, v17 - vpkuhus \P0, v18, v19 ;# P0 = 8-bit result -.endm - -.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5 - vmuleub v24, \P0, v13 ;# 64 + 4 positive taps - vadduhm v21, v20, v24 - vmuloub v24, \P0, v13 - vadduhm v22, v20, v24 - Msum v21, v22, \P2, v15, v25 - Msum v21, v22, \P3, v16, v25 - Msum v21, v22, \P5, v18, v25 - - vmuleub v23, \P1, v14 ;# 2 negative taps - vmuloub v24, \P1, v14 - Msum v23, v24, \P4, v17, v25 - - vsubuhs v21, v21, v23 ;# subtract neg from pos - vsubuhs v22, v22, v24 - vsrh v21, v21, v19 ;# divide by 128 - vsrh v22, v22, v19 ;# v16 v17 = evens, odds - vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order - vmrglh v24, v21, v22 - vpkuhus \P0, v23, v24 ;# P0 = 8-bit result -.endm - - -.macro Vinterp P0 P1 P2 P3 P4 P5 - vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5 - stvx \P0, 0, r7 - add r7, r7, r8 ;# 33 ops per 
16 pels -.endm - - -.macro luma_v P0, P1, P2, P3, P4, P5 - addi r9, r9, 16 ;# P5 = newest input row - lvx \P5, 0, r9 - Vinterp \P0, \P1, \P2, \P3, \P4, \P5 -.endm - -.macro luma_vtwo - luma_v v10, v11, v12, v13, v14, v15 - luma_v v11, v12, v13, v14, v15, v10 -.endm - -.macro luma_vfour - luma_vtwo - luma_v v12, v13, v14, v15, v10, v11 - luma_v v13, v14, v15, v10, v11, v12 -.endm - -.macro luma_vsix - luma_vfour - luma_v v14, v15, v10, v11, v12, v13 - luma_v v15, v10, v11, v12, v13, v14 -.endm - -.macro Interp4 R I I4 - vmsummbm \R, v13, \I, v15 - vmsummbm \R, v14, \I4, \R -.endm - -.macro Read8x8 VD, RS, RP, increment_counter - lvsl v21, 0, \RS ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. - lvx \VD, 0, \RS - lvx v20, r10, \RS - -.if \increment_counter - add \RS, \RS, \RP -.endif - - vperm \VD, \VD, v20, v21 -.endm - -.macro interp_8x8 R - vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456 - vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A - Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3 - vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx - Interp4 v21, v21, \R ;# v21 = result 4 5 6 7 - - vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7 - vsrh \R, \R, v19 - - vpkuhus \R, \R, \R ;# saturate and pack - -.endm - -.macro Read4x4 VD, RS, RP, increment_counter - lvsl v21, 0, \RS ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. - lvx v20, 0, \RS - -.if \increment_counter - add \RS, \RS, \RP -.endif - - vperm \VD, v20, v20, v21 -.endm - .text - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -sixtap_predict_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xff87 - ori r12, r12, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - slwi. r5, r5, 5 ;# index into horizontal filter array - - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq- vertical_only_4x4 - - ;# load up horizontal filter - load_hfilter v13, v14 - - ;# rounding added in on the multiply - vspltisw v16, 8 - vspltisw v15, 3 - vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 - - ;# Load up permutation constants - load_c v16, B_0123, 0, r9, r10 - load_c v17, B_4567, 0, r9, r10 - load_c v18, B_89AB, 0, r9, r10 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - addi r9, r3, 0 - li r10, 16 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - - slwi. r6, r6, 4 ;# index into vertical filter array - - ;# filter a line - interp_8x8 v2 - interp_8x8 v3 - interp_8x8 v4 - interp_8x8 v5 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional 5 lines that are needed - ;# for the vertical filter. 
- beq- store_4x4 - - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r9, r9, r4 - sub r9, r9, r4 - - Read8x8 v0, r9, r4, 1 - Read8x8 v1, r9, r4, 0 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 0 - - interp_8x8 v0 - interp_8x8 v1 - interp_8x8 v6 - interp_8x8 v7 - interp_8x8 v8 - - b second_pass_4x4 - -vertical_only_4x4: - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - li r10, 16 - - Read8x8 v0, r3, r4, 1 - Read8x8 v1, r3, r4, 1 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 0 - - slwi r6, r6, 4 ;# index into vertical filter array - -second_pass_4x4: - load_c v20, b_hilo_4x4, 0, r9, r10 - load_c v21, b_hilo, 0, r9, r10 - - ;# reposition input so that it can go through the - ;# filtering phase with one pass. - vperm v0, v0, v1, v20 ;# 0 1 x x - vperm v2, v2, v3, v20 ;# 2 3 x x - vperm v4, v4, v5, v20 ;# 4 5 x x - vperm v6, v6, v7, v20 ;# 6 7 x x - - vperm v0, v0, v2, v21 ;# 0 1 2 3 - vperm v4, v4, v6, v21 ;# 4 5 6 7 - - vsldoi v1, v0, v4, 4 - vsldoi v2, v0, v4, 8 - vsldoi v3, v0, v4, 12 - - vsldoi v5, v4, v8, 4 - - load_c v13, VFilter, r6, r9, r10 - - vspltish v15, 8 - vspltish v20, 3 - vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v14, v13, 1 - vspltb v15, v13, 2 - vspltb v16, v13, 3 - vspltb v17, v13, 4 - vspltb v18, v13, 5 - vspltb v13, v13, 0 - - vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 - - stvx v0, 0, r1 - - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - lwz r0, 4(r1) - stw r0, 0(r7) - add r7, r7, r8 - - lwz r0, 8(r1) - stw r0, 0(r7) - add r7, r7, r8 - - lwz r0, 12(r1) - stw r0, 0(r7) - - b exit_4x4 - -store_4x4: - - stvx v2, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v3, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v4, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v5, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - -exit_4x4: - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro w_8x8 V, D, R, P - stvx \V, 0, r1 - lwz \R, 0(r1) - stw \R, 0(r7) - lwz \R, 4(r1) - stw \R, 4(r7) - add \D, \D, \P -.endm - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch - -sixtap_predict8x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - slwi. r5, r5, 5 ;# index into horizontal filter array - - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq- second_pass_pre_copy_8x4 - - load_hfilter v13, v14 - - ;# rounding added in on the multiply - vspltisw v16, 8 - vspltisw v15, 3 - vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 - - ;# Load up permutation constants - load_c v16, B_0123, 0, r9, r10 - load_c v17, B_4567, 0, r9, r10 - load_c v18, B_89AB, 0, r9, r10 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - addi r9, r3, 0 - li r10, 16 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - - slwi. 
r6, r6, 4 ;# index into vertical filter array - - ;# filter a line - interp_8x8 v2 - interp_8x8 v3 - interp_8x8 v4 - interp_8x8 v5 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional 5 lines that are needed - ;# for the vertical filter. - beq- store_8x4 - - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r9, r9, r4 - sub r9, r9, r4 - - Read8x8 v0, r9, r4, 1 - Read8x8 v1, r9, r4, 0 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 0 - - interp_8x8 v0 - interp_8x8 v1 - interp_8x8 v6 - interp_8x8 v7 - interp_8x8 v8 - - b second_pass_8x4 - -second_pass_pre_copy_8x4: - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - li r10, 16 - - Read8x8 v0, r3, r4, 1 - Read8x8 v1, r3, r4, 1 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 1 - - slwi r6, r6, 4 ;# index into vertical filter array - -second_pass_8x4: - load_c v13, VFilter, r6, r9, r10 - - vspltish v15, 8 - vspltish v20, 3 - vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v14, v13, 1 - vspltb v15, v13, 2 - vspltb v16, v13, 3 - vspltb v17, v13, 4 - vspltb v18, v13, 5 - vspltb v13, v13, 0 - - vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 - vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 - vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 - vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x4 - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - - b exit_8x4 - -store_aligned_8x4: - - load_c v10, b_hilo, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - - b exit_8x4 - -store_8x4: - cmpi cr0, r8, 8 - beq cr0, store_aligned2_8x4 - - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - - b exit_8x4 - -store_aligned2_8x4: - load_c v10, b_hilo, 0, r9, r10 - - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - -exit_8x4: - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch - -;# Because the width that needs to be filtered will fit in a single altivec -;# register there is no need to loop. Everything can stay in registers. -sixtap_predict8x8_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - slwi. r5, r5, 5 ;# index into horizontal filter array - - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq- second_pass_pre_copy_8x8 - - load_hfilter v13, v14 - - ;# rounding added in on the multiply - vspltisw v16, 8 - vspltisw v15, 3 - vslw v15, v16, v15 ;# 0x00000040000000400000004000000040 - - ;# Load up permutation constants - load_c v16, B_0123, 0, r9, r10 - load_c v17, B_4567, 0, r9, r10 - load_c v18, B_89AB, 0, r9, r10 - - ;# Back off input buffer by 2 bytes. 
Need 2 before and 3 after - addi r3, r3, -2 - - addi r9, r3, 0 - li r10, 16 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 1 - Read8x8 v9, r3, r4, 1 - - slwi. r6, r6, 4 ;# index into vertical filter array - - ;# filter a line - interp_8x8 v2 - interp_8x8 v3 - interp_8x8 v4 - interp_8x8 v5 - interp_8x8 v6 - interp_8x8 v7 - interp_8x8 v8 - interp_8x8 v9 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional 5 lines that are needed - ;# for the vertical filter. - beq- store_8x8 - - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r9, r9, r4 - sub r9, r9, r4 - - Read8x8 v0, r9, r4, 1 - Read8x8 v1, r9, r4, 0 - Read8x8 v10, r3, r4, 1 - Read8x8 v11, r3, r4, 1 - Read8x8 v12, r3, r4, 0 - - interp_8x8 v0 - interp_8x8 v1 - interp_8x8 v10 - interp_8x8 v11 - interp_8x8 v12 - - b second_pass_8x8 - -second_pass_pre_copy_8x8: - ;# only needed if there is a vertical filter present - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - li r10, 16 - - Read8x8 v0, r3, r4, 1 - Read8x8 v1, r3, r4, 1 - Read8x8 v2, r3, r4, 1 - Read8x8 v3, r3, r4, 1 - Read8x8 v4, r3, r4, 1 - Read8x8 v5, r3, r4, 1 - Read8x8 v6, r3, r4, 1 - Read8x8 v7, r3, r4, 1 - Read8x8 v8, r3, r4, 1 - Read8x8 v9, r3, r4, 1 - Read8x8 v10, r3, r4, 1 - Read8x8 v11, r3, r4, 1 - Read8x8 v12, r3, r4, 0 - - slwi r6, r6, 4 ;# index into vertical filter array - -second_pass_8x8: - load_c v13, VFilter, r6, r9, r10 - - vspltish v15, 8 - vspltish v20, 3 - vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - vspltb v14, v13, 1 - vspltb v15, v13, 2 - vspltb v16, v13, 3 - vspltb v17, v13, 4 - vspltb v18, v13, 5 - vspltb v13, v13, 0 - - vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5 - vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6 - vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7 - vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8 - vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9 - vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10 - vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11 - vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12 - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x8 - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - w_8x8 v6, r7, r0, r8 - w_8x8 v7, r7, r0, r8 - - b exit_8x8 - -store_aligned_8x8: - - load_c v10, b_hilo, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - vperm v6, v6, v7, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - addi r7, r7, 16 - stvx v6, 0, r7 - - b exit_8x8 - -store_8x8: - cmpi cr0, r8, 8 - beq cr0, store_aligned2_8x8 - - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - w_8x8 v6, r7, r0, r8 - w_8x8 v7, r7, r0, r8 - w_8x8 v8, r7, r0, r8 - w_8x8 v9, r7, r0, r8 - - b exit_8x8 - -store_aligned2_8x8: - load_c v10, b_hilo, 0, r9, r10 - - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - vperm v6, v6, v7, v10 - vperm v8, v8, v9, v10 - - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - addi r7, r7, 16 - stvx v6, 0, r7 - addi r7, r7, 16 - stvx v8, 0, r7 - -exit_8x8: - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char 
* src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch - -;# Two pass filtering. First pass is Horizontal edges, second pass is vertical -;# edges. One of the filters can be null, but both won't be. Needs to use a -;# temporary buffer because the source buffer can't be modified and the buffer -;# for the destination is not large enough to hold the temporary data. -sixtap_predict16x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xf000 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-416(r1) ;# create space on the stack - - ;# Three possiblities - ;# 1. First filter is null. Don't use a temp buffer. - ;# 2. Second filter is null. Don't use a temp buffer. - ;# 3. Neither are null, use temp buffer. - - ;# First Pass (horizontal edge) - ;# setup pointers for src - ;# if possiblity (1) then setup the src pointer to be the orginal and jump - ;# to second pass. this is based on if x_offset is 0. - - ;# load up horizontal filter - slwi. r5, r5, 5 ;# index into horizontal filter array - - load_hfilter v4, v5 - - beq- copy_horizontal_16x21 - - ;# Back off input buffer by 2 bytes. Need 2 before and 3 after - addi r3, r3, -2 - - slwi. r6, r6, 4 ;# index into vertical filter array - - ;# setup constants - ;# v14 permutation value for alignment - load_c v14, b_hperm, 0, r9, r10 - - ;# These statements are guessing that there won't be a second pass, - ;# but if there is then inside the bypass they need to be set - li r0, 16 ;# prepare for no vertical filter - - ;# Change the output pointer and pitch to be the actual - ;# desination instead of a temporary buffer. - addi r9, r7, 0 - addi r5, r8, 0 - - ;# no vertical filter, so write the output from the first pass - ;# directly into the output buffer. - beq- no_vertical_filter_bypass - - ;# if the second filter is not null then need to back off by 2*pitch - sub r3, r3, r4 - sub r3, r3, r4 - - ;# setup counter for the number of lines that are going to be filtered - li r0, 21 - - ;# use the stack as temporary storage - la r9, 48(r1) - li r5, 16 - -no_vertical_filter_bypass: - - mtctr r0 - - ;# rounding added in on the multiply - vspltisw v10, 8 - vspltisw v12, 3 - vslw v12, v10, v12 ;# 0x00000040000000400000004000000040 - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v13, 7 - - ;# index to the next set of vectors in the row. - li r10, 16 - li r12, 32 - -horizontal_loop_16x16: - - lvsl v15, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. 
- lvx v1, 0, r3 - lvx v2, r10, r3 - lvx v3, r12, r3 - - vperm v8, v1, v2, v15 - vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified - - vsldoi v11, v8, v9, 4 - - ;# set 0 - vmsummbm v6, v4, v8, v12 ;# taps times elements - vmsummbm v0, v5, v11, v6 - - ;# set 1 - vsldoi v10, v8, v9, 1 - vsldoi v11, v8, v9, 5 - - vmsummbm v6, v4, v10, v12 - vmsummbm v1, v5, v11, v6 - - ;# set 2 - vsldoi v10, v8, v9, 2 - vsldoi v11, v8, v9, 6 - - vmsummbm v6, v4, v10, v12 - vmsummbm v2, v5, v11, v6 - - ;# set 3 - vsldoi v10, v8, v9, 3 - vsldoi v11, v8, v9, 7 - - vmsummbm v6, v4, v10, v12 - vmsummbm v3, v5, v11, v6 - - vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit) - vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F - - vsrh v0, v0, v13 ;# divide v0, v1 by 128 - vsrh v1, v1, v13 - - vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result - vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result - - stvx v0, 0, r9 - add r9, r9, r5 - - add r3, r3, r4 - - bdnz horizontal_loop_16x16 - - ;# check again to see if vertical filter needs to be done. - cmpi cr0, r6, 0 - beq cr0, end_16x16 - - ;# yes there is, so go to the second pass - b second_pass_16x16 - -copy_horizontal_16x21: - li r10, 21 - mtctr r10 - - li r10, 16 - - sub r3, r3, r4 - sub r3, r3, r4 - - ;# this is done above if there is a horizontal filter, - ;# if not it needs to be done down here. - slwi r6, r6, 4 ;# index into vertical filter array - - ;# always write to the stack when doing a horizontal copy - la r9, 48(r1) - -copy_horizontal_loop_16x21: - lvsl v15, 0, r3 ;# permutate value for alignment - - lvx v1, 0, r3 - lvx v2, r10, r3 - - vperm v8, v1, v2, v15 - - stvx v8, 0, r9 - addi r9, r9, 16 - - add r3, r3, r4 - - bdnz copy_horizontal_loop_16x21 - -second_pass_16x16: - - ;# always read from the stack when doing a vertical filter - la r9, 48(r1) - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v7, 7 - - vpre_load - - luma_vsix - luma_vsix - luma_vfour - -end_16x16: - - addi r1, r1, 416 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .data - - .align 4 -HFilter: - .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0 - .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12 - .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0 - .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36 - .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0 - .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50 - .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 - .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77 - .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0 - .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93 - .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0 - .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108 - .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0 - .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123 - .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0 - - .align 4 -VFilter: - .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - - 
.align 4 -b_hperm: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - - .align 4 -B_0123: - .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - - .align 4 -B_4567: - .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - - .align 4 -B_89AB: - .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 - - .align 4 -b_hilo: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 - - .align 4 -b_hilo_4x4: - .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0 diff --git a/vp8/common/ppc/filter_bilinear_altivec.asm b/vp8/common/ppc/filter_bilinear_altivec.asm deleted file mode 100644 index fd8aa665f..000000000 --- a/vp8/common/ppc/filter_bilinear_altivec.asm +++ /dev/null @@ -1,677 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl bilinear_predict4x4_ppc - .globl bilinear_predict8x4_ppc - .globl bilinear_predict8x8_ppc - .globl bilinear_predict16x16_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -.macro load_vfilter V0, V1 - load_c \V0, vfilter_b, r6, r9, r10 - - addi r6, r6, 16 - lvx \V1, r6, r10 -.endm - -.macro HProlog jump_label - ;# load up horizontal filter - slwi. r5, r5, 4 ;# index into horizontal filter array - - ;# index to the next set of vectors in the row. - li r10, 16 - li r12, 32 - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq \jump_label - - load_c v20, hfilter_b, r5, r9, r0 - - ;# setup constants - ;# v14 permutation value for alignment - load_c v28, b_hperm_b, 0, r9, r0 - - ;# rounding added in on the multiply - vspltisw v21, 8 - vspltisw v18, 3 - vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 - - slwi. r6, r6, 5 ;# index into vertical filter array -.endm - -;# Filters a horizontal line -;# expects: -;# r3 src_ptr -;# r4 pitch -;# r10 16 -;# r12 32 -;# v17 perm intput -;# v18 rounding -;# v19 shift -;# v20 filter taps -;# v21 tmp -;# v22 tmp -;# v23 tmp -;# v24 tmp -;# v25 tmp -;# v26 tmp -;# v27 tmp -;# v28 perm output -;# -.macro HFilter V - vperm v24, v21, v21, v10 ;# v20 = 0123 1234 2345 3456 - vperm v25, v21, v21, v11 ;# v21 = 4567 5678 6789 789A - - vmsummbm v24, v20, v24, v18 - vmsummbm v25, v20, v25, v18 - - vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) - - vsrh v24, v24, v19 ;# divide v0, v1 by 128 - - vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result -.endm - -.macro hfilter_8 V, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 9 bytes wide, output is 8 bytes. - lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - vperm v21, v21, v22, v17 - - HFilter \V -.endm - - -.macro load_and_align_8 V, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. 
- lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - - vperm \V, v21, v22, v17 -.endm - -.macro write_aligned_8 V, increment_counter - stvx \V, 0, r7 - -.if \increment_counter - add r7, r7, r8 -.endif -.endm - -.macro vfilter_16 P0 P1 - vmuleub v22, \P0, v20 ;# 64 + 4 positive taps - vadduhm v22, v18, v22 - vmuloub v23, \P0, v20 - vadduhm v23, v18, v23 - - vmuleub v24, \P1, v21 - vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary - vmuloub v25, \P1, v21 - vadduhm v23, v23, v25 ;# Ro = odds - - vsrh v22, v22, v19 ;# divide by 128 - vsrh v23, v23, v19 ;# v16 v17 = evens, odds - vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order - vmrglh v23, v22, v23 - vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result -.endm - - -.macro w_8x8 V, D, R, P - stvx \V, 0, r1 - lwz \R, 0(r1) - stw \R, 0(r7) - lwz \R, 4(r1) - stw \R, 4(r7) - add \D, \D, \P -.endm - - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict4x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf830 - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_4x4_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r9, r12 - load_c v11, b_4567_b, 0, r9, r12 - - hfilter_8 v0, 1 - hfilter_8 v1, 1 - hfilter_8 v2, 1 - hfilter_8 v3, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq store_out_4x4_b - - hfilter_8 v4, 0 - - b second_pass_4x4_b - -second_pass_4x4_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_8 v0, 1 - load_and_align_8 v1, 1 - load_and_align_8 v2, 1 - load_and_align_8 v3, 1 - load_and_align_8 v4, 1 - -second_pass_4x4_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - -store_out_4x4_b: - - stvx v0, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v1, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v2, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - add r7, r7, r8 - - stvx v3, 0, r1 - lwz r0, 0(r1) - stw r0, 0(r7) - -exit_4x4: - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict8x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf830 - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_8x4_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r9, r12 - load_c v11, b_4567_b, 0, r9, r12 - - hfilter_8 v0, 1 - hfilter_8 v1, 1 - hfilter_8 v2, 1 - hfilter_8 v3, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. 
- beq store_out_8x4_b - - hfilter_8 v4, 0 - - b second_pass_8x4_b - -second_pass_8x4_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_8 v0, 1 - load_and_align_8 v1, 1 - load_and_align_8 v2, 1 - load_and_align_8 v3, 1 - load_and_align_8 v4, 1 - -second_pass_8x4_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - -store_out_8x4_b: - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x4_b - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - - b exit_8x4 - -store_aligned_8x4_b: - load_c v10, b_hilo_b, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - -exit_8x4: - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict8x8_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xfff0 - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_8x8_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r9, r12 - load_c v11, b_4567_b, 0, r9, r12 - - hfilter_8 v0, 1 - hfilter_8 v1, 1 - hfilter_8 v2, 1 - hfilter_8 v3, 1 - hfilter_8 v4, 1 - hfilter_8 v5, 1 - hfilter_8 v6, 1 - hfilter_8 v7, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq store_out_8x8_b - - hfilter_8 v8, 0 - - b second_pass_8x8_b - -second_pass_8x8_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_8 v0, 1 - load_and_align_8 v1, 1 - load_and_align_8 v2, 1 - load_and_align_8 v3, 1 - load_and_align_8 v4, 1 - load_and_align_8 v5, 1 - load_and_align_8 v6, 1 - load_and_align_8 v7, 1 - load_and_align_8 v8, 0 - -second_pass_8x8_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - -store_out_8x8_b: - - cmpi cr0, r8, 8 - beq cr0, store_aligned_8x8_b - - w_8x8 v0, r7, r0, r8 - w_8x8 v1, r7, r0, r8 - w_8x8 v2, r7, r0, r8 - w_8x8 v3, r7, r0, r8 - w_8x8 v4, r7, r0, r8 - w_8x8 v5, r7, r0, r8 - w_8x8 v6, r7, r0, r8 - w_8x8 v7, r7, r0, r8 - - b exit_8x8 - -store_aligned_8x8_b: - load_c v10, b_hilo_b, 0, r9, r10 - - vperm v0, v0, v1, v10 - vperm v2, v2, v3, v10 - vperm v4, v4, v5, v10 - vperm v6, v6, v7, v10 - - stvx v0, 0, r7 - addi r7, r7, 16 - stvx v2, 0, r7 - addi r7, r7, 16 - stvx v4, 0, r7 - addi r7, r7, 16 - stvx v6, 0, r7 - -exit_8x8: - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# Filters a horizontal line -;# expects: -;# r3 src_ptr -;# r4 pitch -;# r10 16 -;# r12 32 -;# v17 perm intput -;# v18 rounding -;# v19 shift -;# v20 filter taps -;# v21 tmp -;# v22 tmp -;# v23 tmp -;# v24 tmp -;# v25 tmp -;# v26 tmp -;# v27 tmp -;# v28 perm output -;# -.macro hfilter_16 V, increment_counter - - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. 
- ;# input will can span three vectors if not aligned correctly. - lvx v21, 0, r3 - lvx v22, r10, r3 - lvx v23, r12, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - vperm v21, v21, v22, v17 - vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified - - ;# set 0 - vmsummbm v24, v20, v21, v18 ;# taps times elements - - ;# set 1 - vsldoi v23, v21, v22, 1 - vmsummbm v25, v20, v23, v18 - - ;# set 2 - vsldoi v23, v21, v22, 2 - vmsummbm v26, v20, v23, v18 - - ;# set 3 - vsldoi v23, v21, v22, 3 - vmsummbm v27, v20, v23, v18 - - vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) - vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F - - vsrh v24, v24, v19 ;# divide v0, v1 by 128 - vsrh v25, v25, v19 - - vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result - vperm \V, \V, v0, v28 ;# \V = correctly-ordered result -.endm - -.macro load_and_align_16 V, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input will can span three vectors if not aligned correctly. - lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - - vperm \V, v21, v22, v17 -.endm - -.macro write_16 V, increment_counter - stvx \V, 0, r7 - -.if \increment_counter - add r7, r7, r8 -.endif -.endm - - .align 2 -;# r3 unsigned char * src -;# r4 int src_pitch -;# r5 int x_offset -;# r6 int y_offset -;# r7 unsigned char * dst -;# r8 int dst_pitch -bilinear_predict16x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - HProlog second_pass_16x16_pre_copy_b - - hfilter_16 v0, 1 - hfilter_16 v1, 1 - hfilter_16 v2, 1 - hfilter_16 v3, 1 - hfilter_16 v4, 1 - hfilter_16 v5, 1 - hfilter_16 v6, 1 - hfilter_16 v7, 1 - hfilter_16 v8, 1 - hfilter_16 v9, 1 - hfilter_16 v10, 1 - hfilter_16 v11, 1 - hfilter_16 v12, 1 - hfilter_16 v13, 1 - hfilter_16 v14, 1 - hfilter_16 v15, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. 
- beq store_out_16x16_b - - hfilter_16 v16, 0 - - b second_pass_16x16_b - -second_pass_16x16_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_16 v0, 1 - load_and_align_16 v1, 1 - load_and_align_16 v2, 1 - load_and_align_16 v3, 1 - load_and_align_16 v4, 1 - load_and_align_16 v5, 1 - load_and_align_16 v6, 1 - load_and_align_16 v7, 1 - load_and_align_16 v8, 1 - load_and_align_16 v9, 1 - load_and_align_16 v10, 1 - load_and_align_16 v11, 1 - load_and_align_16 v12, 1 - load_and_align_16 v13, 1 - load_and_align_16 v14, 1 - load_and_align_16 v15, 1 - load_and_align_16 v16, 0 - -second_pass_16x16_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - vfilter_16 v8, v9 - vfilter_16 v9, v10 - vfilter_16 v10, v11 - vfilter_16 v11, v12 - vfilter_16 v12, v13 - vfilter_16 v13, v14 - vfilter_16 v14, v15 - vfilter_16 v15, v16 - -store_out_16x16_b: - - write_16 v0, 1 - write_16 v1, 1 - write_16 v2, 1 - write_16 v3, 1 - write_16 v4, 1 - write_16 v5, 1 - write_16 v6, 1 - write_16 v7, 1 - write_16 v8, 1 - write_16 v9, 1 - write_16 v10, 1 - write_16 v11, 1 - write_16 v12, 1 - write_16 v13, 1 - write_16 v14, 1 - write_16 v15, 0 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .data - - .align 4 -hfilter_b: - .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 - .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 - .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 - .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 - .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 - .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 - .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 - .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 - - .align 4 -vfilter_b: - .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 - .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 - .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 - .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 - .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 - .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 - .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 - .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 - .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 - .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 - .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 - - .align 4 -b_hperm_b: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - - .align 4 -b_0123_b: - .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - - .align 4 -b_4567_b: - .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - -b_hilo_b: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/common/ppc/idctllm_altivec.asm b/vp8/common/ppc/idctllm_altivec.asm deleted file mode 
100644 index 117d9cfc8..000000000 --- a/vp8/common/ppc/idctllm_altivec.asm +++ /dev/null @@ -1,189 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl short_idct4x4llm_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -;# r3 short *input -;# r4 short *output -;# r5 int pitch - .align 2 -short_idct4x4llm_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - load_c v8, sinpi8sqrt2, 0, r9, r10 - load_c v9, cospi8sqrt2minus1, 0, r9, r10 - load_c v10, hi_hi, 0, r9, r10 - load_c v11, lo_lo, 0, r9, r10 - load_c v12, shift_16, 0, r9, r10 - - li r10, 16 - lvx v0, 0, r3 ;# input ip[0], ip[ 4] - lvx v1, r10, r3 ;# input ip[8], ip[12] - - ;# first pass - vupkhsh v2, v0 - vupkhsh v3, v1 - vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8] - vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8] - - vupklsh v0, v0 - vmulosh v4, v0, v8 - vsraw v4, v4, v12 - vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2) - - vupklsh v1, v1 - vmulosh v5, v1, v9 - vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v1 - - vsubsws v4, v4, v5 ;# c1 - - vmulosh v3, v1, v8 - vsraw v3, v3, v12 - vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2) - - vmulosh v5, v0, v9 - vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v0 - - vaddsws v3, v3, v5 ;# d1 - - vaddsws v0, v6, v3 ;# a1 + d1 - vsubsws v3, v6, v3 ;# a1 - d1 - - vaddsws v1, v7, v4 ;# b1 + c1 - vsubsws v2, v7, v4 ;# b1 - c1 - - ;# transpose input - vmrghw v4, v0, v1 ;# a0 b0 a1 b1 - vmrghw v5, v2, v3 ;# c0 d0 c1 d1 - - vmrglw v6, v0, v1 ;# a2 b2 a3 b3 - vmrglw v7, v2, v3 ;# c2 d2 c3 d3 - - vperm v0, v4, v5, v10 ;# a0 b0 c0 d0 - vperm v1, v4, v5, v11 ;# a1 b1 c1 d1 - - vperm v2, v6, v7, v10 ;# a2 b2 c2 d2 - vperm v3, v6, v7, v11 ;# a3 b3 c3 d3 - - ;# second pass - vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8] - vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8] - - vmulosh v4, v1, v8 - vsraw v4, v4, v12 - vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2) - - vmulosh v5, v3, v9 - vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v3 - - vsubsws v4, v4, v5 ;# c1 - - vmulosh v2, v3, v8 - vsraw v2, v2, v12 - vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2) - - vmulosh v5, v1, v9 - vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2) - vaddsws v5, v5, v1 - - vaddsws v3, v2, v5 ;# d1 - - vaddsws v0, v6, v3 ;# a1 + d1 - vsubsws v3, v6, v3 ;# a1 - d1 - - vaddsws v1, v7, v4 ;# b1 + c1 - vsubsws v2, v7, v4 ;# b1 - c1 - - vspltish v6, 4 - vspltish v7, 3 - - vpkswss v0, v0, v1 - vpkswss v1, v2, v3 - - vaddshs v0, v0, v6 - vaddshs v1, v1, v6 - - vsrah v0, v0, v7 - vsrah v1, v1, v7 - - ;# transpose output - vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3 - vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3 - - vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1 - vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3 - - stwu r1,-416(r1) ;# create space on the stack - - stvx v0, 0, r1 - lwz r6, 0(r1) - stw r6, 0(r4) - lwz r6, 4(r1) - stw r6, 4(r4) - - add r4, r4, r5 - - lwz r6, 8(r1) - stw r6, 0(r4) - lwz r6, 12(r1) - stw r6, 4(r4) - - add r4, r4, r5 - - stvx v1, 0, r1 - lwz r6, 0(r1) - stw r6, 0(r4) - lwz r6, 4(r1) - stw r6, 
4(r4) - - add r4, r4, r5 - - lwz r6, 8(r1) - stw r6, 0(r4) - lwz r6, 12(r1) - stw r6, 4(r4) - - addi r1, r1, 416 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 4 -sinpi8sqrt2: - .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468 - - .align 4 -cospi8sqrt2minus1: - .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091 - - .align 4 -shift_16: - .long 16, 16, 16, 16 - - .align 4 -hi_hi: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 - - .align 4 -lo_lo: - .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c deleted file mode 100644 index 71bf6e2d7..000000000 --- a/vp8/common/ppc/loopfilter_altivec.c +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "loopfilter.h" -#include "onyxc_int.h" - -typedef void loop_filter_function_y_ppc -( - unsigned char *s, // source pointer - int p, // pitch - const signed char *flimit, - const signed char *limit, - const signed char *thresh -); - -typedef void loop_filter_function_uv_ppc -( - unsigned char *u, // source pointer - unsigned char *v, // source pointer - int p, // pitch - const signed char *flimit, - const signed char *limit, - const signed char *thresh -); - -typedef void loop_filter_function_s_ppc -( - unsigned char *s, // source pointer - int p, // pitch - const signed char *flimit -); - -loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc; -loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc; -loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc; -loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc; - -loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc; -loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc; -loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc; -loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc; - -loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc; -loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc; - -// Horizontal MB filtering -void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); - - if (u_ptr) - mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); -} - -void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim); -} - -// Vertical MB Filtering -void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); - - if (u_ptr) - mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr); -} - -void loop_filter_mbvs_ppc(unsigned char 
*y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim); -} - -// Horizontal B Filtering -void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - // These should all be done at once with one call, instead of 3 - loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); - loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); - loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); - - if (u_ptr) - loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr); -} - -void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim); - loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim); - loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim); -} - -// Vertical B Filtering -void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr); - - if (u_ptr) - loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr); -} - -void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi) -{ - (void)u_ptr; - (void)v_ptr; - (void)uv_stride; - loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim); - loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim); - loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim); -} diff --git a/vp8/common/ppc/loopfilter_filters_altivec.asm b/vp8/common/ppc/loopfilter_filters_altivec.asm deleted file mode 100644 index 61df4e976..000000000 --- a/vp8/common/ppc/loopfilter_filters_altivec.asm +++ /dev/null @@ -1,1253 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl mbloop_filter_horizontal_edge_y_ppc - .globl loop_filter_horizontal_edge_y_ppc - .globl mbloop_filter_vertical_edge_y_ppc - .globl loop_filter_vertical_edge_y_ppc - - .globl mbloop_filter_horizontal_edge_uv_ppc - .globl loop_filter_horizontal_edge_uv_ppc - .globl mbloop_filter_vertical_edge_uv_ppc - .globl loop_filter_vertical_edge_uv_ppc - - .globl loop_filter_simple_horizontal_edge_ppc - .globl loop_filter_simple_vertical_edge_ppc - - .text -;# We often need to perform transposes (and other transpose-like operations) -;# on matrices of data. This is simplified by the fact that we usually -;# operate on hunks of data whose dimensions are powers of 2, or at least -;# divisible by highish powers of 2. 
-;# -;# These operations can be very confusing. They become more straightforward -;# when we think of them as permutations of address bits: Concatenate a -;# group of vector registers and think of it as occupying a block of -;# memory beginning at address zero. The low four bits 0...3 of the -;# address then correspond to position within a register, the higher-order -;# address bits select the register. -;# -;# Although register selection, at the code level, is arbitrary, things -;# are simpler if we use contiguous ranges of register numbers, simpler -;# still if the low-order bits of the register number correspond to -;# conceptual address bits. We do this whenever reasonable. -;# -;# A 16x16 transpose can then be thought of as an operation on -;# a 256-element block of memory. It takes 8 bits 0...7 to address this -;# memory and the effect of a transpose is to interchange address bit -;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the -;# column, which is interchanged with the row addressed by bits 4..7. -;# -;# The altivec merge instructions provide a rapid means of effecting -;# many of these transforms. They operate at three widths (8,16,32). -;# Writing V(x) for vector register #x, paired merges permute address -;# indices as follows. -;# -;# 0->1 1->2 2->3 3->(4+d) (4+s)->0: -;# -;# vmrghb V( x), V( y), V( y + (1<<s)) -;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s)) -;# -;# -;# =0= 1->2 2->3 3->(4+d) (4+s)->1: -;# -;# vmrghh V( x), V( y), V( y + (1<<s)) -;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s)) -;# -;# -;# =0= =1= 2->3 3->(4+d) (4+s)->2: -;# -;# vmrghw V( x), V( y), V( y + (1<<s)) -;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s)) -;# -;# -;# Unfortunately, there is no doubleword merge instruction. -;# The following sequence uses "vperm" as a substitute. -;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c) -;# are in registers Vhihi and Vlolo, we can also effect the permutation -;# -;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence: -;# -;# vperm V( x), V( y), V( y + (1<<s)), Vhihi -;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo -;# -;# -;# Except for bits s and d, the other relationships between register -;# number (= high-order part of address) bits are at the disposal of -;# the programmer. -;# - -;# To avoid excess transposes, we filter all 3 vertical luma subblock -;# edges together. This requires a single 16x16 transpose, which, in -;# the above language, amounts to the following permutation of address -;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by -;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0. -;# -;# Except for the fact that the destination registers get written -;# before we are done referencing the old contents, the cyclic transform -;# is effected by -;# -;# x = 0; do { -;# vmrghb V(2x), V(x), V(x+8); -;# vmrglb V(2x+1), V(x), V(x+8); -;# } while( ++x < 8); -;# -;# For clarity, and because we can afford it, we do this transpose -;# using all 32 registers, alternating the banks 0..15 and 16 .. 31, -;# leaving the final result in 16 .. 31, as the lower registers are -;# used in the filtering itself.
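The address-bit claim above is easy to sanity-check with a small scalar model. The C sketch below is an illustration only (hypothetical names; merge_round mimics one pass of Tpair pairs (x, x+8) -> (2x, 2x+1) within a single register bank): four rounds of the byte merge rotate the eight address bits by four places, i.e. produce the full 16x16 transpose.

#include <assert.h>
#include <stdint.h>

/* One merge round: dst register 2x gets vmrghb(src x, src x+8),
   dst register 2x+1 gets vmrglb(src x, src x+8). */
static void merge_round(uint8_t dst[16][16], const uint8_t src[16][16])
{
    for (int x = 0; x < 8; x++) {
        for (int j = 0; j < 16; j++) {
            dst[2 * x][j]     = src[x + 8 * (j & 1)][j >> 1];       /* vmrghb */
            dst[2 * x + 1][j] = src[x + 8 * (j & 1)][8 + (j >> 1)]; /* vmrglb */
        }
    }
}

int main(void)
{
    uint8_t a[16][16], b[16][16];
    for (int r = 0; r < 16; r++)
        for (int c = 0; c < 16; c++)
            a[r][c] = (uint8_t)(16 * r + c);

    merge_round(b, a); /* each round reads src[ror8(addr, 1)] */
    merge_round(a, b);
    merge_round(b, a);
    merge_round(a, b); /* four rounds: bits 0..3 swap with bits 4..7 */

    for (int r = 0; r < 16; r++)
        for (int c = 0; c < 16; c++)
            assert(a[r][c] == (uint8_t)(16 * c + r));
    return 0;
}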
-;# -.macro Tpair A, B, X, Y - vmrghb \A, \X, \Y - vmrglb \B, \X, \Y -.endm - -;# Each step takes 8*2 = 16 instructions - -.macro t16_even - Tpair v16,v17, v0,v8 - Tpair v18,v19, v1,v9 - Tpair v20,v21, v2,v10 - Tpair v22,v23, v3,v11 - Tpair v24,v25, v4,v12 - Tpair v26,v27, v5,v13 - Tpair v28,v29, v6,v14 - Tpair v30,v31, v7,v15 -.endm - -.macro t16_odd - Tpair v0,v1, v16,v24 - Tpair v2,v3, v17,v25 - Tpair v4,v5, v18,v26 - Tpair v6,v7, v19,v27 - Tpair v8,v9, v20,v28 - Tpair v10,v11, v21,v29 - Tpair v12,v13, v22,v30 - Tpair v14,v15, v23,v31 -.endm - -;# Whole transpose takes 4*16 = 64 instructions - -.macro t16_full - t16_odd - t16_even - t16_odd - t16_even -.endm - -;# Vertical edge filtering requires transposes. For the simple filter, -;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels -;# each. Writing 0 ... 63 for the pixel indices, the desired result is: -;# -;# v0 = 0 1 ... 14 15 -;# v1 = 16 17 ... 30 31 -;# v2 = 32 33 ... 46 47 -;# v3 = 48 49 ... 62 63 -;# -;# In frame-buffer memory, the layout is: -;# -;# 0 16 32 48 -;# 1 17 33 49 -;# ... -;# 15 31 47 63. -;# -;# We begin by reading the data 32 bits at a time (using scalar operations) -;# into a temporary array, reading the rows of the array into vector registers, -;# with the following layout: -;# -;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60 -;# v1 = 1 17 33 49 5 21 ... 45 61 -;# v2 = 2 18 ... 46 62 -;# v3 = 3 19 ... 47 63 -;# -;# From the "address-bit" perspective discussed above, we simply need to -;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone. -;# In other words, we transpose each of the four 4x4 submatrices. -;# -;# This transformation is its own inverse, and we need to perform it -;# again before writing the pixels back into the frame buffer. -;# -;# It acts in place on registers v0...v3, uses v4...v7 as temporaries, -;# and takes the b_hihi/b_lolo selectors (defined above) as its -;# Vlo/Vhi arguments. We think of both groups of 4 registers as having -;# "addresses" {0,1,2,3} * 16. -;# -.macro Transpose4times4x4 Vlo, Vhi - - ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5= - - vmrghb v4, v0, v1 - vmrglb v5, v0, v1 - vmrghb v6, v2, v3 - vmrglb v7, v2, v3 - - ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1 - - vmrghh v0, v4, v6 - vmrglh v1, v4, v6 - vmrghh v2, v5, v7 - vmrglh v3, v5, v7 - - ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5= - - vmrghw v4, v0, v1 - vmrglw v5, v0, v1 - vmrghw v6, v2, v3 - vmrglw v7, v2, v3 - - ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3 - - vperm v0, v4, v6, \Vlo - vperm v1, v4, v6, \Vhi - vperm v2, v5, v7, \Vlo - vperm v3, v5, v7, \Vhi -.endm -;# end Transpose4times4x4 - - -;# Normal mb vertical edge filter transpose. -;# -;# We read 8 columns of data, initially in the following pattern: -;# -;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1) -;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3) -;# ... -;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15) -;# -;# and wish to convert to: -;# -;# (0,0) ... (0,15) -;# (1,0) ... (1,15) -;# ... -;# (7,0) ... (7,15). -;# -;# In "address bit" language, we wish to map -;# -;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7. -;# -;# This can be accomplished by 4 iterations of the cyclic transform -;# -;# I -> (I+1) mod 7; -;# -;# each iteration can be realized by (d=0, s=2): -;# -;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4); -;# -;# The input/output is in registers v0...v7. We use v10...v17 as mirrors; -;# preserving v8 = sign converter.
-;# -;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the -;# result lands in the "mirror" registers v10...v17 -;# -.macro t8x16_odd - Tpair v10, v11, v0, v4 - Tpair v12, v13, v1, v5 - Tpair v14, v15, v2, v6 - Tpair v16, v17, v3, v7 -.endm - -.macro t8x16_even - Tpair v0, v1, v10, v14 - Tpair v2, v3, v11, v15 - Tpair v4, v5, v12, v16 - Tpair v6, v7, v13, v17 -.endm - -.macro transpose8x16_fwd - t8x16_odd - t8x16_even - t8x16_odd - t8x16_even -.endm - -.macro transpose8x16_inv - t8x16_odd - t8x16_even - t8x16_odd -.endm - -.macro Transpose16x16 - vmrghb v0, v16, v24 - vmrglb v1, v16, v24 - vmrghb v2, v17, v25 - vmrglb v3, v17, v25 - vmrghb v4, v18, v26 - vmrglb v5, v18, v26 - vmrghb v6, v19, v27 - vmrglb v7, v19, v27 - vmrghb v8, v20, v28 - vmrglb v9, v20, v28 - vmrghb v10, v21, v29 - vmrglb v11, v21, v29 - vmrghb v12, v22, v30 - vmrglb v13, v22, v30 - vmrghb v14, v23, v31 - vmrglb v15, v23, v31 - vmrghb v16, v0, v8 - vmrglb v17, v0, v8 - vmrghb v18, v1, v9 - vmrglb v19, v1, v9 - vmrghb v20, v2, v10 - vmrglb v21, v2, v10 - vmrghb v22, v3, v11 - vmrglb v23, v3, v11 - vmrghb v24, v4, v12 - vmrglb v25, v4, v12 - vmrghb v26, v5, v13 - vmrglb v27, v5, v13 - vmrghb v28, v6, v14 - vmrglb v29, v6, v14 - vmrghb v30, v7, v15 - vmrglb v31, v7, v15 - vmrghb v0, v16, v24 - vmrglb v1, v16, v24 - vmrghb v2, v17, v25 - vmrglb v3, v17, v25 - vmrghb v4, v18, v26 - vmrglb v5, v18, v26 - vmrghb v6, v19, v27 - vmrglb v7, v19, v27 - vmrghb v8, v20, v28 - vmrglb v9, v20, v28 - vmrghb v10, v21, v29 - vmrglb v11, v21, v29 - vmrghb v12, v22, v30 - vmrglb v13, v22, v30 - vmrghb v14, v23, v31 - vmrglb v15, v23, v31 - vmrghb v16, v0, v8 - vmrglb v17, v0, v8 - vmrghb v18, v1, v9 - vmrglb v19, v1, v9 - vmrghb v20, v2, v10 - vmrglb v21, v2, v10 - vmrghb v22, v3, v11 - vmrglb v23, v3, v11 - vmrghb v24, v4, v12 - vmrglb v25, v4, v12 - vmrghb v26, v5, v13 - vmrglb v27, v5, v13 - vmrghb v28, v6, v14 - vmrglb v29, v6, v14 - vmrghb v30, v7, v15 - vmrglb v31, v7, v15 -.endm - -;# load_g loads a global vector (whose address is in the local variable Gptr) -;# into vector register Vreg. Trashes r0 -.macro load_g Vreg, Gptr - lwz r0, \Gptr - lvx \Vreg, 0, r0 -.endm - -;# exploit the saturation here. if the answer is negative -;# it will be clamped to 0. orring 0 with a positive -;# number will be the positive number (abs) -;# RES = abs( A-B), trashes TMP -.macro Abs RES, TMP, A, B - vsububs \RES, \A, \B - vsububs \TMP, \B, \A - vor \RES, \RES, \TMP -.endm - -;# RES = Max( RES, abs( A-B)), trashes TMP -.macro max_abs RES, TMP, A, B - vsububs \TMP, \A, \B - vmaxub \RES, \RES, \TMP - vsububs \TMP, \B, \A - vmaxub \RES, \RES, \TMP -.endm - -.macro Masks - ;# build masks - ;# input is all 8 bit unsigned (0-255). need to - ;# do abs(vala-valb) > limit. but no need to compare each - ;# value to the limit. find the max of the absolute differences - ;# and compare that to the limit. 
- ;# First hev - Abs v14, v13, v2, v3 ;# |P1 - P0| - max_abs v14, v13, v5, v4 ;# |Q1 - Q0| - - vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded - - ;# Next limit - max_abs v14, v13, v0, v1 ;# |P3 - P2| - max_abs v14, v13, v1, v2 ;# |P2 - P1| - max_abs v14, v13, v6, v5 ;# |Q2 - Q1| - max_abs v14, v13, v7, v6 ;# |Q3 - Q2| - - vcmpgtub v9, v14, v9 ;# R = true if limit exceeded - - ;# flimit - Abs v14, v13, v3, v4 ;# |P0 - Q0| - - vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded - - vor v8, v8, v9 ;# R = true if flimit or limit exceeded - ;# done building masks -.endm - -.macro build_constants RFL, RLI, RTH, FL, LI, TH - ;# build constants - lvx \FL, 0, \RFL ;# flimit - lvx \LI, 0, \RLI ;# limit - lvx \TH, 0, \RTH ;# thresh - - vspltisb v11, 8 - vspltisb v12, 4 - vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 -.endm - -.macro load_data_y - ;# setup strides/pointers to be able to access - ;# all of the data - add r5, r4, r4 ;# r5 = 2 * stride - sub r6, r3, r5 ;# r6 -> 2 rows back - neg r7, r4 ;# r7 = -stride - - ;# load 16 pixels worth of data to work on - sub r0, r6, r5 ;# r0 -> 4 rows back (temp) - lvx v0, 0, r0 ;# P3 (read only) - lvx v1, r7, r6 ;# P2 - lvx v2, 0, r6 ;# P1 - lvx v3, r7, r3 ;# P0 - lvx v4, 0, r3 ;# Q0 - lvx v5, r4, r3 ;# Q1 - lvx v6, r5, r3 ;# Q2 - add r0, r3, r5 ;# r0 -> 2 rows fwd (temp) - lvx v7, r4, r0 ;# Q3 (read only) -.endm - -;# Expects -;# v10 == HEV -;# v13 == tmp -;# v14 == tmp -.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT - vxor \P1, \P1, v11 ;# SP1 - vxor \P0, \P0, v11 ;# SP0 - vxor \Q0, \Q0, v11 ;# SQ0 - vxor \Q1, \Q1, v11 ;# SQ1 - - vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1) -.if \HEV_PRESENT - vand v13, v13, v10 ;# f &= hev -.endif - vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126 - vaddsbs v13, v13, v14 - vaddsbs v13, v13, v14 - vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0)) - - vandc v13, v13, v8 ;# f &= mask - - vspltisb v8, 3 - vspltisb v9, 4 - - vaddsbs v14, v13, v9 ;# f1 = c (f+4) - vaddsbs v15, v13, v8 ;# f2 = c (f+3) - - vsrab v13, v14, v8 ;# f1 >>= 3 - vsrab v15, v15, v8 ;# f2 >>= 3 - - vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1) - vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2) -.endm - -.macro vp8_mbfilter - Masks - - ;# start the filtering here - vxor v1, v1, v11 ;# SP2 - vxor v2, v2, v11 ;# SP1 - vxor v3, v3, v11 ;# SP0 - vxor v4, v4, v11 ;# SQ0 - vxor v5, v5, v11 ;# SQ1 - vxor v6, v6, v11 ;# SQ2 - - ;# add outer taps if we have high edge variance - vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1) - - vsubsbs v14, v4, v3 ;# SQ0-SP0 - vaddsbs v13, v13, v14 - vaddsbs v13, v13, v14 - vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0)) - - vandc v13, v13, v8 ;# f &= mask - vand v15, v13, v10 ;# f2 = f & hev - - ;# save bottom 3 bits so that we round one side +4 and the other +3 - vspltisb v8, 3 - vspltisb v9, 4 - - vaddsbs v14, v15, v9 ;# f1 = c (f+4) - vaddsbs v15, v15, v8 ;# f2 = c (f+3) - - vsrab v14, v14, v8 ;# f1 >>= 3 - vsrab v15, v15, v8 ;# f2 >>= 3 - - vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1) - vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2) - - ;# only apply wider filter if not high edge variance - vandc v13, v13, v10 ;# f &= ~hev - - vspltisb v9, 2 - vnor v8, v8, v8 - vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f - vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f - vspltisb v8, 9 - - ;# roughly 1/7th difference across boundary - vspltish v10, 7 - vmulosb v14, v8, v13 ;# f * 9 - vmulesb v15, v8, v13 - vaddshs v14, v14, v9 ;# += 63 - vaddshs v15, v15, v9 - vsrah v14, v14, v10 ;# >>= 7 - vsrah
v15, v15, v10 - vmrglh v10, v15, v14 - vmrghh v15, v15, v14 - - vpkshss v10, v15, v10 ;# X = saturated down to bytes - - vsubsbs v6, v6, v10 ;# subtract from Q and add to P - vaddsbs v1, v1, v10 - - vxor v6, v6, v11 - vxor v1, v1, v11 - - ;# roughly 2/7th difference across boundary - vspltish v10, 7 - vaddubm v12, v8, v8 - vmulosb v14, v12, v13 ;# f * 18 - vmulesb v15, v12, v13 - vaddshs v14, v14, v9 - vaddshs v15, v15, v9 - vsrah v14, v14, v10 ;# >>= 7 - vsrah v15, v15, v10 - vmrglh v10, v15, v14 - vmrghh v15, v15, v14 - - vpkshss v10, v15, v10 ;# X = saturated down to bytes - - vsubsbs v5, v5, v10 ;# subtract from Q and add to P - vaddsbs v2, v2, v10 - - vxor v5, v5, v11 - vxor v2, v2, v11 - - ;# roughly 3/7th difference across boundary - vspltish v10, 7 - vaddubm v12, v12, v8 - vmulosb v14, v12, v13 ;# f * 27 - vmulesb v15, v12, v13 - vaddshs v14, v14, v9 - vaddshs v15, v15, v9 - vsrah v14, v14, v10 ;# >>= 7 - vsrah v15, v15, v10 - vmrglh v10, v15, v14 - vmrghh v15, v15, v14 - - vpkshss v10, v15, v10 ;# X = saturated down to bytes - - vsubsbs v4, v4, v10 ;# subtract from Q and add to P - vaddsbs v3, v3, v10 - - vxor v4, v4, v11 - vxor v3, v3, v11 -.endm - -.macro SBFilter - Masks - - common_adjust v3, v4, v2, v5, 1 - - ;# outer tap adjustments - vspltisb v8, 1 - - vaddubm v13, v13, v8 ;# f += 1 - vsrab v13, v13, v8 ;# f >>= 1 - - vandc v13, v13, v10 ;# f &= ~hev - - vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f) - vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f) - - vxor v2, v2, v11 - vxor v3, v3, v11 - vxor v4, v4, v11 - vxor v5, v5, v11 -.endm - - .align 2 -mbloop_filter_horizontal_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r5, r6, r7, v8, v9, v10 - - load_data_y - - vp8_mbfilter - - stvx v1, r7, r6 ;# P2 - stvx v2, 0, r6 ;# P1 - stvx v3, r7, r3 ;# P0 - stvx v4, 0, r3 ;# Q0 - stvx v5, r4, r3 ;# Q1 - stvx v6, r5, r3 ;# Q2 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -;# r6 const signed char *limit -;# r7 const signed char *thresh -loop_filter_horizontal_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r5, r6, r7, v8, v9, v10 - - load_data_y - - SBFilter - - stvx v2, 0, r6 ;# P1 - stvx v3, r7, r3 ;# P0 - stvx v4, 0, r3 ;# Q0 - stvx v5, r4, r3 ;# Q1 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary. -;# So we can read in an entire mb aligned. However if we want to filter the mb -;# edge we run into problems. For the loopfilter we require 4 bytes before the mb -;# and 4 after for a total of 8 bytes. Reading 16 bytes in order to get 4 is a bit -;# of a waste. So this is an even uglier way to get around that. -;# Using the regular register file, words are read in and then saved back out to -;# memory to align and order them up. Then they are read in using the -;# vector register file.
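In scalar C the same staging trick looks like the sketch below (a hedged model with hypothetical names; memcpy stands in for the lwz/stw word copies made through r0, and an aligned lvx of buf then picks up both rows at once):

#include <stdint.h>
#include <string.h>

/* Gather the 4 pels before and 4 pels after a vertical edge, for two
   consecutive rows, into one aligned 16-byte buffer, as RLVmb does. */
static void read_edge_rows(const uint8_t *edge, int pitch, uint8_t buf[16])
{
    memcpy(buf +  0, edge - 4,         4);
    memcpy(buf +  4, edge,             4);
    memcpy(buf +  8, edge + pitch - 4, 4);
    memcpy(buf + 12, edge + pitch,     4);
}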
-.macro RLVmb V, R - lwzux r0, r3, r4 - stw r0, 4(\R) - lwz r0,-4(r3) - stw r0, 0(\R) - lwzux r0, r3, r4 - stw r0,12(\R) - lwz r0,-4(r3) - stw r0, 8(\R) - lvx \V, 0, \R -.endm - -.macro WLVmb V, R - stvx \V, 0, \R - lwz r0,12(\R) - stwux r0, r3, r4 - lwz r0, 8(\R) - stw r0,-4(r3) - lwz r0, 4(\R) - stwux r0, r3, r4 - lwz r0, 0(\R) - stw r0,-4(r3) -.endm - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -;# r6 const signed char *limit -;# r7 const signed char *thresh -mbloop_filter_vertical_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - la r9, -48(r1) ;# temporary space for reading in vectors - sub r3, r3, r4 - - RLVmb v0, r9 - RLVmb v1, r9 - RLVmb v2, r9 - RLVmb v3, r9 - RLVmb v4, r9 - RLVmb v5, r9 - RLVmb v6, r9 - RLVmb v7, r9 - - transpose8x16_fwd - - build_constants r5, r6, r7, v8, v9, v10 - - vp8_mbfilter - - transpose8x16_inv - - add r3, r3, r4 - neg r4, r4 - - WLVmb v17, r9 - WLVmb v16, r9 - WLVmb v15, r9 - WLVmb v14, r9 - WLVmb v13, r9 - WLVmb v12, r9 - WLVmb v11, r9 - WLVmb v10, r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro RL V, R, P - lvx \V, 0, \R - add \R, \R, \P -.endm - -.macro WL V, R, P - stvx \V, 0, \R - add \R, \R, \P -.endm - -.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3 - ;# K = |P0-P1| already - Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1| - vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|) - vcmpgtub v10, v14, v0 - - Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1| - - max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|) - max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|) - max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|) - - vmaxub v14, v14, v4 ;# M = max interior abs diff - vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded - - Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0) - vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded - vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded - - ;# replace P1,Q1 w/signed versions - common_adjust \P0, \Q0, \P1, \Q1, 1 - - vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant - vsrab v13, v13, v1 - vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if !
hev - vsubsbs \Q1, \Q1, v13 - vaddsbs \P1, \P1, v13 - - vxor \P1, \P1, v11 ;# P1 - vxor \P0, \P0, v11 ;# P0 - vxor \Q0, \Q0, v11 ;# Q0 - vxor \Q1, \Q1, v11 ;# Q1 -.endm - - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -;# r6 const signed char *limit -;# r7 const signed char *thresh -loop_filter_vertical_edge_y_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - addi r9, r3, 0 - RL v16, r9, r4 - RL v17, r9, r4 - RL v18, r9, r4 - RL v19, r9, r4 - RL v20, r9, r4 - RL v21, r9, r4 - RL v22, r9, r4 - RL v23, r9, r4 - RL v24, r9, r4 - RL v25, r9, r4 - RL v26, r9, r4 - RL v27, r9, r4 - RL v28, r9, r4 - RL v29, r9, r4 - RL v30, r9, r4 - lvx v31, 0, r9 - - Transpose16x16 - - vspltisb v1, 1 - - build_constants r5, r6, r7, v3, v2, v0 - - Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1| - - Fil v16, v17, v18, v19, v20, v21, v22, v23 - Fil v20, v21, v22, v23, v24, v25, v26, v27 - Fil v24, v25, v26, v27, v28, v29, v30, v31 - - Transpose16x16 - - addi r9, r3, 0 - WL v16, r9, r4 - WL v17, r9, r4 - WL v18, r9, r4 - WL v19, r9, r4 - WL v20, r9, r4 - WL v21, r9, r4 - WL v22, r9, r4 - WL v23, r9, r4 - WL v24, r9, r4 - WL v25, r9, r4 - WL v26, r9, r4 - WL v27, r9, r4 - WL v28, r9, r4 - WL v29, r9, r4 - WL v30, r9, r4 - stvx v31, 0, r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -.macro active_chroma_sel V - andi. r7, r3, 8 ;# row origin modulo 16 - add r7, r7, r7 ;# selects selectors - lis r12, _chromaSelectors@ha - la r0, _chromaSelectors@l(r12) - lwzux r0, r7, r0 ;# leave selector addr in r7 - - lvx \V, 0, r0 ;# mask to concatenate active U,V pels -.endm - -.macro hread_uv Dest, U, V, Offs, VMask - lvx \U, \Offs, r3 - lvx \V, \Offs, r4 - vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V -.endm - -.macro hwrite_uv New, U, V, Offs, Umask, Vmask - vperm \U, \New, \U, \Umask ;# Combine new pels with siblings - vperm \V, \New, \V, \Vmask - stvx \U, \Offs, r3 ;# Write to frame buffer - stvx \V, \Offs, r4 -.endm - -;# Process U,V in parallel. 
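A scalar model of that pairing (illustrative only, with hypothetical names: the vperm with the b_hihi/b_lolo masks is what actually concatenates the active halves, and the _B_Ures0/_B_Vres0 family of selectors scatters results back to their planes):

#include <stdint.h>
#include <string.h>

/* Pack the 8 active U pels and 8 active V pels of one row into a single
   16-byte lane so one pass of the 16-wide filter serves both planes. */
static void pack_uv_row(const uint8_t *u, const uint8_t *v, uint8_t row[16])
{
    memcpy(row + 0, u, 8);
    memcpy(row + 8, v, 8);
}

static void unpack_uv_row(const uint8_t row[16], uint8_t *u, uint8_t *v)
{
    memcpy(u, row + 0, 8);
    memcpy(v, row + 8, 8);
}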
-.macro load_chroma_h - neg r9, r5 ;# r9 = -1 * stride - add r8, r9, r9 ;# r8 = -2 * stride - add r10, r5, r5 ;# r10 = 2 * stride - - active_chroma_sel v12 - - ;# P3, Q3 are read-only; need not save addresses or sibling pels - add r6, r8, r8 ;# r6 = -4 * stride - hread_uv v0, v14, v15, r6, v12 - add r6, r10, r5 ;# r6 = 3 * stride - hread_uv v7, v14, v15, r6, v12 - - ;# Others are read/write; save addresses and sibling pels - - add r6, r8, r9 ;# r6 = -3 * stride - hread_uv v1, v16, v17, r6, v12 - hread_uv v2, v18, v19, r8, v12 - hread_uv v3, v20, v21, r9, v12 - hread_uv v4, v22, v23, 0, v12 - hread_uv v5, v24, v25, r5, v12 - hread_uv v6, v26, v27, r10, v12 -.endm - -.macro uresult_sel V - load_g \V, 4(r7) -.endm - -.macro vresult_sel V - load_g \V, 8(r7) -.endm - -;# always write P1,P0,Q0,Q1 -.macro store_chroma_h - uresult_sel v11 - vresult_sel v12 - hwrite_uv v2, v18, v19, r8, v11, v12 - hwrite_uv v3, v20, v21, r9, v11, v12 - hwrite_uv v4, v22, v23, 0, v11, v12 - hwrite_uv v5, v24, v25, r5, v11, v12 -.endm - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed char *thresh -mbloop_filter_horizontal_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r6, r7, r8, v8, v9, v10 - - load_chroma_h - - vp8_mbfilter - - store_chroma_h - - hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2 - hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed char *thresh -loop_filter_horizontal_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - build_constants r6, r7, r8, v8, v9, v10 - - load_chroma_h - - SBFilter - - store_chroma_h - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro R V, R - lwzux r0, r3, r5 - stw r0, 4(\R) - lwz r0,-4(r3) - stw r0, 0(\R) - lwzux r0, r4, r5 - stw r0,12(\R) - lwz r0,-4(r4) - stw r0, 8(\R) - lvx \V, 0, \R -.endm - - -.macro W V, R - stvx \V, 0, \R - lwz r0,12(\R) - stwux r0, r4, r5 - lwz r0, 8(\R) - stw r0,-4(r4) - lwz r0, 4(\R) - stwux r0, r3, r5 - lwz r0, 0(\R) - stw r0,-4(r3) -.endm - -.macro chroma_vread R - sub r3, r3, r5 ;# back up one line for simplicity - sub r4, r4, r5 - - R v0, \R - R v1, \R - R v2, \R - R v3, \R - R v4, \R - R v5, \R - R v6, \R - R v7, \R - - transpose8x16_fwd -.endm - -.macro chroma_vwrite R - - transpose8x16_inv - - add r3, r3, r5 - add r4, r4, r5 - neg r5, r5 ;# Write rows back in reverse order - - W v17, \R - W v16, \R - W v15, \R - W v14, \R - W v13, \R - W v12, \R - W v11, \R - W v10, \R -.endm - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed char *thresh -mbloop_filter_vertical_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - la r9, -48(r1) ;# temporary space for reading in vectors - - chroma_vread r9 - - build_constants r6, r7, r8, v8, v9, v10 - - vp8_mbfilter - - chroma_vwrite r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *u -;# r4 unsigned char *v -;# r5 int p -;# r6 const signed char *flimit -;# r7 const signed char *limit -;# r8 const signed 
char *thresh -loop_filter_vertical_edge_uv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - la r9, -48(r1) ;# temporary space for reading in vectors - - chroma_vread r9 - - build_constants r6, r7, r8, v8, v9, v10 - - SBFilter - - chroma_vwrite r9 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=- - -.macro vp8_simple_filter - Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0) - vcmpgtub v8, v14, v8 ;# v5 = true if _over_ limit - - ;# preserve unsigned v0 and v3 - common_adjust v1, v2, v0, v3, 0 - - vxor v1, v1, v11 - vxor v2, v2, v11 ;# cvt Q0, P0 back to pels -.endm - -.macro simple_vertical - addi r8, 0, 16 - addi r7, r5, 32 - - lvx v0, 0, r5 - lvx v1, r8, r5 - lvx v2, 0, r7 - lvx v3, r8, r7 - - lis r12, _B_hihi@ha - la r0, _B_hihi@l(r12) - lvx v16, 0, r0 - - lis r12, _B_lolo@ha - la r0, _B_lolo@l(r12) - lvx v17, 0, r0 - - Transpose4times4x4 v16, v17 - vp8_simple_filter - - vxor v0, v0, v11 - vxor v3, v3, v11 ;# cvt Q0, P0 back to pels - - Transpose4times4x4 v16, v17 - - stvx v0, 0, r5 - stvx v1, r8, r5 - stvx v2, 0, r7 - stvx v3, r8, r7 -.endm - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -loop_filter_simple_horizontal_edge_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - mtspr 256, r12 ;# set VRSAVE - - ;# build constants - lvx v8, 0, r5 ;# flimit - - vspltisb v11, 8 - vspltisb v12, 4 - vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 - - neg r5, r4 ;# r5 = -1 * stride - add r6, r5, r5 ;# r6 = -2 * stride - - lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge - lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge - lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge - lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge - - vp8_simple_filter - - stvx v1, r5, r3 ;# store P0 - stvx v2, 0, r3 ;# store Q0 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -.macro RLV Offs - stw r0, (\Offs*4)(r5) - lwzux r0, r7, r4 -.endm - -.macro WLV Offs - lwz r0, (\Offs*4)(r5) - stwux r0, r7, r4 -.endm - - .align 2 -;# r3 unsigned char *s -;# r4 int p -;# r5 const signed char *flimit -loop_filter_simple_vertical_edge_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xc000 - mtspr 256, r12 ;# set VRSAVE - - ;# build constants - lvx v8, 0, r5 ;# flimit - - vspltisb v11, 8 - vspltisb v12, 4 - vslb v11, v11, v12 ;# 0x80808080808080808080808080808080 - - la r5, -96(r1) ;# temporary space for reading in vectors - - ;# Store 4 pels at word "Offs" in temp array, then advance r7 - ;# to next row and read another 4 pels from the frame buffer. - - subi r7, r3, 2 ;# r7 -> 2 pels before start - lwzx r0, 0, r7 ;# read first 4 pels - - ;# 16 unaligned word accesses - RLV 0 - RLV 4 - RLV 8 - RLV 12 - RLV 1 - RLV 5 - RLV 9 - RLV 13 - RLV 2 - RLV 6 - RLV 10 - RLV 14 - RLV 3 - RLV 7 - RLV 11 - - stw r0, (15*4)(r5) ;# write last 4 pels - - simple_vertical - - ;# Read temp array, write frame buffer. 
- subi r7, r3, 2 ;# r7 -> 2 pels before start - lwzx r0, 0, r5 ;# read/write first 4 pels - stwx r0, 0, r7 - - WLV 4 - WLV 8 - WLV 12 - WLV 1 - WLV 5 - WLV 9 - WLV 13 - WLV 2 - WLV 6 - WLV 10 - WLV 14 - WLV 3 - WLV 7 - WLV 11 - WLV 15 - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .data - -_chromaSelectors: - .long _B_hihi - .long _B_Ures0 - .long _B_Vres0 - .long 0 - .long _B_lolo - .long _B_Ures8 - .long _B_Vres8 - .long 0 - - .align 4 -_B_Vres8: - .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 - - .align 4 -_B_Ures8: - .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7 - - .align 4 -_B_lolo: - .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 - - .align 4 -_B_Vres0: - .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 - .align 4 -_B_Ures0: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 - - .align 4 -_B_hihi: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/common/ppc/platform_altivec.asm b/vp8/common/ppc/platform_altivec.asm deleted file mode 100644 index f81d86f74..000000000 --- a/vp8/common/ppc/platform_altivec.asm +++ /dev/null @@ -1,59 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl save_platform_context - .globl restore_platform_context - -.macro W V P - stvx \V, 0, \P - addi \P, \P, 16 -.endm - -.macro R V P - lvx \V, 0, \P - addi \P, \P, 16 -.endm - -;# r3 context_ptr - .align 2 -save_platform_context: - W v20, r3 - W v21, r3 - W v22, r3 - W v23, r3 - W v24, r3 - W v25, r3 - W v26, r3 - W v27, r3 - W v28, r3 - W v29, r3 - W v30, r3 - W v31, r3 - - blr - -;# r3 context_ptr - .align 2 -restore_platform_context: - R v20, r3 - R v21, r3 - R v22, r3 - R v23, r3 - R v24, r3 - R v25, r3 - R v26, r3 - R v27, r3 - R v28, r3 - R v29, r3 - R v30, r3 - R v31, r3 - - blr diff --git a/vp8/common/ppc/recon_altivec.asm b/vp8/common/ppc/recon_altivec.asm deleted file mode 100644 index dd39e05a8..000000000 --- a/vp8/common/ppc/recon_altivec.asm +++ /dev/null @@ -1,175 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree.
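The recon routines in the file below all perform one per-pel operation: add a 16-bit residual to an 8-bit prediction and saturate the result to 0..255, which the vaddshs/vpkshus pair does sixteen pels at a time. A scalar reference, as a hedged sketch with hypothetical names:

#include <stdint.h>

static uint8_t clamp_255(int v)
{
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* dst[i] = saturate(pred[i] + diff[i]) for one row of n pels */
static void recon_row(const int16_t *diff, const uint8_t *pred,
                      uint8_t *dst, int n)
{
    for (int i = 0; i < n; i++)
        dst[i] = clamp_255(pred[i] + diff[i]);
}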
-; - - - .globl recon4b_ppc - .globl recon2b_ppc - .globl recon_b_ppc - -.macro row_of16 Diff Pred Dst Stride - lvx v1, 0, \Pred ;# v1 = pred = p0..p15 - addi \Pred, \Pred, 16 ;# next pred - vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 - lvx v3, 0, \Diff ;# v3 = d0..d7 - vaddshs v2, v2, v3 ;# v2 = r0..r7 - vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 - lvx v3, r8, \Diff ;# v3 = d8..d15 - addi \Diff, \Diff, 32 ;# next diff - vaddshs v3, v3, v1 ;# v3 = r8..r15 - vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15 - stvx v2, 0, \Dst ;# to dst - add \Dst, \Dst, \Stride ;# next dst -.endm - - .text - .align 2 -;# r3 = short *diff_ptr, -;# r4 = unsigned char *pred_ptr, -;# r5 = unsigned char *dst_ptr, -;# r6 = int stride -recon4b_ppc: - mfspr r0, 256 ;# get old VRSAVE - stw r0, -8(r1) ;# save old VRSAVE to stack - oris r0, r0, 0xf000 - mtspr 256,r0 ;# set VRSAVE - - vxor v0, v0, v0 - li r8, 16 - - row_of16 r3, r4, r5, r6 - row_of16 r3, r4, r5, r6 - row_of16 r3, r4, r5, r6 - row_of16 r3, r4, r5, r6 - - lwz r12, -8(r1) ;# restore old VRSAVE from stack - mtspr 256, r12 ;# reset old VRSAVE - - blr - -.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels - lvx v1, 0, \Pred ;# v1 = pred = p0..p15 - vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7 - lvx v3, 0, \Diff ;# v3 = d0..d7 - vaddshs v2, v2, v3 ;# v2 = r0..r7 - vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15 - lvx v3, r8, \Diff ;# v2 = d8..d15 - vaddshs v3, v3, v1 ;# v3 = r8..r15 - vpkshus v2, v2, v3 ;# v3 = 8-bit r0..r15 - stvx v2, 0, r10 ;# 2 rows to dst from buf - lwz r0, 0(r10) -.if \write_first_four_pels - stw r0, 0(\Dst) - .else - stwux r0, \Dst, \Stride -.endif - lwz r0, 4(r10) - stw r0, 4(\Dst) - lwz r0, 8(r10) - stwux r0, \Dst, \Stride ;# advance dst to next row - lwz r0, 12(r10) - stw r0, 4(\Dst) -.endm - - .align 2 -;# r3 = short *diff_ptr, -;# r4 = unsigned char *pred_ptr, -;# r5 = unsigned char *dst_ptr, -;# r6 = int stride - -recon2b_ppc: - mfspr r0, 256 ;# get old VRSAVE - stw r0, -8(r1) ;# save old VRSAVE to stack - oris r0, r0, 0xf000 - mtspr 256,r0 ;# set VRSAVE - - vxor v0, v0, v0 - li r8, 16 - - la r10, -48(r1) ;# buf - - two_rows_of8 r3, r4, r5, r6, 1 - - addi r4, r4, 16; ;# next pred - addi r3, r3, 32; ;# next diff - - two_rows_of8 r3, r4, r5, r6, 0 - - lwz r12, -8(r1) ;# restore old VRSAVE from stack - mtspr 256, r12 ;# reset old VRSAVE - - blr - -.macro get_two_diff_rows - stw r0, 0(r10) - lwz r0, 4(r3) - stw r0, 4(r10) - lwzu r0, 32(r3) - stw r0, 8(r10) - lwz r0, 4(r3) - stw r0, 12(r10) - lvx v3, 0, r10 -.endm - - .align 2 -;# r3 = short *diff_ptr, -;# r4 = unsigned char *pred_ptr, -;# r5 = unsigned char *dst_ptr, -;# r6 = int stride -recon_b_ppc: - mfspr r0, 256 ;# get old VRSAVE - stw r0, -8(r1) ;# save old VRSAVE to stack - oris r0, r0, 0xf000 - mtspr 256,r0 ;# set VRSAVE - - vxor v0, v0, v0 - - la r10, -48(r1) ;# buf - - lwz r0, 0(r4) - stw r0, 0(r10) - lwz r0, 16(r4) - stw r0, 4(r10) - lwz r0, 32(r4) - stw r0, 8(r10) - lwz r0, 48(r4) - stw r0, 12(r10) - - lvx v1, 0, r10; ;# v1 = pred = p0..p15 - - lwz r0, 0(r3) ;# v3 = d0..d7 - - get_two_diff_rows - - vmrghb v2, v0, v1; ;# v2 = 16-bit p0..p7 - vaddshs v2, v2, v3; ;# v2 = r0..r7 - - lwzu r0, 32(r3) ;# v3 = d8..d15 - - get_two_diff_rows - - vmrglb v1, v0, v1; ;# v1 = 16-bit p8..p15 - vaddshs v3, v3, v1; ;# v3 = r8..r15 - - vpkshus v2, v2, v3; ;# v2 = 8-bit r0..r15 - stvx v2, 0, r10; ;# 16 pels to dst from buf - - lwz r0, 0(r10) - stw r0, 0(r5) - lwz r0, 4(r10) - stwux r0, r5, r6 - lwz r0, 8(r10) - stwux r0, r5, r6 - lwz r0, 12(r10) - stwx r0, r5, r6 - - lwz r12, -8(r1) ;# restore old VRSAVE 
from stack - mtspr 256, r12 ;# reset old VRSAVE - - blr diff --git a/vp8/common/ppc/sad_altivec.asm b/vp8/common/ppc/sad_altivec.asm deleted file mode 100644 index e5f26380f..000000000 --- a/vp8/common/ppc/sad_altivec.asm +++ /dev/null @@ -1,277 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl vp8_sad16x16_ppc - .globl vp8_sad16x8_ppc - .globl vp8_sad8x16_ppc - .globl vp8_sad8x8_ppc - .globl vp8_sad4x4_ppc - -.macro load_aligned_16 V R O - lvsl v3, 0, \R ;# permutate value for alignment - - lvx v1, 0, \R - lvx v2, \O, \R - - vperm \V, v1, v2, v3 -.endm - -.macro prologue - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1, -32(r1) ;# create space on the stack - - li r10, 16 ;# load offset and loop counter - - vspltisw v8, 0 ;# zero out total to start -.endm - -.macro epilogue - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE -.endm - -.macro SAD_16 - ;# v6 = abs (v4 - v5) - vsububs v6, v4, v5 - vsububs v7, v5, v4 - vor v6, v6, v7 - - ;# v8 += abs (v4 - v5) - vsum4ubs v8, v6, v8 -.endm - -.macro sad_16_loop loop_label - lvsl v3, 0, r5 ;# only needs to be done once per block - - ;# preload a line of data before getting into the loop - lvx v4, 0, r3 - lvx v1, 0, r5 - lvx v2, r10, r5 - - add r5, r5, r6 - add r3, r3, r4 - - vperm v5, v1, v2, v3 - - .align 4 -\loop_label: - ;# compute difference on first row - vsububs v6, v4, v5 - vsububs v7, v5, v4 - - ;# load up next set of data - lvx v9, 0, r3 - lvx v1, 0, r5 - lvx v2, r10, r5 - - ;# perform abs() of difference - vor v6, v6, v7 - add r3, r3, r4 - - ;# add to the running tally - vsum4ubs v8, v6, v8 - - ;# now onto the next line - vperm v5, v1, v2, v3 - add r5, r5, r6 - lvx v4, 0, r3 - - ;# compute difference on second row - vsububs v6, v9, v5 - lvx v1, 0, r5 - vsububs v7, v5, v9 - lvx v2, r10, r5 - vor v6, v6, v7 - add r3, r3, r4 - vsum4ubs v8, v6, v8 - vperm v5, v1, v2, v3 - add r5, r5, r6 - - bdnz \loop_label - - vspltisw v7, 0 - - vsumsws v8, v8, v7 - - stvx v8, 0, r1 - lwz r3, 12(r1) -.endm - -.macro sad_8_loop loop_label - .align 4 -\loop_label: - ;# only one of the inputs should need to be aligned. - load_aligned_16 v4, r3, r10 - load_aligned_16 v5, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - ;# only one of the inputs should need to be aligned. 
- load_aligned_16 v6, r3, r10 - load_aligned_16 v7, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - vmrghb v4, v4, v6 - vmrghb v5, v5, v7 - - SAD_16 - - bdnz \loop_label - - vspltisw v7, 0 - - vsumsws v8, v8, v7 - - stvx v8, 0, r1 - lwz r3, 12(r1) -.endm - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad16x16_ppc: - - prologue - - li r9, 8 - mtctr r9 - - sad_16_loop sad16x16_loop - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad16x8_ppc: - - prologue - - li r9, 4 - mtctr r9 - - sad_16_loop sad16x8_loop - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad8x16_ppc: - - prologue - - li r9, 8 - mtctr r9 - - sad_8_loop sad8x16_loop - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad8x8_ppc: - - prologue - - li r9, 4 - mtctr r9 - - sad_8_loop sad8x8_loop - - epilogue - - blr - -.macro transfer_4x4 I P - lwz r0, 0(\I) - add \I, \I, \P - - lwz r7, 0(\I) - add \I, \I, \P - - lwz r8, 0(\I) - add \I, \I, \P - - lwz r9, 0(\I) - - stw r0, 0(r1) - stw r7, 4(r1) - stw r8, 8(r1) - stw r9, 12(r1) -.endm - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_stride -;# r5 unsigned char *ref_ptr -;# r6 int ref_stride -;# -;# r3 return value -vp8_sad4x4_ppc: - - prologue - - transfer_4x4 r3, r4 - lvx v4, 0, r1 - - transfer_4x4 r5, r6 - lvx v5, 0, r1 - - vspltisw v8, 0 ;# zero out total to start - - ;# v6 = abs (v4 - v5) - vsububs v6, v4, v5 - vsububs v7, v5, v4 - vor v6, v6, v7 - - ;# v8 += abs (v4 - v5) - vsum4ubs v7, v6, v8 - vsumsws v7, v7, v8 - - stvx v7, 0, r1 - lwz r3, 12(r1) - - epilogue - - blr diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c deleted file mode 100644 index 6899c0e71..000000000 --- a/vp8/common/ppc/systemdependent.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "subpixel.h" -#include "loopfilter.h" -#include "recon.h" -#include "onyxc_int.h" - -extern void (*vp8_post_proc_down_and_across_mb_row)( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int cols, - unsigned char *f, - int size -); - -extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit); -extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit); -extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit); -extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit); - -extern void vp8_post_proc_down_and_across_mb_row_c -( - unsigned char *src_ptr, - unsigned char *dst_ptr, - int src_pixels_per_line, - int dst_pixels_per_line, - int cols, - unsigned char *f, - int size -); -void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a); - -extern copy_mem_block_function *vp8_copy_mem16x16; -extern copy_mem_block_function *vp8_copy_mem8x8; -extern copy_mem_block_function *vp8_copy_mem8x4; - -// PPC -extern subpixel_predict_function sixtap_predict_ppc; -extern subpixel_predict_function sixtap_predict8x4_ppc; -extern subpixel_predict_function sixtap_predict8x8_ppc; -extern subpixel_predict_function sixtap_predict16x16_ppc; -extern subpixel_predict_function bilinear_predict4x4_ppc; -extern subpixel_predict_function bilinear_predict8x4_ppc; -extern subpixel_predict_function bilinear_predict8x8_ppc; -extern subpixel_predict_function bilinear_predict16x16_ppc; - -extern copy_mem_block_function copy_mem16x16_ppc; - -void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); - -extern void short_idct4x4llm_ppc(short *input, short *output, int pitch); - -// Generic C -extern subpixel_predict_function vp8_sixtap_predict_c; -extern subpixel_predict_function vp8_sixtap_predict8x4_c; -extern subpixel_predict_function vp8_sixtap_predict8x8_c; -extern subpixel_predict_function vp8_sixtap_predict16x16_c; -extern subpixel_predict_function vp8_bilinear_predict4x4_c; -extern subpixel_predict_function vp8_bilinear_predict8x4_c; -extern subpixel_predict_function vp8_bilinear_predict8x8_c; -extern subpixel_predict_function vp8_bilinear_predict16x16_c; - -extern copy_mem_block_function vp8_copy_mem16x16_c; -extern copy_mem_block_function vp8_copy_mem8x8_c; -extern copy_mem_block_function vp8_copy_mem8x4_c; - -void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); - -extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); -extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch); -extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); - -// PPC -extern loop_filter_block_function loop_filter_mbv_ppc; -extern loop_filter_block_function loop_filter_bv_ppc; -extern loop_filter_block_function loop_filter_mbh_ppc; -extern loop_filter_block_function loop_filter_bh_ppc; - -extern 
loop_filter_block_function loop_filter_mbvs_ppc; -extern loop_filter_block_function loop_filter_bvs_ppc; -extern loop_filter_block_function loop_filter_mbhs_ppc; -extern loop_filter_block_function loop_filter_bhs_ppc; - -// Generic C -extern loop_filter_block_function vp8_loop_filter_mbv_c; -extern loop_filter_block_function vp8_loop_filter_bv_c; -extern loop_filter_block_function vp8_loop_filter_mbh_c; -extern loop_filter_block_function vp8_loop_filter_bh_c; - -extern loop_filter_block_function vp8_loop_filter_mbvs_c; -extern loop_filter_block_function vp8_loop_filter_bvs_c; -extern loop_filter_block_function vp8_loop_filter_mbhs_c; -extern loop_filter_block_function vp8_loop_filter_bhs_c; - -extern loop_filter_block_function *vp8_lf_mbvfull; -extern loop_filter_block_function *vp8_lf_mbhfull; -extern loop_filter_block_function *vp8_lf_bvfull; -extern loop_filter_block_function *vp8_lf_bhfull; - -extern loop_filter_block_function *vp8_lf_mbvsimple; -extern loop_filter_block_function *vp8_lf_mbhsimple; -extern loop_filter_block_function *vp8_lf_bvsimple; -extern loop_filter_block_function *vp8_lf_bhsimple; - -void vp8_clear_c(void) -{ -} - -void vp8_machine_specific_config(void) -{ - // Pure C: - vp8_clear_system_state = vp8_clear_c; - vp8_recon_b = vp8_recon_b_c; - vp8_recon4b = vp8_recon4b_c; - vp8_recon2b = vp8_recon2b_c; - - vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc; - vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc; - vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc; - vp8_bilinear_predict = bilinear_predict4x4_ppc; - - vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc; - vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc; - vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc; - vp8_sixtap_predict = sixtap_predict_ppc; - - vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c; - vp8_short_idct4x4 = short_idct4x4llm_ppc; - vp8_dc_only_idct = vp8_dc_only_idct_c; - - vp8_lf_mbvfull = loop_filter_mbv_ppc; - vp8_lf_bvfull = loop_filter_bv_ppc; - vp8_lf_mbhfull = loop_filter_mbh_ppc; - vp8_lf_bhfull = loop_filter_bh_ppc; - - vp8_lf_mbvsimple = loop_filter_mbvs_ppc; - vp8_lf_bvsimple = loop_filter_bvs_ppc; - vp8_lf_mbhsimple = loop_filter_mbhs_ppc; - vp8_lf_bhsimple = loop_filter_bhs_ppc; - - vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c; - vp8_mbpost_proc_down = vp8_mbpost_proc_down_c; - vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c; - vp8_plane_add_noise = vp8_plane_add_noise_c; - - vp8_copy_mem16x16 = copy_mem16x16_ppc; - vp8_copy_mem8x8 = vp8_copy_mem8x8_c; - vp8_copy_mem8x4 = vp8_copy_mem8x4_c; - -} diff --git a/vp8/common/ppc/variance_altivec.asm b/vp8/common/ppc/variance_altivec.asm deleted file mode 100644 index fb8d5bb1d..000000000 --- a/vp8/common/ppc/variance_altivec.asm +++ /dev/null @@ -1,375 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
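Every routine in the variance file below reduces to two accumulations and one closing formula, variance = sse - (sum*sum >> DS), where DS is log2 of the pel count (8 for 16x16, 7 for 16x8 and 8x16, 6 for 8x8). A hedged scalar equivalent (hypothetical helper; the asm does the final multiply in 32 bits with mullw, widened to 64 bits here to avoid signed overflow):

#include <stdint.h>

static unsigned block_variance(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h, int ds, unsigned *sse_out)
{
    int sum = 0;      /* accumulated as vsum4shs does */
    unsigned sse = 0; /* accumulated as vmsumubm does */
    for (int r = 0; r < h; r++) {
        for (int c = 0; c < w; c++) {
            int d = src[r * src_stride + c] - ref[r * ref_stride + c];
            sum += d;
            sse += (unsigned)(d * d);
        }
    }
    *sse_out = sse;
    /* sum*sum >> ds equals sum^2 / (w*h) because w*h is a power of two */
    return sse - (unsigned)(((int64_t)sum * sum) >> ds);
}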
-; - - - .globl vp8_get8x8var_ppc - .globl vp8_get16x16var_ppc - .globl vp8_mse16x16_ppc - .globl vp8_variance16x16_ppc - .globl vp8_variance16x8_ppc - .globl vp8_variance8x16_ppc - .globl vp8_variance8x8_ppc - .globl vp8_variance4x4_ppc - -.macro load_aligned_16 V R O - lvsl v3, 0, \R ;# permutate value for alignment - - lvx v1, 0, \R - lvx v2, \O, \R - - vperm \V, v1, v2, v3 -.endm - -.macro prologue - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffc0 - mtspr 256, r12 ;# set VRSAVE - - stwu r1, -32(r1) ;# create space on the stack - - li r10, 16 ;# load offset and loop counter - - vspltisw v7, 0 ;# zero for merging - vspltisw v8, 0 ;# zero out total to start - vspltisw v9, 0 ;# zero out total for dif^2 -.endm - -.macro epilogue - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE -.endm - -.macro compute_sum_sse - ;# Compute sum first. Unpack so a signed subtract - ;# can be used. Only have a half word signed - ;# subtract. Do high, then low. - vmrghb v2, v7, v4 - vmrghb v3, v7, v5 - vsubshs v2, v2, v3 - vsum4shs v8, v2, v8 - - vmrglb v2, v7, v4 - vmrglb v3, v7, v5 - vsubshs v2, v2, v3 - vsum4shs v8, v2, v8 - - ;# Now compute sse. - vsububs v2, v4, v5 - vsububs v3, v5, v4 - vor v2, v2, v3 - - vmsumubm v9, v2, v2, v9 -.endm - -.macro variance_16 DS loop_label store_sum -\loop_label: - ;# only one of the inputs should need to be aligned. - load_aligned_16 v4, r3, r10 - load_aligned_16 v5, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - compute_sum_sse - - bdnz \loop_label - - vsumsws v8, v8, v7 - vsumsws v9, v9, v7 - - stvx v8, 0, r1 - lwz r3, 12(r1) - - stvx v9, 0, r1 - lwz r4, 12(r1) - -.if \store_sum - stw r3, 0(r8) ;# sum -.endif - stw r4, 0(r7) ;# sse - - mullw r3, r3, r3 ;# sum*sum - srlwi r3, r3, \DS ;# (sum*sum) >> DS - subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) -.endm - -.macro variance_8 DS loop_label store_sum -\loop_label: - ;# only one of the inputs should need to be aligned. - load_aligned_16 v4, r3, r10 - load_aligned_16 v5, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - ;# only one of the inputs should need to be aligned. 
- load_aligned_16 v6, r3, r10 - load_aligned_16 v0, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - vmrghb v4, v4, v6 - vmrghb v5, v5, v0 - - compute_sum_sse - - bdnz \loop_label - - vsumsws v8, v8, v7 - vsumsws v9, v9, v7 - - stvx v8, 0, r1 - lwz r3, 12(r1) - - stvx v9, 0, r1 - lwz r4, 12(r1) - -.if \store_sum - stw r3, 0(r8) ;# sum -.endif - stw r4, 0(r7) ;# sse - - mullw r3, r3, r3 ;# sum*sum - srlwi r3, r3, \DS ;# (sum*sum) >> DS - subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) -.endm - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *SSE -;# r8 int *Sum -;# -;# r3 return value -vp8_get8x8var_ppc: - - prologue - - li r9, 4 - mtctr r9 - - variance_8 6, get8x8var_loop, 1 - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *SSE -;# r8 int *Sum -;# -;# r3 return value -vp8_get16x16var_ppc: - - prologue - - mtctr r10 - - variance_16 8, get16x16var_loop, 1 - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *sse -;# -;# r3 return value -vp8_mse16x16_ppc: - prologue - - mtctr r10 - -mse16x16_loop: - ;# only one of the inputs should need to be aligned. - load_aligned_16 v4, r3, r10 - load_aligned_16 v5, r5, r10 - - ;# move onto the next line - add r3, r3, r4 - add r5, r5, r6 - - ;# Now compute sse. - vsububs v2, v4, v5 - vsububs v3, v5, v4 - vor v2, v2, v3 - - vmsumubm v9, v2, v2, v9 - - bdnz mse16x16_loop - - vsumsws v9, v9, v7 - - stvx v9, 0, r1 - lwz r3, 12(r1) - - stvx v9, 0, r1 - lwz r3, 12(r1) - - stw r3, 0(r7) ;# sse - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *sse -;# -;# r3 return value -vp8_variance16x16_ppc: - - prologue - - mtctr r10 - - variance_16 8, variance16x16_loop, 0 - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *sse -;# -;# r3 return value -vp8_variance16x8_ppc: - - prologue - - li r9, 8 - mtctr r9 - - variance_16 7, variance16x8_loop, 0 - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *sse -;# -;# r3 return value -vp8_variance8x16_ppc: - - prologue - - li r9, 8 - mtctr r9 - - variance_8 7, variance8x16_loop, 0 - - epilogue - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *sse -;# -;# r3 return value -vp8_variance8x8_ppc: - - prologue - - li r9, 4 - mtctr r9 - - variance_8 6, variance8x8_loop, 0 - - epilogue - - blr - -.macro transfer_4x4 I P - lwz r0, 0(\I) - add \I, \I, \P - - lwz r10,0(\I) - add \I, \I, \P - - lwz r8, 0(\I) - add \I, \I, \P - - lwz r9, 0(\I) - - stw r0, 0(r1) - stw r10, 4(r1) - stw r8, 8(r1) - stw r9, 12(r1) -.endm - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int source_stride -;# r5 unsigned char *ref_ptr -;# r6 int recon_stride -;# r7 unsigned int *sse -;# -;# r3 return value -vp8_variance4x4_ppc: - - prologue - - transfer_4x4 r3, r4 - lvx v4, 0, r1 - - transfer_4x4 r5, r6 - lvx v5, 0, r1 - - compute_sum_sse - - vsumsws v8, v8, v7 - vsumsws v9, v9, v7 - - stvx v8, 0, r1 - 
lwz r3, 12(r1) - - stvx v9, 0, r1 - lwz r4, 12(r1) - - stw r4, 0(r7) ;# sse - - mullw r3, r3, r3 ;# sum*sum - srlwi r3, r3, 4 ;# (sum*sum) >> 4 - subf r3, r3, r4 ;# sse - ((sum*sum) >> 4) - - epilogue - - blr diff --git a/vp8/common/ppc/variance_subpixel_altivec.asm b/vp8/common/ppc/variance_subpixel_altivec.asm deleted file mode 100644 index 2308373a1..000000000 --- a/vp8/common/ppc/variance_subpixel_altivec.asm +++ /dev/null @@ -1,865 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl vp8_sub_pixel_variance4x4_ppc - .globl vp8_sub_pixel_variance8x8_ppc - .globl vp8_sub_pixel_variance8x16_ppc - .globl vp8_sub_pixel_variance16x8_ppc - .globl vp8_sub_pixel_variance16x16_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -.macro load_vfilter V0, V1 - load_c \V0, vfilter_b, r6, r12, r10 - - addi r6, r6, 16 - lvx \V1, r6, r10 -.endm - -.macro HProlog jump_label - ;# load up horizontal filter - slwi. r5, r5, 4 ;# index into horizontal filter array - - ;# index to the next set of vectors in the row. - li r10, 16 - - ;# downshift by 7 ( divide by 128 ) at the end - vspltish v19, 7 - - ;# If there isn't any filtering to be done for the horizontal, then - ;# just skip to the second pass. - beq \jump_label - - load_c v20, hfilter_b, r5, r12, r0 - - ;# setup constants - ;# v28 permutation value for alignment - load_c v28, b_hperm_b, 0, r12, r0 - - ;# index to the next set of vectors in the row. - li r12, 32 - - ;# rounding added in on the multiply - vspltisw v21, 8 - vspltisw v18, 3 - vslw v18, v21, v18 ;# 0x00000040000000400000004000000040 - - slwi. r6, r6, 5 ;# index into vertical filter array -.endm - -;# Filters a horizontal line -;# expects: -;# r3 src_ptr -;# r4 pitch -;# r10 16 -;# r12 32 -;# v17 perm input -;# v18 rounding -;# v19 shift -;# v20 filter taps -;# v21 tmp -;# v22 tmp -;# v23 tmp -;# v24 tmp -;# v25 tmp -;# v26 tmp -;# v27 tmp -;# v28 perm output -;# - -.macro hfilter_8 V, hp, lp, increment_counter - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 9 bytes wide, output is 8 bytes. - lvx v21, 0, r3 - lvx v22, r10, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - vperm v21, v21, v22, v17 - - vperm v24, v21, v21, \hp ;# v20 = 0123 1234 2345 3456 - vperm v25, v21, v21, \lp ;# v21 = 4567 5678 6789 789A - - vmsummbm v24, v20, v24, v18 - vmsummbm v25, v20, v25, v18 - - vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) - - vsrh v24, v24, v19 ;# divide v0, v1 by 128 - - vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result -.endm - -.macro vfilter_16 P0 P1 - vmuleub v22, \P0, v20 ;# 64 + 4 positive taps - vadduhm v22, v18, v22 - vmuloub v23, \P0, v20 - vadduhm v23, v18, v23 - - vmuleub v24, \P1, v21 - vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary - vmuloub v25, \P1, v21 - vadduhm v23, v23, v25 ;# Ro = odds - - vsrh v22, v22, v19 ;# divide by 128 - vsrh v23, v23, v19 ;# v16 v17 = evens, odds - vmrghh \P0, v22, v23 ;# v18 v19 = 16-bit result in order - vmrglh v23, v22, v23 - vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result -.endm - -.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0 - ;# Compute sum first. 
Unpack so a signed subtract - ;# can be used. Only have a half word signed - ;# subtract. Do high, then low. - vmrghb \t1, \z0, \src - vmrghb \t2, \z0, \ref - vsubshs \t1, \t1, \t2 - vsum4shs \sum, \t1, \sum - - vmrglb \t1, \z0, \src - vmrglb \t2, \z0, \ref - vsubshs \t1, \t1, \t2 - vsum4shs \sum, \t1, \sum - - ;# Now compute sse. - vsububs \t1, \src, \ref - vsububs \t2, \ref, \src - vor \t1, \t1, \t2 - - vmsumubm \sse, \t1, \t1, \sse -.endm - -.macro variance_final sum, sse, z0, DS - vsumsws \sum, \sum, \z0 - vsumsws \sse, \sse, \z0 - - stvx \sum, 0, r1 - lwz r3, 12(r1) - - stvx \sse, 0, r1 - lwz r4, 12(r1) - - stw r4, 0(r9) ;# sse - - mullw r3, r3, r3 ;# sum*sum - srlwi r3, r3, \DS ;# (sum*sum) >> DS - subf r3, r3, r4 ;# sse - ((sum*sum) >> DS) -.endm - -.macro compute_sum_sse_16 V, increment_counter - load_and_align_16 v16, r7, r8, \increment_counter - compute_sum_sse \V, v16, v18, v19, v20, v21, v23 -.endm - -.macro load_and_align_16 V, R, P, increment_counter - lvsl v17, 0, \R ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input can span three vectors if not aligned correctly. - lvx v21, 0, \R - lvx v22, r10, \R - -.if \increment_counter - add \R, \R, \P -.endif - - vperm \V, v21, v22, v17 -.endm - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_pixels_per_line -;# r5 int xoffset -;# r6 int yoffset -;# r7 unsigned char *dst_ptr -;# r8 int dst_pixels_per_line -;# r9 unsigned int *sse -;# -;# r3 return value -vp8_sub_pixel_variance4x4_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf830 - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_4x4_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r12, r0 - load_c v11, b_4567_b, 0, r12, r0 - - hfilter_8 v0, v10, v11, 1 - hfilter_8 v1, v10, v11, 1 - hfilter_8 v2, v10, v11, 1 - hfilter_8 v3, v10, v11, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. 
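The compute_sum_sse and variance_final macros above implement the one-pass variance identity: accumulate the sum of pixel differences and the sum of their squares, then take variance = SSE - sum^2 / N, with the division done as a right shift by DS = log2(N). A scalar C model of that finalization, assuming only what the macros show (the function name and the int64_t widening are illustrative; the assembly takes the same low 32 bits from mullw):

    #include <stdint.h>

    /* Scalar model of variance_final: the block holds (1 << DS) pixels. */
    static uint32_t variance_final_model(int32_t sum, uint32_t sse, int DS,
                                         uint32_t *sse_out) {
        *sse_out = sse;                                  /* stw r4, 0(r9) */
        return sse - (uint32_t)(((int64_t)sum * sum) >> DS);
    }

The beq that follows takes exactly the shortcut the comment above describes: with no vertical filter pending, control jumps straight to the sum/SSE accumulation.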
- beq compute_sum_sse_4x4_b - - hfilter_8 v4, v10, v11, 0 - - b second_pass_4x4_b - -second_pass_4x4_pre_copy_b: - slwi r6, r6, 5 ;# index into vertical filter array - - load_and_align_16 v0, r3, r4, 1 - load_and_align_16 v1, r3, r4, 1 - load_and_align_16 v2, r3, r4, 1 - load_and_align_16 v3, r3, r4, 1 - load_and_align_16 v4, r3, r4, 0 - -second_pass_4x4_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - -compute_sum_sse_4x4_b: - vspltish v18, 0 ;# sum - vspltish v19, 0 ;# sse - vspltish v23, 0 ;# unpack - li r10, 16 - - load_and_align_16 v4, r7, r8, 1 - load_and_align_16 v5, r7, r8, 1 - load_and_align_16 v6, r7, r8, 1 - load_and_align_16 v7, r7, r8, 1 - - vmrghb v0, v0, v1 - vmrghb v1, v2, v3 - - vmrghb v2, v4, v5 - vmrghb v3, v6, v7 - - load_c v10, b_hilo_b, 0, r12, r0 - - vperm v0, v0, v1, v10 - vperm v1, v2, v3, v10 - - compute_sum_sse v0, v1, v18, v19, v20, v21, v23 - - variance_final v18, v19, v23, 4 - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_pixels_per_line -;# r5 int xoffset -;# r6 int yoffset -;# r7 unsigned char *dst_ptr -;# r8 int dst_pixels_per_line -;# r9 unsigned int *sse -;# -;# r3 return value -vp8_sub_pixel_variance8x8_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xfff0 - ori r12, r12, 0xffff - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_8x8_pre_copy_b - - ;# Load up permutation constants - load_c v10, b_0123_b, 0, r12, r0 - load_c v11, b_4567_b, 0, r12, r0 - - hfilter_8 v0, v10, v11, 1 - hfilter_8 v1, v10, v11, 1 - hfilter_8 v2, v10, v11, 1 - hfilter_8 v3, v10, v11, 1 - hfilter_8 v4, v10, v11, 1 - hfilter_8 v5, v10, v11, 1 - hfilter_8 v6, v10, v11, 1 - hfilter_8 v7, v10, v11, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq compute_sum_sse_8x8_b - - hfilter_8 v8, v10, v11, 0 - - b second_pass_8x8_b - -second_pass_8x8_pre_copy_b: - slwi. 
r6, r6, 5 ;# index into vertical filter array - - load_and_align_16 v0, r3, r4, 1 - load_and_align_16 v1, r3, r4, 1 - load_and_align_16 v2, r3, r4, 1 - load_and_align_16 v3, r3, r4, 1 - load_and_align_16 v4, r3, r4, 1 - load_and_align_16 v5, r3, r4, 1 - load_and_align_16 v6, r3, r4, 1 - load_and_align_16 v7, r3, r4, 1 - load_and_align_16 v8, r3, r4, 0 - - beq compute_sum_sse_8x8_b - -second_pass_8x8_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - -compute_sum_sse_8x8_b: - vspltish v18, 0 ;# sum - vspltish v19, 0 ;# sse - vspltish v23, 0 ;# unpack - li r10, 16 - - vmrghb v0, v0, v1 - vmrghb v1, v2, v3 - vmrghb v2, v4, v5 - vmrghb v3, v6, v7 - - load_and_align_16 v4, r7, r8, 1 - load_and_align_16 v5, r7, r8, 1 - load_and_align_16 v6, r7, r8, 1 - load_and_align_16 v7, r7, r8, 1 - load_and_align_16 v8, r7, r8, 1 - load_and_align_16 v9, r7, r8, 1 - load_and_align_16 v10, r7, r8, 1 - load_and_align_16 v11, r7, r8, 0 - - vmrghb v4, v4, v5 - vmrghb v5, v6, v7 - vmrghb v6, v8, v9 - vmrghb v7, v10, v11 - - compute_sum_sse v0, v4, v18, v19, v20, v21, v23 - compute_sum_sse v1, v5, v18, v19, v20, v21, v23 - compute_sum_sse v2, v6, v18, v19, v20, v21, v23 - compute_sum_sse v3, v7, v18, v19, v20, v21, v23 - - variance_final v18, v19, v23, 6 - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_pixels_per_line -;# r5 int xoffset -;# r6 int yoffset -;# r7 unsigned char *dst_ptr -;# r8 int dst_pixels_per_line -;# r9 unsigned int *sse -;# -;# r3 return value -vp8_sub_pixel_variance8x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xfffc - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - HProlog second_pass_8x16_pre_copy_b - - ;# Load up permutation constants - load_c v29, b_0123_b, 0, r12, r0 - load_c v30, b_4567_b, 0, r12, r0 - - hfilter_8 v0, v29, v30, 1 - hfilter_8 v1, v29, v30, 1 - hfilter_8 v2, v29, v30, 1 - hfilter_8 v3, v29, v30, 1 - hfilter_8 v4, v29, v30, 1 - hfilter_8 v5, v29, v30, 1 - hfilter_8 v6, v29, v30, 1 - hfilter_8 v7, v29, v30, 1 - hfilter_8 v8, v29, v30, 1 - hfilter_8 v9, v29, v30, 1 - hfilter_8 v10, v29, v30, 1 - hfilter_8 v11, v29, v30, 1 - hfilter_8 v12, v29, v30, 1 - hfilter_8 v13, v29, v30, 1 - hfilter_8 v14, v29, v30, 1 - hfilter_8 v15, v29, v30, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq compute_sum_sse_8x16_b - - hfilter_8 v16, v29, v30, 0 - - b second_pass_8x16_b - -second_pass_8x16_pre_copy_b: - slwi. 
r6, r6, 5 ;# index into vertical filter array - - load_and_align_16 v0, r3, r4, 1 - load_and_align_16 v1, r3, r4, 1 - load_and_align_16 v2, r3, r4, 1 - load_and_align_16 v3, r3, r4, 1 - load_and_align_16 v4, r3, r4, 1 - load_and_align_16 v5, r3, r4, 1 - load_and_align_16 v6, r3, r4, 1 - load_and_align_16 v7, r3, r4, 1 - load_and_align_16 v8, r3, r4, 1 - load_and_align_16 v9, r3, r4, 1 - load_and_align_16 v10, r3, r4, 1 - load_and_align_16 v11, r3, r4, 1 - load_and_align_16 v12, r3, r4, 1 - load_and_align_16 v13, r3, r4, 1 - load_and_align_16 v14, r3, r4, 1 - load_and_align_16 v15, r3, r4, 1 - load_and_align_16 v16, r3, r4, 0 - - beq compute_sum_sse_8x16_b - -second_pass_8x16_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - vfilter_16 v8, v9 - vfilter_16 v9, v10 - vfilter_16 v10, v11 - vfilter_16 v11, v12 - vfilter_16 v12, v13 - vfilter_16 v13, v14 - vfilter_16 v14, v15 - vfilter_16 v15, v16 - -compute_sum_sse_8x16_b: - vspltish v18, 0 ;# sum - vspltish v19, 0 ;# sse - vspltish v23, 0 ;# unpack - li r10, 16 - - vmrghb v0, v0, v1 - vmrghb v1, v2, v3 - vmrghb v2, v4, v5 - vmrghb v3, v6, v7 - vmrghb v4, v8, v9 - vmrghb v5, v10, v11 - vmrghb v6, v12, v13 - vmrghb v7, v14, v15 - - load_and_align_16 v8, r7, r8, 1 - load_and_align_16 v9, r7, r8, 1 - load_and_align_16 v10, r7, r8, 1 - load_and_align_16 v11, r7, r8, 1 - load_and_align_16 v12, r7, r8, 1 - load_and_align_16 v13, r7, r8, 1 - load_and_align_16 v14, r7, r8, 1 - load_and_align_16 v15, r7, r8, 1 - - vmrghb v8, v8, v9 - vmrghb v9, v10, v11 - vmrghb v10, v12, v13 - vmrghb v11, v14, v15 - - compute_sum_sse v0, v8, v18, v19, v20, v21, v23 - compute_sum_sse v1, v9, v18, v19, v20, v21, v23 - compute_sum_sse v2, v10, v18, v19, v20, v21, v23 - compute_sum_sse v3, v11, v18, v19, v20, v21, v23 - - load_and_align_16 v8, r7, r8, 1 - load_and_align_16 v9, r7, r8, 1 - load_and_align_16 v10, r7, r8, 1 - load_and_align_16 v11, r7, r8, 1 - load_and_align_16 v12, r7, r8, 1 - load_and_align_16 v13, r7, r8, 1 - load_and_align_16 v14, r7, r8, 1 - load_and_align_16 v15, r7, r8, 0 - - vmrghb v8, v8, v9 - vmrghb v9, v10, v11 - vmrghb v10, v12, v13 - vmrghb v11, v14, v15 - - compute_sum_sse v4, v8, v18, v19, v20, v21, v23 - compute_sum_sse v5, v9, v18, v19, v20, v21, v23 - compute_sum_sse v6, v10, v18, v19, v20, v21, v23 - compute_sum_sse v7, v11, v18, v19, v20, v21, v23 - - variance_final v18, v19, v23, 7 - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - blr - -;# Filters a horizontal line -;# expects: -;# r3 src_ptr -;# r4 pitch -;# r10 16 -;# r12 32 -;# v17 perm input -;# v18 rounding -;# v19 shift -;# v20 filter taps -;# v21 tmp -;# v22 tmp -;# v23 tmp -;# v24 tmp -;# v25 tmp -;# v26 tmp -;# v27 tmp -;# v28 perm output -;# -.macro hfilter_16 V, increment_counter - - lvsl v17, 0, r3 ;# permutate value for alignment - - ;# input to filter is 21 bytes wide, output is 16 bytes. - ;# input can span three vectors if not aligned correctly. 
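As that comment notes, source rows are rarely 16-byte aligned, so every read uses the standard AltiVec idiom: lvsl turns the low address bits into a permute mask, aligned lvx loads bracket the data, and vperm splices out the bytes actually wanted. A rough C model of the two-vector case, purely illustrative (the assembly never faults because an aligned 16-byte load cannot cross a page boundary; the plain C below does not share that guarantee):

    #include <stdint.h>
    #include <string.h>

    /* Model of lvsl + lvx/lvx + vperm: read 16 bytes from an arbitrary
     * address using only 16-byte-aligned accesses. */
    static void load_unaligned_16(const uint8_t *p, uint8_t out[16]) {
        const uint8_t *base = (const uint8_t *)((uintptr_t)p & ~(uintptr_t)15);
        unsigned shift = (unsigned)((uintptr_t)p & 15); /* what lvsl encodes */
        uint8_t lo[16], hi[16];
        memcpy(lo, base, 16);                           /* lvx v21, 0, r3   */
        memcpy(hi, base + 16, 16);                      /* lvx v22, r10, r3 */
        for (unsigned i = 0; i < 16; i++)               /* vperm            */
            out[i] = (shift + i < 16) ? lo[shift + i] : hi[shift + i - 16];
    }

The macro body below is the vector form of this, widened to 21 input bytes with a third load.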
- lvx v21, 0, r3 - lvx v22, r10, r3 - lvx v23, r12, r3 - -.if \increment_counter - add r3, r3, r4 -.endif - vperm v21, v21, v22, v17 - vperm v22, v22, v23, v17 ;# v8 v9 = 21 input pixels left-justified - - ;# set 0 - vmsummbm v24, v20, v21, v18 ;# taps times elements - - ;# set 1 - vsldoi v23, v21, v22, 1 - vmsummbm v25, v20, v23, v18 - - ;# set 2 - vsldoi v23, v21, v22, 2 - vmsummbm v26, v20, v23, v18 - - ;# set 3 - vsldoi v23, v21, v22, 3 - vmsummbm v27, v20, v23, v18 - - vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit) - vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F - - vsrh v24, v24, v19 ;# divide v0, v1 by 128 - vsrh v25, v25, v19 - - vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result - vperm \V, \V, v0, v28 ;# \V = correctly-ordered result -.endm - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_pixels_per_line -;# r5 int xoffset -;# r6 int yoffset -;# r7 unsigned char *dst_ptr -;# r8 int dst_pixels_per_line -;# r9 unsigned int *sse -;# -;# r3 return value -vp8_sub_pixel_variance16x8_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - stwu r1, -32(r1) ;# create space on the stack - - HProlog second_pass_16x8_pre_copy_b - - hfilter_16 v0, 1 - hfilter_16 v1, 1 - hfilter_16 v2, 1 - hfilter_16 v3, 1 - hfilter_16 v4, 1 - hfilter_16 v5, 1 - hfilter_16 v6, 1 - hfilter_16 v7, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq compute_sum_sse_16x8_b - - hfilter_16 v8, 0 - - b second_pass_16x8_b - -second_pass_16x8_pre_copy_b: - slwi. r6, r6, 5 ;# index into vertical filter array - - load_and_align_16 v0, r3, r4, 1 - load_and_align_16 v1, r3, r4, 1 - load_and_align_16 v2, r3, r4, 1 - load_and_align_16 v3, r3, r4, 1 - load_and_align_16 v4, r3, r4, 1 - load_and_align_16 v5, r3, r4, 1 - load_and_align_16 v6, r3, r4, 1 - load_and_align_16 v7, r3, r4, 1 - load_and_align_16 v8, r3, r4, 1 - - beq compute_sum_sse_16x8_b - -second_pass_16x8_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - -compute_sum_sse_16x8_b: - vspltish v18, 0 ;# sum - vspltish v19, 0 ;# sse - vspltish v23, 0 ;# unpack - li r10, 16 - - compute_sum_sse_16 v0, 1 - compute_sum_sse_16 v1, 1 - compute_sum_sse_16 v2, 1 - compute_sum_sse_16 v3, 1 - compute_sum_sse_16 v4, 1 - compute_sum_sse_16 v5, 1 - compute_sum_sse_16 v6, 1 - compute_sum_sse_16 v7, 0 - - variance_final v18, v19, v23, 7 - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .align 2 -;# r3 unsigned char *src_ptr -;# r4 int src_pixels_per_line -;# r5 int xoffset -;# r6 int yoffset -;# r7 unsigned char *dst_ptr -;# r8 int dst_pixels_per_line -;# r9 unsigned int *sse -;# -;# r3 return value -vp8_sub_pixel_variance16x16_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xffff - ori r12, r12, 0xfff8 - mtspr 256, r12 ;# set VRSAVE - - stwu r1, -32(r1) ;# create space on the stack - - HProlog second_pass_16x16_pre_copy_b - - hfilter_16 v0, 1 - hfilter_16 v1, 1 - hfilter_16 v2, 1 - hfilter_16 v3, 1 - hfilter_16 v4, 1 - hfilter_16 v5, 1 - hfilter_16 v6, 1 - hfilter_16 v7, 1 - hfilter_16 v8, 1 - hfilter_16 v9, 1 - hfilter_16 v10, 1 - hfilter_16 
v11, 1 - hfilter_16 v12, 1 - hfilter_16 v13, 1 - hfilter_16 v14, 1 - hfilter_16 v15, 1 - - ;# Finished filtering main horizontal block. If there is no - ;# vertical filtering, jump to storing the data. Otherwise - ;# load up and filter the additional line that is needed - ;# for the vertical filter. - beq compute_sum_sse_16x16_b - - hfilter_16 v16, 0 - - b second_pass_16x16_b - -second_pass_16x16_pre_copy_b: - slwi. r6, r6, 5 ;# index into vertical filter array - - load_and_align_16 v0, r3, r4, 1 - load_and_align_16 v1, r3, r4, 1 - load_and_align_16 v2, r3, r4, 1 - load_and_align_16 v3, r3, r4, 1 - load_and_align_16 v4, r3, r4, 1 - load_and_align_16 v5, r3, r4, 1 - load_and_align_16 v6, r3, r4, 1 - load_and_align_16 v7, r3, r4, 1 - load_and_align_16 v8, r3, r4, 1 - load_and_align_16 v9, r3, r4, 1 - load_and_align_16 v10, r3, r4, 1 - load_and_align_16 v11, r3, r4, 1 - load_and_align_16 v12, r3, r4, 1 - load_and_align_16 v13, r3, r4, 1 - load_and_align_16 v14, r3, r4, 1 - load_and_align_16 v15, r3, r4, 1 - load_and_align_16 v16, r3, r4, 0 - - beq compute_sum_sse_16x16_b - -second_pass_16x16_b: - vspltish v20, 8 - vspltish v18, 3 - vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040 - - load_vfilter v20, v21 - - vfilter_16 v0, v1 - vfilter_16 v1, v2 - vfilter_16 v2, v3 - vfilter_16 v3, v4 - vfilter_16 v4, v5 - vfilter_16 v5, v6 - vfilter_16 v6, v7 - vfilter_16 v7, v8 - vfilter_16 v8, v9 - vfilter_16 v9, v10 - vfilter_16 v10, v11 - vfilter_16 v11, v12 - vfilter_16 v12, v13 - vfilter_16 v13, v14 - vfilter_16 v14, v15 - vfilter_16 v15, v16 - -compute_sum_sse_16x16_b: - vspltish v18, 0 ;# sum - vspltish v19, 0 ;# sse - vspltish v23, 0 ;# unpack - li r10, 16 - - compute_sum_sse_16 v0, 1 - compute_sum_sse_16 v1, 1 - compute_sum_sse_16 v2, 1 - compute_sum_sse_16 v3, 1 - compute_sum_sse_16 v4, 1 - compute_sum_sse_16 v5, 1 - compute_sum_sse_16 v6, 1 - compute_sum_sse_16 v7, 1 - compute_sum_sse_16 v8, 1 - compute_sum_sse_16 v9, 1 - compute_sum_sse_16 v10, 1 - compute_sum_sse_16 v11, 1 - compute_sum_sse_16 v12, 1 - compute_sum_sse_16 v13, 1 - compute_sum_sse_16 v14, 1 - compute_sum_sse_16 v15, 0 - - variance_final v18, v19, v23, 8 - - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE - - blr - - .data - - .align 4 -hfilter_b: - .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0 - .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0 - .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0 - .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0 - .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0 - .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0 - .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0 - .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0 - - .align 4 -vfilter_b: - .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 - .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 - .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 - .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 - .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 - .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 - .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 
48, 48, 48, 48, 48, 48, 48 - .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 - .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 - .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 - .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 - .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 - - .align 4 -b_hperm_b: - .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 - - .align 4 -b_0123_b: - .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - - .align 4 -b_4567_b: - .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 - -b_hilo_b: - .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index ab03c909d..4f404bd66 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -1069,7 +1069,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) pc->vert_scale = clear[6] >> 6; } data += 7; - clear += 7; } else { diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 85767ef0d..84b9faec6 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -2969,7 +2969,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) */ decay_accumulator = 1.0; boost_score = 0.0; - loop_decay_rate = 1.00; /* Starting decay rate */ for (i = 0 ; i < cpi->twopass.frames_to_key ; i++) { @@ -3213,7 +3212,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) int new_width = cpi->oxcf.Width; int new_height = cpi->oxcf.Height; - int projected_buffer_level = (int)cpi->buffer_level; + int projected_buffer_level; int tmp_q; double projected_bits_perframe; diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c index f0c8f28fc..a55a1ea16 100644 --- a/vp8/encoder/picklpf.c +++ b/vp8/encoder/picklpf.c @@ -142,7 +142,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) int min_filter_level = get_min_filter_level(cpi, cm->base_qindex); int max_filter_level = get_max_filter_level(cpi, cm->base_qindex); int filt_val; - int best_filt_val = cm->filter_level; + int best_filt_val; YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show; /* Replace unfiltered frame buffer with a new one */ @@ -274,8 +274,7 @@ void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) int filter_step; int filt_high = 0; - /* Start search at previous frame filter level */ - int filt_mid = cm->filter_level; + int filt_mid; int filt_low = 0; int filt_best; int filt_direction = 0; diff --git a/vp8/encoder/ppc/csystemdependent.c b/vp8/encoder/ppc/csystemdependent.c deleted file mode 100644 index 63f235784..000000000 --- a/vp8/encoder/ppc/csystemdependent.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - - -#include "vp8/encoder/variance.h" -#include "vp8/encoder/onyx_int.h" - -SADFunction *vp8_sad16x16; -SADFunction *vp8_sad16x8; -SADFunction *vp8_sad8x16; -SADFunction *vp8_sad8x8; -SADFunction *vp8_sad4x4; - -variance_function *vp8_variance4x4; -variance_function *vp8_variance8x8; -variance_function *vp8_variance8x16; -variance_function *vp8_variance16x8; -variance_function *vp8_variance16x16; - -variance_function *vp8_mse16x16; - -sub_pixel_variance_function *vp8_sub_pixel_variance4x4; -sub_pixel_variance_function *vp8_sub_pixel_variance8x8; -sub_pixel_variance_function *vp8_sub_pixel_variance8x16; -sub_pixel_variance_function *vp8_sub_pixel_variance16x8; -sub_pixel_variance_function *vp8_sub_pixel_variance16x16; - -int (*vp8_block_error)(short *coeff, short *dqcoeff); -int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc); - -int (*vp8_mbuverror)(MACROBLOCK *mb); -unsigned int (*vp8_get_mb_ss)(short *); -void (*vp8_short_fdct4x4)(short *input, short *output, int pitch); -void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); -void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch); -void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch); -void (*short_walsh4x4)(short *input, short *output, int pitch); - -void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch); -void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride); -void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); -void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d); - -unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); - -// c imports -extern int block_error_c(short *coeff, short *dqcoeff); -extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc); - -extern int vp8_mbuverror_c(MACROBLOCK *mb); -extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern void short_fdct4x4_c(short *input, short *output, int pitch); -extern void short_fdct8x4_c(short *input, short *output, int pitch); -extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch); - -extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch); -extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride); -extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); -extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d); - -extern SADFunction sad16x16_c; -extern SADFunction sad16x8_c; -extern SADFunction sad8x16_c; -extern SADFunction sad8x8_c; -extern SADFunction sad4x4_c; - -extern variance_function variance16x16_c; -extern variance_function variance8x16_c; -extern variance_function variance16x8_c; -extern variance_function variance8x8_c; -extern variance_function variance4x4_c; -extern variance_function mse16x16_c; - -extern sub_pixel_variance_function sub_pixel_variance4x4_c; -extern sub_pixel_variance_function sub_pixel_variance8x8_c; -extern sub_pixel_variance_function sub_pixel_variance8x16_c; -extern sub_pixel_variance_function sub_pixel_variance16x8_c; -extern sub_pixel_variance_function sub_pixel_variance16x16_c; - -extern unsigned int vp8_get_mb_ss_c(short *); -extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride); - -// ppc -extern int vp8_block_error_ppc(short *coeff, short *dqcoeff); - 
-extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch); -extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch); - -extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride); -extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride); - -extern SADFunction vp8_sad16x16_ppc; -extern SADFunction vp8_sad16x8_ppc; -extern SADFunction vp8_sad8x16_ppc; -extern SADFunction vp8_sad8x8_ppc; -extern SADFunction vp8_sad4x4_ppc; - -extern variance_function vp8_variance16x16_ppc; -extern variance_function vp8_variance8x16_ppc; -extern variance_function vp8_variance16x8_ppc; -extern variance_function vp8_variance8x8_ppc; -extern variance_function vp8_variance4x4_ppc; -extern variance_function vp8_mse16x16_ppc; - -extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc; -extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc; -extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc; -extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc; -extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc; - -extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); -extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum); - -void vp8_cmachine_specific_config(void) -{ - // Pure C: - vp8_mbuverror = vp8_mbuverror_c; - vp8_fast_quantize_b = vp8_fast_quantize_b_c; - vp8_short_fdct4x4 = vp8_short_fdct4x4_ppc; - vp8_short_fdct8x4 = vp8_short_fdct8x4_ppc; - vp8_fast_fdct4x4 = vp8_short_fdct4x4_ppc; - vp8_fast_fdct8x4 = vp8_short_fdct8x4_ppc; - short_walsh4x4 = vp8_short_walsh4x4_c; - - vp8_variance4x4 = vp8_variance4x4_ppc; - vp8_variance8x8 = vp8_variance8x8_ppc; - vp8_variance8x16 = vp8_variance8x16_ppc; - vp8_variance16x8 = vp8_variance16x8_ppc; - vp8_variance16x16 = vp8_variance16x16_ppc; - vp8_mse16x16 = vp8_mse16x16_ppc; - - vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_ppc; - vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_ppc; - vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_ppc; - vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ppc; - vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc; - - vp8_get_mb_ss = vp8_get_mb_ss_c; - vp8_get4x4sse_cs = vp8_get4x4sse_cs_c; - - vp8_sad16x16 = vp8_sad16x16_ppc; - vp8_sad16x8 = vp8_sad16x8_ppc; - vp8_sad8x16 = vp8_sad8x16_ppc; - vp8_sad8x8 = vp8_sad8x8_ppc; - vp8_sad4x4 = vp8_sad4x4_ppc; - - vp8_block_error = vp8_block_error_ppc; - vp8_mbblock_error = vp8_mbblock_error_c; - - vp8_subtract_b = vp8_subtract_b_c; - vp8_subtract_mby = vp8_subtract_mby_ppc; - vp8_subtract_mbuv = vp8_subtract_mbuv_ppc; -} diff --git a/vp8/encoder/ppc/encodemb_altivec.asm b/vp8/encoder/ppc/encodemb_altivec.asm deleted file mode 100644 index 6e0099ddc..000000000 --- a/vp8/encoder/ppc/encodemb_altivec.asm +++ /dev/null @@ -1,153 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - .globl vp8_subtract_mbuv_ppc - .globl vp8_subtract_mby_ppc - -;# r3 short *diff -;# r4 unsigned char *usrc -;# r5 unsigned char *vsrc -;# r6 unsigned char *pred -;# r7 int stride -vp8_subtract_mbuv_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf000 - mtspr 256, r12 ;# set VRSAVE - - li r9, 256 - add r3, r3, r9 - add r3, r3, r9 - add r6, r6, r9 - - li r10, 16 - li r9, 4 - mtctr r9 - - vspltisw v0, 0 - -mbu_loop: - lvsl v5, 0, r4 ;# permutate value for alignment - lvx v1, 0, r4 ;# src - lvx v2, 0, r6 ;# pred - - add r4, r4, r7 - addi r6, r6, 16 - - vperm v1, v1, v0, v5 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrghb v4, v0, v2 ;# unpack high pred to short - - lvsl v5, 0, r4 ;# permutate value for alignment - lvx v1, 0, r4 ;# src - - add r4, r4, r7 - - vsubshs v3, v3, v4 - - stvx v3, 0, r3 ;# store out diff - - vperm v1, v1, v0, v5 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrglb v4, v0, v2 ;# unpack high pred to short - - vsubshs v3, v3, v4 - - stvx v3, r10, r3 ;# store out diff - - addi r3, r3, 32 - - bdnz mbu_loop - - mtctr r9 - -mbv_loop: - lvsl v5, 0, r5 ;# permutate value for alignment - lvx v1, 0, r5 ;# src - lvx v2, 0, r6 ;# pred - - add r5, r5, r7 - addi r6, r6, 16 - - vperm v1, v1, v0, v5 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrghb v4, v0, v2 ;# unpack high pred to short - - lvsl v5, 0, r5 ;# permutate value for alignment - lvx v1, 0, r5 ;# src - - add r5, r5, r7 - - vsubshs v3, v3, v4 - - stvx v3, 0, r3 ;# store out diff - - vperm v1, v1, v0, v5 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrglb v4, v0, v2 ;# unpack high pred to short - - vsubshs v3, v3, v4 - - stvx v3, r10, r3 ;# store out diff - - addi r3, r3, 32 - - bdnz mbv_loop - - mtspr 256, r11 ;# reset old VRSAVE - - blr - -;# r3 short *diff -;# r4 unsigned char *src -;# r5 unsigned char *pred -;# r6 int stride -vp8_subtract_mby_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf800 - mtspr 256, r12 ;# set VRSAVE - - li r10, 16 - mtctr r10 - - vspltisw v0, 0 - -mby_loop: - lvx v1, 0, r4 ;# src - lvx v2, 0, r5 ;# pred - - add r4, r4, r6 - addi r5, r5, 16 - - vmrghb v3, v0, v1 ;# unpack high src to short - vmrghb v4, v0, v2 ;# unpack high pred to short - - vsubshs v3, v3, v4 - - stvx v3, 0, r3 ;# store out diff - - vmrglb v3, v0, v1 ;# unpack low src to short - vmrglb v4, v0, v2 ;# unpack low pred to short - - vsubshs v3, v3, v4 - - stvx v3, r10, r3 ;# store out diff - - addi r3, r3, 32 - - bdnz mby_loop - - mtspr 256, r11 ;# reset old VRSAVE - - blr diff --git a/vp8/encoder/ppc/fdct_altivec.asm b/vp8/encoder/ppc/fdct_altivec.asm deleted file mode 100644 index 935d0cb09..000000000 --- a/vp8/encoder/ppc/fdct_altivec.asm +++ /dev/null @@ -1,205 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl vp8_short_fdct4x4_ppc - .globl vp8_short_fdct8x4_ppc - -.macro load_c V, LABEL, OFF, R0, R1 - lis \R0, \LABEL@ha - la \R1, \LABEL@l(\R0) - lvx \V, \OFF, \R1 -.endm - -;# Forward and inverse DCTs are nearly identical; only differences are -;# in normalization (fwd is twice unitary, inv is half unitary) -;# and that they are of course transposes of each other. 
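Spelling that normalization claim out: let U be the orthonormal 4-point DCT matrix, so U^T U is the identity. Then, per the comment,

    fwd(x) = (2 U) x                  twice unitary
    inv(y) = (1/2) U^T y              half unitary
    inv(fwd(x)) = (1/2) U^T (2 U) x = U^T U x = x

so the two scale factors cancel and the pair round-trips exactly, up to the fixed-point rounding the tables below fold into the multiplies.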
-;# -;# The following three accomplish most of implementation and -;# are used only by ppc_idct.c and ppc_fdct.c. -.macro prologue - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xfffc - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - li r6, 16 - - load_c v0, dct_tab, 0, r9, r10 - lvx v1, r6, r10 - addi r10, r10, 32 - lvx v2, 0, r10 - lvx v3, r6, r10 - - load_c v4, ppc_dctperm_tab, 0, r9, r10 - load_c v5, ppc_dctperm_tab, r6, r9, r10 - - load_c v6, round_tab, 0, r10, r9 -.endm - -.macro epilogue - addi r1, r1, 32 ;# recover stack - - mtspr 256, r11 ;# reset old VRSAVE -.endm - -;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3. -;# a/A are the even rows 0,2 b/B are the odd rows 1,3 -;# For fwd transform, indices are horizontal positions, then frequencies. -;# For inverse transform, frequencies then positions. -;# The two resulting A0..A3 B0..B3 are later combined -;# and vertically transformed. - -.macro two_rows_horiz Dst - vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1 - - vmsumshm v10, v0, v8, v6 - vmsumshm v10, v1, v9, v10 - vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1 - - vmsumshm v11, v2, v8, v6 - vmsumshm v11, v3, v9, v11 - vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3 - - vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3 - vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3 -.endm - -;# Vertical xf on two rows. DCT values in comments are for inverse transform; -;# forward transform uses transpose. - -.macro two_rows_vert Ceven, Codd - vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times - vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 "" - vmsumshm v8, v8, v12, v6 - vmsumshm v8, v9, v13, v8 - vsraw v10, v8, v7 - - vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13 - vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33 - vmsumshm v8, v8, v12, v6 - vmsumshm v8, v9, v13, v8 - vsraw v8, v8, v7 - - vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3 -.endm - -.macro two_rows_h Dest - stw r0, 0(r8) - lwz r0, 4(r3) - stw r0, 4(r8) - lwzux r0, r3,r5 - stw r0, 8(r8) - lwz r0, 4(r3) - stw r0, 12(r8) - lvx v8, 0,r8 - two_rows_horiz \Dest -.endm - - .align 2 -;# r3 short *input -;# r4 short *output -;# r5 int pitch -vp8_short_fdct4x4_ppc: - - prologue - - vspltisw v7, 14 ;# == 14, fits in 5 signed bits - addi r8, r1, 0 - - - lwz r0, 0(r3) - two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13 - - lwzux r0, r3, r5 - two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33 - - lvx v6, r6, r9 ;# v6 = Vround - vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter - - two_rows_vert v0, v1 - stvx v8, 0, r4 - two_rows_vert v2, v3 - stvx v8, r6, r4 - - epilogue - - blr - - .align 2 -;# r3 short *input -;# r4 short *output -;# r5 int pitch -vp8_short_fdct8x4_ppc: - prologue - - vspltisw v7, 14 ;# == 14, fits in 5 signed bits - addi r8, r1, 0 - addi r10, r3, 0 - - lwz r0, 0(r3) - two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13 - - lwzux r0, r3, r5 - two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33 - - lvx v6, r6, r9 ;# v6 = Vround - vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter - - two_rows_vert v0, v1 - stvx v8, 0, r4 - two_rows_vert v2, v3 - stvx v8, r6, r4 - - ;# Next block - addi r3, r10, 8 - addi r4, r4, 32 - lvx v6, 0, r9 ;# v6 = Hround - - vspltisw v7, 14 ;# == 14, fits in 5 signed bits - addi r8, r1, 0 - - lwz r0, 0(r3) - two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13 - - lwzux r0, r3, r5 - two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33 - - lvx v6, r6, r9 ;# v6 = Vround - vspltisw v7, -16 ;# == 16 
== -16, only low 5 bits matter - - two_rows_vert v0, v1 - stvx v8, 0, r4 - two_rows_vert v2, v3 - stvx v8, r6, r4 - - epilogue - - blr - - .data - .align 4 -ppc_dctperm_tab: - .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 - .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15 - - .align 4 -dct_tab: - .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274 - .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540 - - .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540 - .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274 - - .align 4 -round_tab: - .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1)) - .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1)) diff --git a/vp8/encoder/ppc/rdopt_altivec.asm b/vp8/encoder/ppc/rdopt_altivec.asm deleted file mode 100644 index ba4823009..000000000 --- a/vp8/encoder/ppc/rdopt_altivec.asm +++ /dev/null @@ -1,51 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - .globl vp8_block_error_ppc - - .align 2 -;# r3 short *Coeff -;# r4 short *dqcoeff -vp8_block_error_ppc: - mfspr r11, 256 ;# get old VRSAVE - oris r12, r11, 0xf800 - mtspr 256, r12 ;# set VRSAVE - - stwu r1,-32(r1) ;# create space on the stack - - stw r5, 12(r1) ;# transfer dc to vector register - - lvx v0, 0, r3 ;# Coeff - lvx v1, 0, r4 ;# dqcoeff - - li r10, 16 - - vspltisw v3, 0 - - vsubshs v0, v0, v1 - - vmsumshm v2, v0, v0, v3 ;# multiply differences - - lvx v0, r10, r3 ;# Coeff - lvx v1, r10, r4 ;# dqcoeff - - vsubshs v0, v0, v1 - - vmsumshm v1, v0, v0, v2 ;# multiply differences - vsumsws v1, v1, v3 ;# sum up - - stvx v1, 0, r1 - lwz r3, 12(r1) ;# return value - - addi r1, r1, 32 ;# recover stack - mtspr 256, r11 ;# reset old VRSAVE - - blr diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 29da926ce..582c8bc6c 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1661,7 +1661,6 @@ void vp8_mv_pred mv.as_mv.row = mvx[vcnt/2]; mv.as_mv.col = mvy[vcnt/2]; - find = 1; /* sr is set to 0 to allow calling function to decide the search * range. 
*/ @@ -2293,7 +2292,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, mode_mv[NEWMV].as_int = d->bmi.mv.as_int; /* Further step/diamond searches as necessary */ - n = 0; further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; n = num00; @@ -2560,8 +2558,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, intra_rd_penalty, cpi, x); if (this_rd < best_mode.rd || x->skip) { - /* Note index of best mode so far */ - best_mode_index = mode_index; *returnrate = rd.rate2; *returndistortion = rd.distortion2; update_best_mode(&best_mode, this_rd, &rd, other_cost, x); diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c index 291d21992..f56e64672 100644 --- a/vp8/encoder/x86/quantize_sse2.c +++ b/vp8/encoder/x86/quantize_sse2.c @@ -35,7 +35,7 @@ void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) { char eob = 0; - short *zbin_boost_ptr = b->zrun_zbin_boost; + short *zbin_boost_ptr; short *qcoeff_ptr = d->qcoeff; DECLARE_ALIGNED_ARRAY(16, short, x, 16); DECLARE_ALIGNED_ARRAY(16, short, y, 16); diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 1be358e87..c496299ce 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -285,10 +285,10 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col) { - uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; + uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer, + src->v_buffer}; + const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride, + src->uv_stride}; int i; for (i = 0; i < MAX_MB_PLANE; ++i) { @@ -304,11 +304,10 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, const struct scale_factors *sf) { if (src != NULL) { int i; - uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; - + uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer, + src->v_buffer}; + const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride, + src->uv_stride}; for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblockd_plane *const pd = &xd->plane[i]; setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index b1a33f5a0..d0a03ce98 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -499,7 +499,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_d153_predictor_4x4/; add_proto qw/void vp9_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_v_predictor_4x4 neon/, "$sse_x86inc"; + specialize qw/vp9_highbd_v_predictor_4x4/, "$sse_x86inc"; add_proto qw/void vp9_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vp9_highbd_tm_predictor_4x4/, "$sse_x86inc"; @@ -577,7 +577,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_d153_predictor_16x16/; add_proto qw/void vp9_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const 
uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_v_predictor_16x16 neon/, "$sse2_x86inc"; + specialize qw/vp9_highbd_v_predictor_16x16/, "$sse2_x86inc"; add_proto qw/void vp9_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vp9_highbd_tm_predictor_16x16/, "$sse2_x86_64"; @@ -1028,16 +1028,20 @@ add_proto qw/void vp9_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, con specialize qw/vp9_sad32x32x8/; add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad16x16x8 sse4/; +specialize qw/vp9_sad16x16x8 sse4_1/; +$vp9_sad16x16x8_sse4_1=vp9_sad16x16x8_sse4; add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad16x8x8 sse4/; +specialize qw/vp9_sad16x8x8 sse4_1/; +$vp9_sad16x8x8_sse4_1=vp9_sad16x8x8_sse4; add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad8x16x8 sse4/; +specialize qw/vp9_sad8x16x8 sse4_1/; +$vp9_sad8x16x8_sse4_1=vp9_sad8x16x8_sse4; add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad8x8x8 sse4/; +specialize qw/vp9_sad8x8x8 sse4_1/; +$vp9_sad8x8x8_sse4_1=vp9_sad8x8x8_sse4; add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; specialize qw/vp9_sad8x4x8/; @@ -1046,7 +1050,8 @@ add_proto qw/void vp9_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vp9_sad4x8x8/; add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vp9_sad4x4x8 sse4/; +specialize qw/vp9_sad4x4x8 sse4_1/; +$vp9_sad4x4x8_sse4_1=vp9_sad4x4x8_sse4; add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"; specialize qw/vp9_sad64x64x4d sse2 avx2 neon/; diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 8840750fd..fcf97d320 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1926,7 +1926,7 @@ void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd, // pixels of each superblock row can be changed by next superblock row. if (pbi->frame_parallel_decode) vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, - MAX(0, (y1 + 7) << (plane == 0 ? 0 : 1))); + MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1)); // Skip border extension if block is inside the frame. if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || @@ -1982,7 +1982,7 @@ void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd, // pixels of each superblock row can be changed by next superblock row. if (pbi->frame_parallel_decode) vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf, - MAX(0, (y1 + 7) << (plane == 0 ? 0 : 1))); + MAX(0, (y1 + 7)) << (plane == 0 ? 
0 : 1)); } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c index be2e6cd15..96a63bd9e 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/decoder/vp9_dthread.c @@ -155,6 +155,10 @@ void vp9_frameworker_copy_context(VP9Worker *const dst_worker, dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync; vp9_frameworker_unlock_stats(src_worker); + dst_cm->bit_depth = src_cm->bit_depth; +#if CONFIG_VP9_HIGHBITDEPTH + dst_cm->use_highbitdepth = src_cm->use_highbitdepth; +#endif dst_cm->prev_frame = src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame; dst_cm->last_width = !src_cm->show_existing_frame ? diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 35c609d4f..7149157a7 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -474,7 +474,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, } -void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) { +void vp9_set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { SPEED_FEATURES *const sf = &cpi->sf; if (sf->partition_search_type != VAR_BASED_PARTITION && sf->partition_search_type != REFERENCE_PARTITION) { @@ -491,26 +491,107 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) { // If 4x4 partition is not used, then 8x8 partition will be selected // if variance of 16x16 block is very high, so use larger threshold // for 16x16 (threshold_bsize_min) in that case. + + // Array index: 0 - threshold_64x64; 1 - threshold_32x32; + // 2 - threshold_16x16; 3 - vbp_threshold_8x8; if (is_key_frame) { - cpi->vbp_threshold_64x64 = threshold_base; - cpi->vbp_threshold_32x32 = threshold_base >> 2; - cpi->vbp_threshold_16x16 = threshold_base >> 2; - cpi->vbp_threshold_8x8 = threshold_base << 2; + thresholds[0] = threshold_base; + thresholds[1] = threshold_base >> 2; + thresholds[2] = threshold_base >> 2; + thresholds[3] = threshold_base << 2; cpi->vbp_bsize_min = BLOCK_8X8; } else { - cpi->vbp_threshold_32x32 = threshold_base; + thresholds[1] = threshold_base; if (cm->width <= 352 && cm->height <= 288) { - cpi->vbp_threshold_64x64 = threshold_base >> 2; - cpi->vbp_threshold_16x16 = threshold_base << 3; + thresholds[0] = threshold_base >> 2; + thresholds[2] = threshold_base << 3; } else { - cpi->vbp_threshold_64x64 = threshold_base; - cpi->vbp_threshold_16x16 = threshold_base << cpi->oxcf.speed; + thresholds[0] = threshold_base; + thresholds[2] = threshold_base << cpi->oxcf.speed; } cpi->vbp_bsize_min = BLOCK_16X16; } } } +static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d, + int dp, int x8_idx, int y8_idx, v8x8 *vst, +#if CONFIG_VP9_HIGHBITDEPTH + int highbd_flag, +#endif + int pixels_wide, + int pixels_high, + int is_key_frame) { + int k; + for (k = 0; k < 4; k++) { + int x4_idx = x8_idx + ((k & 1) << 2); + int y4_idx = y8_idx + ((k >> 1) << 2); + unsigned int sse = 0; + int sum = 0; + if (x4_idx < pixels_wide && y4_idx < pixels_high) { + int s_avg; + int d_avg = 128; +#if CONFIG_VP9_HIGHBITDEPTH + if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { + s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) + d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp); + } else { + s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) + d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp); + } +#else + s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) + d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp); 
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index be2e6cd15..96a63bd9e 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -155,6 +155,10 @@ void vp9_frameworker_copy_context(VP9Worker *const dst_worker,
   dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
   vp9_frameworker_unlock_stats(src_worker);

+  dst_cm->bit_depth = src_cm->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+  dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+#endif
   dst_cm->prev_frame = src_cm->show_existing_frame ?
                        src_cm->prev_frame : src_cm->cur_frame;
   dst_cm->last_width = !src_cm->show_existing_frame ?
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 35c609d4f..7149157a7 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -474,7 +474,7 @@ static int set_vt_partitioning(VP9_COMP *cpi,
 }

-void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
+void vp9_set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
   SPEED_FEATURES *const sf = &cpi->sf;
   if (sf->partition_search_type != VAR_BASED_PARTITION &&
       sf->partition_search_type != REFERENCE_PARTITION) {
@@ -491,26 +491,107 @@ void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
     // If 4x4 partition is not used, then 8x8 partition will be selected
     // if variance of 16x16 block is very high, so use larger threshold
     // for 16x16 (threshold_bsize_min) in that case.
+
+    // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
+    // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
     if (is_key_frame) {
-      cpi->vbp_threshold_64x64 = threshold_base;
-      cpi->vbp_threshold_32x32 = threshold_base >> 2;
-      cpi->vbp_threshold_16x16 = threshold_base >> 2;
-      cpi->vbp_threshold_8x8 = threshold_base << 2;
+      thresholds[0] = threshold_base;
+      thresholds[1] = threshold_base >> 2;
+      thresholds[2] = threshold_base >> 2;
+      thresholds[3] = threshold_base << 2;
       cpi->vbp_bsize_min = BLOCK_8X8;
     } else {
-      cpi->vbp_threshold_32x32 = threshold_base;
+      thresholds[1] = threshold_base;
       if (cm->width <= 352 && cm->height <= 288) {
-        cpi->vbp_threshold_64x64 = threshold_base >> 2;
-        cpi->vbp_threshold_16x16 = threshold_base << 3;
+        thresholds[0] = threshold_base >> 2;
+        thresholds[2] = threshold_base << 3;
       } else {
-        cpi->vbp_threshold_64x64 = threshold_base;
-        cpi->vbp_threshold_16x16 = threshold_base << cpi->oxcf.speed;
+        thresholds[0] = threshold_base;
+        thresholds[2] = threshold_base << cpi->oxcf.speed;
       }
       cpi->vbp_bsize_min = BLOCK_16X16;
     }
   }
 }

+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 int highbd_flag,
+#endif
+                                 int pixels_wide,
+                                 int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    int x4_idx = x8_idx + ((k & 1) << 2);
+    int y4_idx = y8_idx + ((k >> 1) << 2);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        if (!is_key_frame)
+          d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      } else {
+        s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        if (!is_key_frame)
+          d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+      }
+#else
+      s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+      if (!is_key_frame)
+        d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+#endif
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                 int highbd_flag,
+#endif
+                                 int pixels_wide,
+                                 int pixels_high,
+                                 int is_key_frame) {
+  int k;
+  for (k = 0; k < 4; k++) {
+    int x8_idx = x16_idx + ((k & 1) << 3);
+    int y8_idx = y16_idx + ((k >> 1) << 3);
+    unsigned int sse = 0;
+    int sum = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      int s_avg;
+      int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        if (!is_key_frame)
+          d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+      } else {
+        s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        if (!is_key_frame)
+          d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+      }
+#else
+      s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      if (!is_key_frame)
+        d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+#endif
+      sum = s_avg - d_avg;
+      sse = sum * sum;
+    }
+    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+  }
+}
+
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
 static void choose_partitioning(VP9_COMP *cpi,
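Note: these two helpers factor out the per-8x8 and per-4x4 sampling loops that previously sat inline in choose_partitioning() (removed in the hunks below). Each computes one average per sub-block from the source s and, on non-key frames, the reconstructed last frame d, then records a single (sum, sse) sample, so the leaf variance is zero by construction, as the deleted comments note; real variance only appears once fill_variance_tree() combines the four children. For orientation, the averaging kernel they call reduces a block to its rounded mean, roughly as follows (a from-memory sketch of the C version, not a verbatim copy):

    #include <stdint.h>

    /* Sketch of vp9_avg_8x8_c: mean of an 8x8 pixel block with stride p,
     * rounded to nearest (the 64-sample sum is offset by 32 before the
     * divide-by-64 shift). */
    static unsigned int avg_8x8(const uint8_t *s, int p) {
      int i, j, sum = 0;
      for (i = 0; i < 8; ++i, s += p)
        for (j = 0; j < 8; ++j)
          sum += s[j];
      return (unsigned int)((sum + 32) >> 6);
    }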
@@ -528,6 +609,8 @@ static void choose_partitioning(VP9_COMP *cpi,
   int sp;
   int dp;
   int pixels_wide = 64, pixels_high = 64;
+  int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+      cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};

   // Always use 4x4 partition for key frame.
   const int is_key_frame = (cm->frame_type == KEY_FRAME);
@@ -540,6 +623,11 @@ static void choose_partitioning(VP9_COMP *cpi,
     const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
                                                     cm->last_frame_seg_map;
     segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+
+    if (cyclic_refresh_segment_id_boosted(segment_id)) {
+      int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+      vp9_set_vbp_thresholds(cpi, thresholds, q);
+    }
   }

   set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
@@ -647,33 +735,13 @@ static void choose_partitioning(VP9_COMP *cpi,
         v16x16 *vst = &vt.split[i].split[j];
         variance4x4downsample[i2 + j] = 0;
         if (!is_key_frame) {
-          for (k = 0; k < 4; k++) {
-            int x8_idx = x16_idx + ((k & 1) << 3);
-            int y8_idx = y16_idx + ((k >> 1) << 3);
-            unsigned int sse = 0;
-            int sum = 0;
-            if (x8_idx < pixels_wide && y8_idx < pixels_high) {
-              int s_avg, d_avg;
+          fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
 #if CONFIG_VP9_HIGHBITDEPTH
-              if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-                s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-                d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-              } else {
-                s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-                d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-              }
-#else
-              s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-              d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+                               xd->cur_buf->flags,
 #endif
-              sum = s_avg - d_avg;
-              sse = sum * sum;
-            }
-            // If variance is based on 8x8 downsampling, we stop here and have
-            // one sample for 8x8 block (so use 1 for count in fill_variance),
-            // which of course means variance = 0 for 8x8 block.
-            fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
-          }
+                               pixels_wide,
+                               pixels_high,
+                               is_key_frame);
           fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
           // For low-resolution, compute the variance based on 8x8 down-sampling,
           // and if it is large (above the threshold) we go down for 4x4.
@@ -683,7 +751,7 @@ static void choose_partitioning(VP9_COMP *cpi,
         }
         if (is_key_frame || (low_res &&
             vt.split[i].split[j].part_variances.none.variance >
-            (cpi->vbp_threshold_32x32 << 1))) {
+            (thresholds[1] << 1))) {
           // Go down to 4x4 down-sampling for variance.
           variance4x4downsample[i2 + j] = 1;
           for (k = 0; k < 4; k++) {
@@ -691,47 +759,18 @@ static void choose_partitioning(VP9_COMP *cpi,
             int y8_idx = y16_idx + ((k >> 1) << 3);
             v8x8 *vst2 = is_key_frame ? &vst->split[k] :
                 &vt2[i2 + j].split[k];
-            for (m = 0; m < 4; m++) {
-              int x4_idx = x8_idx + ((m & 1) << 2);
-              int y4_idx = y8_idx + ((m >> 1) << 2);
-              unsigned int sse = 0;
-              int sum = 0;
-              if (x4_idx < pixels_wide && y4_idx < pixels_high) {
-                int d_avg = 128;
+            fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
 #if CONFIG_VP9_HIGHBITDEPTH
-                int s_avg;
-                if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-                  s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-                  if (cm->frame_type != KEY_FRAME)
-                    d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
-                } else {
-                  s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-                  if (cm->frame_type != KEY_FRAME)
-                    d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
-                }
-#else
-                int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-                if (!is_key_frame)
-                  d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+                                 xd->cur_buf->flags,
 #endif
-                sum = s_avg - d_avg;
-                sse = sum * sum;
-              }
-              // If variance is based on 4x4 down-sampling, we stop here and have
-              // one sample for 4x4 block (so use 1 for count in fill_variance),
-              // which of course means variance = 0 for 4x4 block.
-              fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none);
-            }
+                                 pixels_wide,
+                                 pixels_high,
+                                 is_key_frame);
           }
         }
       }
     }

-  // No 64x64 blocks on segments other than base (un-boosted) segment,
-  // so force split.
-  if (cyclic_refresh_segment_id_boosted(segment_id))
-    force_split[0] = 1;
-
   // Fill the rest of the variance tree by summing split partition values.
   for (i = 0; i < 4; i++) {
     const int i2 = i << 2;
@@ -748,7 +787,7 @@ static void choose_partitioning(VP9_COMP *cpi,
     // If variance of this 32x32 block is above the threshold, force the block
     // to split. This also forces a split on the upper (64x64) level.
     get_variance(&vt.split[i].part_variances.none);
-    if (vt.split[i].part_variances.none.variance > cpi->vbp_threshold_32x32) {
+    if (vt.split[i].part_variances.none.variance > thresholds[1]) {
       force_split[i + 1] = 1;
       force_split[0] = 1;
     }
@@ -760,16 +799,15 @@ static void choose_partitioning(VP9_COMP *cpi,
   // we get to one that's got a variance lower than our threshold.
   if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
       !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
-                           cpi->vbp_threshold_64x64, BLOCK_16X16,
-                           force_split[0])) {
+                           thresholds[0], BLOCK_16X16, force_split[0])) {
     for (i = 0; i < 4; ++i) {
       const int x32_idx = ((i & 1) << 2);
       const int y32_idx = ((i >> 1) << 2);
       const int i2 = i << 2;
       if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32,
                                (mi_row + y32_idx), (mi_col + x32_idx),
-                               cpi->vbp_threshold_32x32,
-                               BLOCK_16X16, force_split[i + 1])) {
+                               thresholds[1], BLOCK_16X16,
+                               force_split[i + 1])) {
         for (j = 0; j < 4; ++j) {
           const int x16_idx = ((j & 1) << 1);
           const int y16_idx = ((j >> 1) << 1);
@@ -782,8 +820,7 @@ static void choose_partitioning(VP9_COMP *cpi,
           if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
                                    mi_row + y32_idx + y16_idx,
                                    mi_col + x32_idx + x16_idx,
-                                   cpi->vbp_threshold_16x16,
-                                   cpi->vbp_bsize_min, 0)) {
+                                   thresholds[2], cpi->vbp_bsize_min, 0)) {
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
@@ -792,8 +829,7 @@ static void choose_partitioning(VP9_COMP *cpi,
                                        BLOCK_8X8,
                                        mi_row + y32_idx + y16_idx + y8_idx,
                                        mi_col + x32_idx + x16_idx + x8_idx,
-                                       cpi->vbp_threshold_8x8,
-                                       BLOCK_8X8, 0)) {
+                                       thresholds[3], BLOCK_8X8, 0)) {
                 set_block_size(cpi, xd,
                                (mi_row + y32_idx + y16_idx + y8_idx),
                                (mi_col + x32_idx + x16_idx + x8_idx),
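Note: taken together, the vp9_encodeframe.c hunks replace the four per-size threshold fields with an indexed array (0: 64x64, 1: 32x32, 2: 16x16, 3: 8x8) and give choose_partitioning() a local copy, so a superblock in a cyclic-refresh boosted segment can recompute its thresholds at the segment's q instead of being unconditionally force-split, which is what the deleted force_split[0] = 1 block used to do. A self-contained sketch of that selection flow (the struct and the threshold model are stand-ins, not libvpx code):

    #include <stdint.h>
    #include <string.h>

    /* Hypothetical stand-ins; in libvpx these are VP9_COMP fields and
     * the cyclic-refresh / quantizer helpers. */
    typedef struct { int64_t vbp_thresholds[4]; } enc_ctx;

    static int segment_is_boosted(int segment_id) { return segment_id == 1; }

    static void set_thresholds_for_q(int64_t thresholds[4], int q) {
      const int64_t base = (int64_t)q * q;  /* placeholder for the real model */
      thresholds[0] = base;                 /* 64x64 */
      thresholds[1] = base;                 /* 32x32 */
      thresholds[2] = base << 1;            /* 16x16 */
      thresholds[3] = base << 2;            /* 8x8   */
    }

    /* Per superblock: start from the frame-level thresholds, but
     * recompute them at the segment's q when the segment is boosted. */
    static void thresholds_for_sb(const enc_ctx *ctx, int segment_id,
                                  int segment_q, int64_t out[4]) {
      memcpy(out, ctx->vbp_thresholds, 4 * sizeof(*out));
      if (segment_is_boosted(segment_id))
        set_thresholds_for_q(out, segment_q);
    }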
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 8d545b671..102713029 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -12,6 +12,8 @@
 #ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_
 #define VP9_ENCODER_VP9_ENCODEFRAME_H_

+#include "vpx/vpx_integer.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -38,7 +40,7 @@ void vp9_init_tile_data(struct VP9_COMP *cpi);
 void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
                      int tile_row, int tile_col);

-void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q);
+void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int64_t thresholds[], int q);

 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 59788eb82..82f99b3e5 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2958,7 +2958,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi) {
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);

   vp9_set_quantizer(cm, q);
-  vp9_set_vbp_thresholds(cpi, q);
+  vp9_set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);

   setup_frame(cpi);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index ee231f7fd..a5342ade3 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -460,10 +460,9 @@ typedef struct VP9_COMP {
   int resize_pending;

   // VAR_BASED_PARTITION thresholds
-  int64_t vbp_threshold_64x64;
-  int64_t vbp_threshold_32x32;
-  int64_t vbp_threshold_16x16;
-  int64_t vbp_threshold_8x8;
+  // 0 - threshold_64x64; 1 - threshold_32x32;
+  // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+  int64_t vbp_thresholds[4];
   BLOCK_SIZE vbp_bsize_min;

   // Multi-threading
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index d754fd2b1..94ea4cd2d 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -378,16 +378,18 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
       struct macroblockd_plane *const pd = &xd->plane[i];
       const TX_SIZE uv_tx_size = get_uv_tx_size(&xd->mi[0].src_mi->mbmi, pd);
       const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
-      const int sf = (bw - b_width_log2_lookup[unit_size]) +
-                     (bh - b_height_log2_lookup[unit_size]);
-      const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+      const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+      const int uv_bw = b_width_log2_lookup[uv_bsize];
+      const int uv_bh = b_height_log2_lookup[uv_bsize];
+      const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
+                     (uv_bh - b_height_log2_lookup[unit_size]);
       const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
       const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
       int j = i - 1;

       vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
-      var_uv[j] = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
-                                     pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+      var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p->src.buf, p->src.stride,
+                                           pd->dst.buf, pd->dst.stride, &sse_uv[j]);

       if (var_uv[j] < uv_ac_thr || var_uv[j] == 0) {
         if (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])
@@ -1327,8 +1329,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);

       // For large partition blocks, extra testing is done.
-      if (bsize > BLOCK_32X32 && xd->mi[0].src_mi->mbmi.segment_id != 1 &&
-          cm->base_qindex) {
+      if (bsize > BLOCK_32X32 &&
+          !cyclic_refresh_segment_id_boosted(xd->mi[0].src_mi->mbmi.segment_id) &&
+          cm->base_qindex) {
         model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
                                 &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
                                 &this_early_term);
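Note: in model_rd_for_sb_y_large() the old code built the scale factor sf from the luma log2 dimensions bw/bh while measuring a chroma-sized block; deriving uv_bw/uv_bh from the actual chroma block size makes sf, and with it the dequant-based DC/AC skip thresholds, consistent with the variance the cpi->fn_ptr[uv_bsize].vf call measures. The vp9_pick_inter_mode() hunk similarly replaces the hard-coded segment_id != 1 test with the cyclic-refresh helper. For 4:2:0 the sf mismatch inflated the thresholds by a factor of four, as this standalone sketch shows (the lookup tables are illustrative stand-ins for b_width_log2_lookup and friends):

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-in log2 dimensions: index is a block-size enum, value is the
     * log2 of the block width or height in 4-pixel units. */
    enum { BLK_8X8, BLK_16X16, BLK_32X32 };
    static const int wlog2[] = {1, 2, 3};
    static const int hlog2[] = {1, 2, 3};

    int main(void) {
      const int unit = BLK_8X8;      /* transform unit size             */
      const int luma = BLK_32X32;    /* luma block                      */
      const int chroma = BLK_16X16;  /* 4:2:0 chroma: half on each side */
      const int dequant_dc = 32;     /* illustrative dequant value      */

      /* sf counts (in log2) how many transform units tile the block. */
      const int sf_wrong = (wlog2[luma] - wlog2[unit]) +
                           (hlog2[luma] - hlog2[unit]);
      const int sf_right = (wlog2[chroma] - wlog2[unit]) +
                           (hlog2[chroma] - hlog2[unit]);

      /* The skip thresholds scale with sf; using the luma size made them
       * 4x too large for 4:2:0 chroma (too eager to skip). */
      printf("thr(luma sf)=%u thr(chroma sf)=%u\n",
             (unsigned)(dequant_dc * dequant_dc) >> (6 - sf_wrong),
             (unsigned)(dequant_dc * dequant_dc) >> (6 - sf_right));
      return 0;
    }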
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm
index 318379777..56373e897 100644
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -78,7 +78,7 @@ cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
 ;                            intptr_t block_size)

 INIT_XMM sse2
-cglobal block_error_fp, 3, 3, 8, uqc, dqc, size
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
   pxor      m4, m4                 ; sse accumulator
   pxor      m5, m5                 ; dedicated zero register
   lea       uqcq, [uqcq+sizeq*2]
@@ -96,13 +96,13 @@ cglobal block_error_fp, 3, 3, 8, uqc, dqc, size
   pmaddwd   m0, m0
   pmaddwd   m1, m1
   ; accumulate in 64bit
-  punpckldq m7, m0, m5
+  punpckldq m3, m0, m5
   punpckhdq m0, m5
-  paddq     m4, m7
-  punpckldq m7, m1, m5
+  paddq     m4, m3
+  punpckldq m3, m1, m5
   paddq     m4, m0
   punpckhdq m1, m5
-  paddq     m4, m7
+  paddq     m4, m3
   paddq     m4, m1
   add       sizeq, mmsize
   jl .loop
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index c85498372..4ed34d05c 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -141,8 +141,10 @@ VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_8_neon_asm$(A
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)

+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+endif
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c

 # neon with assembly and intrinsics implementations. If both are available
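Note: the assembly hunk is register hygiene rather than a behavioral change: the 64-bit widening scratch moves from m7 to m3, so block_error_fp now touches only xmm0 through xmm5 and the cglobal line can declare 6 XMM registers instead of 8 (on Win64 that also drops the save/restore of xmm6/xmm7). The vp9_common.mk hunk, like the vp9cx.mk one below, simply fences NEON files that lack high-bit-depth paths out of CONFIG_VP9_HIGHBITDEPTH builds so the C fallbacks are used instead. What the asm loop accumulates is the coefficient-domain sum of squared errors; a plain-C reference with the same contract (a sketch, argument names taken from the asm):

    #include <stdint.h>
    #include <stddef.h>

    /* Plain-C reference for what the SSE2 loop computes: the sum of
     * squared differences between quantized and dequantized coefficients,
     * widened into a 64-bit total (hence the punpck/paddq pairs above). */
    static int64_t block_error_fp_ref(const int16_t *uqc, const int16_t *dqc,
                                      intptr_t block_size) {
      int64_t sse = 0;
      intptr_t i;
      for (i = 0; i < block_size; ++i) {
        const int diff = uqc[i] - dqc[i];
        sse += (int64_t)diff * diff;
      }
      return sse;
    }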
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 2b3f8945a..f8da734de 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -152,12 +152,14 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c

-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+endif
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c

 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/webmdec.cc b/webmdec.cc
index 60b01ad4b..e152f5ee0 100644
--- a/webmdec.cc
+++ b/webmdec.cc
@@ -219,6 +219,7 @@ int webm_guess_framerate(struct WebmInputContext *webm_ctx,
   webm_ctx->block_entry = NULL;
   webm_ctx->block_frame_index = 0;
   webm_ctx->timestamp_ns = 0;
+  webm_ctx->reached_eos = 0;

   return 0;
 }
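Note: webm_guess_framerate() scans ahead through the file to estimate the frame rate and then, as the surrounding resets suggest, rewinds the parser state. If the probe ran all the way to end of stream, a stale reached_eos flag would make the first real read report EOS immediately, so it has to be cleared along with the other cursor fields. A schematic of the probe-then-rewind pattern (hypothetical reader type, not the libwebm API):

    /* Hypothetical parser state mirroring WebmInputContext's cursor fields. */
    struct reader_state {
      long block_frame_index;
      long timestamp_ns;
      int reached_eos;
    };

    /* Probe-then-rewind: any flag the probe can set must be reset here,
     * or the next read inherits a phantom end-of-stream. */
    static void rewind_after_probe(struct reader_state *rs) {
      rs->block_frame_index = 0;
      rs->timestamp_ns = 0;
      rs->reached_eos = 0;  /* the bug fixed by the webmdec.cc hunk above */
    }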