116 files changed, 3410 insertions, 2137 deletions
diff --git a/.clang-format b/.clang-format index c1483199e..e76a526e4 100644 --- a/.clang-format +++ b/.clang-format @@ -1,12 +1,12 @@ --- Language: Cpp # BasedOnStyle: Google -# Generated with clang-format 4.0.1 +# Generated with clang-format 5.0.0 AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false -AlignEscapedNewlinesLeft: true +AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: true AllowAllParametersOfDeclarationOnNextLine: true @@ -33,14 +33,20 @@ BraceWrapping: BeforeCatch: false BeforeElse: false IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true BreakBeforeBinaryOperators: None BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon BreakAfterJavaFieldAnnotations: false BreakStringLiterals: true ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 @@ -48,7 +54,11 @@ Cpp11BracedListStyle: false DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false -ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH IncludeCategories: - Regex: '^<.*\.h>' Priority: 1 @@ -70,6 +80,7 @@ NamespaceIndentation: None ObjCBlockIndentWidth: 2 ObjCSpaceAfterProperty: false ObjCSpaceBeforeProtocolList: false +PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 @@ -79,6 +90,7 @@ PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Right ReflowComments: true SortIncludes: false +SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true @@ -3,6 +3,7 @@ Aℓex Converse <aconverse@google.com> Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com> Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com> Alpha Lam <hclam@google.com> <hclam@chromium.org> +Chris Cunningham <chcunningham@chromium.org> Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com> Deb Mukherjee <debargha@google.com> Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com> @@ -21,18 +22,21 @@ Marco Paniconi <marpan@google.com> Marco Paniconi <marpan@google.com> <marpan@chromium.org> Pascal Massimino <pascal.massimino@gmail.com> Paul Wilkins <paulwilkins@google.com> +Peter Boström <pbos@chromium.org> <pbos@google.com> Peter de Rivaz <peter.derivaz@gmail.com> Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com> Ralph Giles <giles@xiph.org> <giles@entropywave.com> Ralph Giles <giles@xiph.org> <giles@mozilla.com> Ronald S. Bultje <rsbultje@gmail.com> <rbultje@google.com> Sami Pietilä <samipietila@google.com> +Shiyou Yin <yinshiyou-hf@loongson.cn> Tamar Levy <tamar.levy@intel.com> Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com> Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com> Timothy B. 
Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com> Tom Finegan <tomfinegan@google.com> Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org> +Urvang Joshi <urvang@google.com> <urvang@chromium.org> Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com> Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <Yaowu Xu> @@ -3,13 +3,13 @@ Aaron Watry <awatry@gmail.com> Abo Talib Mahfoodh <ab.mahfoodh@gmail.com> -Adam Xu <adam@xuyaowu.com> Adrian Grange <agrange@google.com> Aℓex Converse <aconverse@google.com> Ahmad Sharif <asharif@google.com> Aleksey Vasenev <margtu-fivt@ya.ru> Alexander Potapenko <glider@google.com> Alexander Voronov <avoronov@graphics.cs.msu.ru> +Alexandra Hájková <alexandra.khirnova@gmail.com> Alexis Ballier <aballier@gentoo.org> Alok Ahuja <waveletcoeff@gmail.com> Alpha Lam <hclam@google.com> @@ -17,6 +17,7 @@ A.Mahfoodh <ab.mahfoodh@gmail.com> Ami Fischman <fischman@chromium.org> Andoni Morales Alastruey <ylatuya@gmail.com> Andres Mejia <mcitadel@gmail.com> +Andrew Lewis <andrewlewis@google.com> Andrew Russell <anrussell@google.com> Angie Chiang <angiebird@google.com> Aron Rosenberg <arosenberg@logitech.com> @@ -24,7 +25,9 @@ Attila Nagy <attilanagy@google.com> Brion Vibber <bvibber@wikimedia.org> changjun.yang <changjun.yang@intel.com> Charles 'Buck' Krasic <ckrasic@google.com> +Cheng Chen <chengchen@google.com> chm <chm@rock-chips.com> +Chris Cunningham <chcunningham@chromium.org> Christian Duvivier <cduvivier@google.com> Daniele Castagna <dcastagna@chromium.org> Daniel Kang <ddkang@google.com> @@ -46,10 +49,12 @@ Geza Lore <gezalore@gmail.com> Ghislain MARY <ghislainmary2@gmail.com> Giuseppe Scrivano <gscrivano@gnu.org> Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com> +Gregor Jasny <gjasny@gmail.com> Guillaume Martres <gmartres@google.com> Guillermo Ballester Valor <gbvalor@gmail.com> Hangyu Kuang <hkuang@google.com> Hanno Böck <hanno@hboeck.de> +Han Shen <shenhan@google.com> Henrik Lundin <hlundin@google.com> Hui Su <huisu@google.com> Ivan Krasin <krasin@chromium.org> @@ -83,6 +88,7 @@ Justin Clift <justin@salasaga.org> Justin Lebar <justin.lebar@gmail.com> Kaustubh Raste <kaustubh.raste@imgtec.com> KO Myung-Hun <komh@chollian.net> +Kyle Siefring <kylesiefring@gmail.com> Lawrence Velázquez <larryv@macports.org> Linfeng Zhang <linfengz@google.com> Lou Quillio <louquillio@google.com> @@ -101,6 +107,7 @@ Mikhal Shemer <mikhal@google.com> Min Chen <chenm003@gmail.com> Minghai Shang <minghai@google.com> Min Ye <yeemmi@google.com> +Moriyoshi Koizumi <mozo@mozo.jp> Morton Jonuschat <yabawock@gmail.com> Nathan E. 
Egge <negge@mozilla.com> Nico Weber <thakis@chromium.org> @@ -111,12 +118,15 @@ Paul Wilkins <paulwilkins@google.com> Pavol Rusnak <stick@gk2.sk> Paweł Hajdan <phajdan@google.com> Pengchong Jin <pengchong@google.com> -Peter Boström <pbos@google.com> +Peter Boström <pbos@chromium.org> +Peter Collingbourne <pcc@chromium.org> Peter de Rivaz <peter.derivaz@gmail.com> Philip Jägenstedt <philipj@opera.com> Priit Laes <plaes@plaes.org> Rafael Ávila de Espíndola <rafael.espindola@gmail.com> Rafaël Carré <funman@videolan.org> +Rafael de Lucena Valle <rafaeldelucena@gmail.com> +Rahul Chaudhry <rahulchaudhry@google.com> Ralph Giles <giles@xiph.org> Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com> Rob Bradford <rob@linux.intel.com> @@ -135,6 +145,7 @@ Shiyou Yin <yinshiyou-hf@loongson.cn> Shunyao Li <shunyaoli@google.com> Stefan Holmer <holmer@google.com> Suman Sunkara <sunkaras@google.com> +Sylvestre Ledru <sylvestre@mozilla.com> Taekhyun Kim <takim@nvidia.com> Takanori MATSUURA <t.matsuu@gmail.com> Tamar Levy <tamar.levy@intel.com> @@ -147,6 +158,7 @@ Tom Finegan <tomfinegan@google.com> Tristan Matthews <le.businessman@gmail.com> Urvang Joshi <urvang@google.com> Vignesh Venkatasubramanian <vigneshv@google.com> +Vlad Tsyrklevich <vtsyrklevich@chromium.org> Yaowu Xu <yaowu@google.com> Yi Luo <luoyi@google.com> Yongzhe Wang <yongzhe@google.com> @@ -1,3 +1,28 @@ +2018-01-04 v1.7.0 "Mandarin Duck" + This release focused on high bit depth performance (10/12 bit) and vp9 + encoding improvements. + + - Upgrading: + This release is ABI incompatible due to new vp9 encoder features. + + Frame parallel decoding for vp9 has been removed. + + - Enhancements: + vp9 encoding supports additional threads with --row-mt. This can be greater + than the number of tiles. + + Two new vp9 encoder options have been added: + --corpus-complexity + --tune-content=film + + Additional tooling for respecting the vp9 "level" profiles has been added. + + - Bug fixes: + A variety of fuzzing issues. + vp8 threading fix for ARM. + Codec control VP9_SET_SKIP_LOOP_FILTER fixed. + Reject invalid multi resolution configurations. + 2017-01-09 v1.6.1 "Long Tailed Duck" This release improves upon the VP9 encoder and speeds up the encoding and decoding processes. @@ -1,4 +1,4 @@ -README - 26 January 2017 +README - 24 January 2018 Welcome to the WebM VP8/VP9 Codec SDK!
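The CHANGELOG items above surface through vpxenc flags (--row-mt, --corpus-complexity, --tune-content=film) and, for the first and last of these, through public codec controls in vpx/vp8cx.h. Below is a minimal sketch of opting into row-based multithreading and the new film tuning, assuming the v1.7.0 headers; the function name and the resolution/thread values are illustrative, not part of this commit:

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

/* Sketch only: VP9E_SET_ROW_MT and VP9E_CONTENT_FILM are the
 * codec-control counterparts of --row-mt and --tune-content=film.
 * The config values here are placeholders. */
static int open_vp9_row_mt_encoder(vpx_codec_ctx_t *codec) {
  vpx_codec_enc_cfg_t cfg;
  if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0)) return -1;
  cfg.g_w = 640;
  cfg.g_h = 480;
  cfg.g_threads = 4; /* with row-mt, threads may exceed the tile count */
  if (vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), &cfg, 0)) return -1;
  if (vpx_codec_control(codec, VP9E_SET_ROW_MT, 1)) return -1; /* row-mt on */
  return vpx_codec_control(codec, VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_FILM);
}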
@@ -63,6 +63,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc + ppc64-linux-gcc + ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc x86-darwin8-gcc diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c index 0987cbfb8..d49a2307d 100644 --- a/examples/vp9_spatial_svc_encoder.c +++ b/examples/vp9_spatial_svc_encoder.c @@ -429,8 +429,9 @@ static void set_rate_control_stats(struct RateControlStats *rc, rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl]; if (tl > 0) { rc->layer_pfb[layer] = - 1000.0 * (cfg->layer_target_bitrate[layer] - - cfg->layer_target_bitrate[layer - 1]) / + 1000.0 * + (cfg->layer_target_bitrate[layer] - + cfg->layer_target_bitrate[layer - 1]) / (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]); } else { rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] / @@ -573,8 +574,8 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, } else { if (is_key_frame) { ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF | - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; } else { ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; @@ -588,14 +589,24 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, } else { ref_frame_config->frame_flags[sl] = VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + if (sl == num_spatial_layers - 1) + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; } } if (tl == 0) { ref_frame_config->lst_fb_idx[sl] = sl; - if (sl) - ref_frame_config->gld_fb_idx[sl] = sl - 1; - else + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { ref_frame_config->gld_fb_idx[sl] = 0; + } ref_frame_config->alt_fb_idx[sl] = 0; } else if (tl == 1) { ref_frame_config->lst_fb_idx[sl] = sl; @@ -738,6 +749,8 @@ int main(int argc, const char **argv) { // the encode for the whole superframe. The encoder will internally loop // over all the spatial layers for the current superframe. vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); + // TODO(jianj): Fix the parameter passing for "is_key_frame" in + // set_frame_flags_bypass_mode() for case of periodic key frames.
set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id, svc_ctx.spatial_layers, frame_cnt == 0, &ref_frame_config); diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index f5736ea45..4c985ba52 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -26,7 +26,9 @@ #include "../tools_common.h" #include "../video_writer.h" -#define VP8_ROI_MAP 0 +#define ROI_MAP 0 + +#define zero(Dest) memset(&Dest, 0, sizeof(Dest)); static const char *exec_name; @@ -99,9 +101,10 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc, for (i = 0; i < cfg->ts_number_layers; ++i) { if (i > 0) { rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i]; - rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] - - rc->layer_target_bitrate[i - 1]) / - (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); + rc->layer_pfb[i] = + 1000.0 * + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / + (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); } rc->layer_input_frames[i] = 0; rc->layer_enc_frames[i] = 0; @@ -164,38 +167,60 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! \n"); } -#if VP8_ROI_MAP -static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) { +#if ROI_MAP +static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg, + vpx_roi_map_t *roi) { unsigned int i, j; - memset(roi, 0, sizeof(*roi)); + int block_size = 0; + uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ? 1 : 0; + uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0; + if (!is_vp8 && !is_vp9) { + die("unsupported codec."); + } + zero(*roi); + + block_size = is_vp9 && !is_vp8 ? 8 : 16; // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for // segment is 16x16 for vp8, 8x8 for vp9. - roi->rows = (cfg->g_h + 15) / 16; - roi->cols = (cfg->g_w + 15) / 16; + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; // Applies delta QP on the segment blocks, varies from -63 to 63. // Setting to negative means lower QP (better quality). // Below we set delta_q to the extreme (-63) to show strong effect. - roi->delta_q[0] = 0; + // VP8 uses the first 4 segments. VP9 uses all 8 segments. + zero(roi->delta_q); roi->delta_q[1] = -63; - roi->delta_q[2] = 0; - roi->delta_q[3] = 0; // Applies delta loopfilter strength on the segment blocks, varies from -63 to - // 63. Setting to positive means stronger loopfilter. - roi->delta_lf[0] = 0; - roi->delta_lf[1] = 0; - roi->delta_lf[2] = 0; - roi->delta_lf[3] = 0; - - // Applies skip encoding threshold on the segment blocks, varies from 0 to - // UINT_MAX. Larger value means more skipping of encoding is possible. - // This skip threshold only applies on delta frames. - roi->static_threshold[0] = 0; - roi->static_threshold[1] = 0; - roi->static_threshold[2] = 0; - roi->static_threshold[3] = 0; + // 63. Setting to positive means stronger loopfilter. VP8 uses the first 4 + // segments. VP9 uses all 8 segments. + zero(roi->delta_lf); + + if (is_vp8) { + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + zero(roi->static_threshold); + } + + if (is_vp9) { + // Apply skip segment. Setting to 1 means this block will be copied from + // previous frame. 
+ zero(roi->skip); + } + + if (is_vp9) { + // Apply ref frame segment. + // -1 : Do not apply this segment. + // 0 : Force using intra. + // 1 : Force using last. + // 2 : Force using golden. + // 3 : Force using altref but not used in non-rd pickmode for 0 lag. + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + } // Use 2 states: 1 is center square, 0 is the rest. roi->roi_map = @@ -563,7 +588,7 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; -#if VP8_ROI_MAP +#if ROI_MAP vpx_roi_map_t roi; #endif vpx_svc_layer_id_t layer_id = { 0, 0 }; @@ -766,8 +791,8 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); -#if VP8_ROI_MAP - vp8_set_roi_map(&cfg, &roi); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) die_codec(&codec, "Failed to set ROI map"); #endif @@ -784,6 +809,12 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); + if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); + vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0); +#endif // TODO(marpan/jianj): There is an issue with row-mt for low resolutions at // high speed settings, disable its use for those cases for now. if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7)) @@ -911,5 +942,8 @@ int main(int argc, char **argv) { for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]); vpx_img_free(&raw); +#if ROI_MAP + free(roi.roi_map); +#endif return EXIT_SUCCESS; } diff --git a/libs.doxy_template b/libs.doxy_template index 5a8f84728..1eacc8fe2 100644 --- a/libs.doxy_template +++ b/libs.doxy_template @@ -943,18 +943,6 @@ GENERATE_XML = NO XML_OUTPUT = xml -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output.
Note that @@ -233,8 +233,8 @@ OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) -SO_VERSION_MAJOR := 4 -SO_VERSION_MINOR := 1 +SO_VERSION_MAJOR := 5 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc index 2fa10192f..08190c90d 100644 --- a/test/blockiness_test.cc +++ b/test/blockiness_test.cc @@ -215,7 +215,7 @@ using std::tr1::make_tuple; #if CONFIG_VP9_ENCODER const BlockinessParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), + make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238) }; INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests)); #endif diff --git a/test/consistency_test.cc b/test/consistency_test.cc index 37b4a45e5..c21751f71 100644 --- a/test/consistency_test.cc +++ b/test/consistency_test.cc @@ -205,7 +205,7 @@ using std::tr1::make_tuple; #if CONFIG_VP9_ENCODER const ConsistencyParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), + make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238) }; INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test, ::testing::ValuesIn(c_vp9_tests)); diff --git a/test/datarate_test.cc b/test/datarate_test.cc index 40eb972c0..65f63d654 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -44,7 +44,7 @@ class DatarateTestLarge denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; gf_boost_ = 0; - use_roi_ = 0; + use_roi_ = false; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -55,11 +55,9 @@ class DatarateTestLarge encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); } -#if CONFIG_VP8_ENCODER - if (use_roi_ == 1) { + if (use_roi_) { encoder->Control(VP8E_SET_ROI_MAP, &roi_); } -#endif if (denoiser_offon_test_) { ASSERT_GT(denoiser_offon_period_, 0) @@ -152,7 +150,7 @@ class DatarateTestLarge int denoiser_offon_period_; int set_cpu_used_; int gf_boost_; - int use_roi_; + bool use_roi_; vpx_roi_map_t roi_; }; @@ -441,7 +439,7 @@ TEST_P(DatarateTestRealTime, RegionOfInterest) { ResetModel(); // Set ROI parameters - use_roi_ = 1; + use_roi_ = true; memset(&roi_, 0, sizeof(roi_)); roi_.rows = (cfg_.g_h + 15) / 16; @@ -539,6 +537,7 @@ class DatarateTestVP9Large denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; frame_parallel_decoding_mode_ = 1; + use_roi_ = false; } // @@ -621,6 +620,10 @@ class DatarateTestVP9Large encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, frame_parallel_decoding_mode_); + if (use_roi_) { + encoder->Control(VP9E_SET_ROI_MAP, &roi_); + } + if (cfg_.ts_number_layers > 1) { if (video->frame() == 0) { encoder->Control(VP9E_SET_SVC, 1); @@ -701,6 +704,8 @@ class DatarateTestVP9Large int denoiser_offon_test_; int denoiser_offon_period_; int frame_parallel_decoding_mode_; + bool use_roi_; + vpx_roi_map_t roi_; }; // Check basic rate targeting for VBR mode with 0 lag. @@ -1073,6 +1078,68 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) { } } +class DatarateTestVP9RealTime : public DatarateTestVP9Large { + public: + virtual ~DatarateTestVP9RealTime() {} +}; + +// Check VP9 region of interest feature. 
+TEST_P(DatarateTestVP9RealTime, RegionOfInterest) { + if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 352; + cfg_.g_h = 288; + + ResetModel(); + + // Set ROI parameters + use_roi_ = true; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 7) / 8; + roi_.cols = (cfg_.g_w + 7) / 8; + + roi_.delta_q[1] = -20; + roi_.delta_lf[1] = -20; + memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame)); + roi_.ref_frame[1] = 1; + + // Use 2 states: 1 is center square, 0 is the rest. + roi_.roi_map = reinterpret_cast<uint8_t *>( + calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map))); + ASSERT_TRUE(roi_.roi_map != NULL); + + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + #if CONFIG_VP9_TEMPORAL_DENOISING class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large { public: @@ -1224,12 +1291,70 @@ class DatarateOnePassCbrSvc base_speed_setting_ = 5; spatial_layer_id_ = 0; temporal_layer_id_ = 0; + update_pattern_ = 0; memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); memset(bits_total_, 0, sizeof(bits_total_)); memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); dynamic_drop_layer_ = false; } virtual void BeginPassHook(unsigned int /*pass*/) {} + + // Example pattern for spatial layers and 2 temporal layers used in the + // bypass/flexible mode. The pattern corresponds to the pattern + // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in + // non-flexible mode, except that we disable inter-layer prediction. 
+ void set_frame_flags_bypass_mode( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + for (int sl = 0; sl < num_spatial_layers; ++sl) { + if (!tl) { + if (!sl) { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF; + } else { + if (is_key_frame) { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_ARF; + } else { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | + VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + } + } + } else if (tl == 1) { + if (!sl) { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | + VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + } else { + ref_frame_config->frame_flags[sl] = + VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_REF_GF; + } + } + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1; + ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; + } + } + } + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { if (video->frame() == 0) { @@ -1255,6 +1380,21 @@ class DatarateOnePassCbrSvc encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); } + if (update_pattern_ && video->frame() >= 100) { + vpx_svc_layer_id_t layer_id; + if (video->frame() == 100) { + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + encoder->Config(&cfg_); + } + // Set layer id since the pattern changed. + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + } + if (dynamic_drop_layer_) { if (video->frame() == 100) { // Change layer bitrates to set top layer to 0. 
This will trigger skip @@ -1351,6 +1491,14 @@ class DatarateOnePassCbrSvc << "Buffer Underrun at frame " << pkt->data.frame.pts; } } + + ASSERT_EQ(pkt->data.frame.width[sl], + top_sl_width_ * svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]); + + ASSERT_EQ(pkt->data.frame.height[sl], + top_sl_height_ * svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]); } } @@ -1393,6 +1541,10 @@ class DatarateOnePassCbrSvc int number_temporal_layers_; int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; bool dynamic_drop_layer_; + unsigned int top_sl_width_; + unsigned int top_sl_height_; + vpx_svc_ref_frame_config_t ref_frame_config; + int update_pattern_; }; static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, const vpx_svc_extra_cfg_t *svc_params, @@ -1486,6 +1638,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) { number_spatial_layers_ = cfg_.ss_number_layers; number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; cfg_.rc_target_bitrate = 500; ResetModel(); tune_content_ = 1; @@ -1527,6 +1681,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. for (int i = 200; i <= 800; i += 200) { @@ -1577,6 +1733,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. // For SVC, noise_sen = 1 means denoising only the top spatial layer @@ -1633,6 +1791,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). 
for (int j = 64; j <= 67; j++) { @@ -1675,6 +1835,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) { number_spatial_layers_ = cfg_.ss_number_layers; number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, @@ -1722,6 +1884,60 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); +#if CONFIG_VP9_DECODER + // Number of temporal layers > 1, so half of the frames in this SVC pattern + // will be non-reference frame and hence encoder will avoid loopfilter. + // Since frame dropper is off, we can expect 200 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one +// generated via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables +// inter-layer prediction. +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL2TLDynamicPatternChange) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 3; + cfg_.ts_number_layers = 2; + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 2; + svc_params_.scaling_factor_num[0] = 72; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 144; + svc_params_.scaling_factor_den[1] = 288; + svc_params_.scaling_factor_num[2] = 288; + svc_params_.scaling_factor_den[2] = 288; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + // Change SVC pattern on the fly.
+ update_pattern_ = 1; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, @@ -1769,6 +1985,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL_to_2SL_dynamic) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; cfg_.rc_target_bitrate = 800; ResetModel(); dynamic_drop_layer_ = true; @@ -1812,6 +2030,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) { number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). for (int j = 32; j <= 35; j++) { @@ -1856,6 +2076,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) { number_spatial_layers_ = cfg_.ss_number_layers; number_temporal_layers_ = cfg_.ts_number_layers; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, @@ -1911,6 +2133,8 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) { bits_in_buffer_model_[1] = cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); CheckLayerRateTargeting(&cfg_, number_spatial_layers_, number_temporal_layers_, file_datarate_, 0.78, 1.15); @@ -1926,6 +2150,9 @@ VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large, ::testing::Values(::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), ::testing::Range(2, 9)); +VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Range(5, 9)); #if CONFIG_VP9_TEMPORAL_DENOISING VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, ::testing::Values(::libvpx_test::kRealTime), diff --git a/test/dct_test.cc b/test/dct_test.cc index a5ac9a0dc..71b36c7bb 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -597,7 +597,9 @@ class TransHT : public TransTestBase { TransHT() { fwd_txfm_ref = fht_ref; } }; -TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); } +TEST_P(TransHT, AccuracyCheck) { + RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 ? 
2 : 1); +} TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } @@ -605,17 +607,6 @@ TEST_P(TransHT, MemCheck) { RunMemCheck(); } TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } -/* TODO:(johannkoenig) Determine why these fail AccuracyCheck - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, VPX_BITS_12, 2), - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, VPX_BITS_12, 2), - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, VPX_BITS_12, 2), - make_tuple(&vp9_highbd_fht16x16_c, - &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_12, 2), - */ - const DctParam c_ht_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH make_tuple(&vp9_highbd_fht16x16_c, @@ -642,6 +633,19 @@ const DctParam c_ht_tests[] = { make_tuple(&vp9_highbd_fht16x16_c, &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 0, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 1, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 2, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_c>, 16, 3, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_c>, 8, 0, VPX_BITS_8, 2), @@ -748,6 +752,69 @@ INSTANTIATE_TEST_CASE_P(C, TransHT, ::testing::ValuesIn(c_ht_tests)); #if !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON + +const DctParam neon_ht_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 0, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 1, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 2, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 3, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 0, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 1, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 2, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 3, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 0, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 1, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 2, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht4x4_c, + &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4, 3, + VPX_BITS_12, 2), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 0, + VPX_BITS_8, 1), + make_tuple(&vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1, + VPX_BITS_8, 1), + make_tuple(&vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 2, + VPX_BITS_8, 1), + make_tuple(&vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 3, + VPX_BITS_8, 1), + + make_tuple(&vp9_fht4x4_c, 
&iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 0, + VPX_BITS_8, 1), + make_tuple(&vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1, + VPX_BITS_8, 1), + make_tuple(&vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 2, + VPX_BITS_8, 1), + make_tuple(&vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 3, + VPX_BITS_8, 1) +}; + +INSTANTIATE_TEST_CASE_P(NEON, TransHT, ::testing::ValuesIn(neon_ht_tests)); +#endif // HAVE_NEON + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, TransHT, @@ -784,6 +851,80 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE4_1, TransHT, ::testing::Values( + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 0, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 1, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 2, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 3, VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 0, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 1, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 2, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 3, VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 0, VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 1, VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 2, VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht16x16_c, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_sse4_1>, 16, + 3, VPX_BITS_12, 2), + + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3, + VPX_BITS_8, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3, + VPX_BITS_10, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 0, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 1, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 2, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht8x8_c, + &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_sse4_1>, 8, 3, + VPX_BITS_12, 2), + make_tuple(&vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_sse4_1>, 4, 0, VPX_BITS_8, 
2), diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 164db5a7b..87e29b61d 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -106,4 +106,90 @@ TEST(EncodeAPI, ImageSizeSetting) { } #endif +// Set up 2 spatial streams with 2 temporal layers per stream, and generate +// invalid configuration by setting the temporal layer rate allocation +// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of +// CONFIG_MULTI_RES_ENCODING. +TEST(EncodeAPI, MultiResEncode) { + static const vpx_codec_iface_t *kCodecs[] = { +#if CONFIG_VP8_ENCODER + &vpx_codec_vp8_cx_algo, +#endif +#if CONFIG_VP9_ENCODER + &vpx_codec_vp9_cx_algo, +#endif + }; + const int width = 1280; + const int height = 720; + const int width_down = width / 2; + const int height_down = height / 2; + const int target_bitrate = 1000; + const int framerate = 30; + + for (int c = 0; c < NELEMENTS(kCodecs); ++c) { + const vpx_codec_iface_t *const iface = kCodecs[c]; + vpx_codec_ctx_t enc[2]; + vpx_codec_enc_cfg_t cfg[2]; + vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; + + memset(enc, 0, sizeof(enc)); + + for (int i = 0; i < 2; i++) { + vpx_codec_enc_config_default(iface, &cfg[i], 0); + } + + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + memcpy(&cfg[1], &cfg[0], sizeof(cfg[0])); + cfg[1].rc_target_bitrate = 500; + cfg[1].g_w = width_down; + cfg[1].g_h = height_down; + + for (int i = 0; i < 2; i++) { + cfg[i].ts_number_layers = 2; + cfg[i].ts_periodicity = 2; + cfg[i].ts_rate_decimator[0] = 2; + cfg[i].ts_rate_decimator[1] = 1; + cfg[i].ts_layer_id[0] = 0; + cfg[i].ts_layer_id[1] = 1; + // Invalid parameters. + cfg[i].ts_target_bitrate[0] = 0; + cfg[i].ts_target_bitrate[1] = 0; + } + + // VP9 should report incapable, VP8 invalid for all configurations. + const char kVP9Name[] = "WebM Project VP9"; + const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface), + sizeof(kVP9Name) - 1) == 0; + EXPECT_EQ(is_vp9 ? 
VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); + + for (int i = 0; i < 2; i++) { + vpx_codec_destroy(&enc[i]); + } + } +} + } // namespace diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 89a3b1767..bec6ca3e6 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -128,6 +128,11 @@ class Encoder { ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + void Control(int ctrl_id, struct vpx_svc_parameters *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); @@ -137,15 +142,12 @@ class Encoder { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } -#endif -#if CONFIG_VP8_ENCODER void Control(int ctrl_id, vpx_roi_map_t *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } #endif - void Config(const vpx_codec_enc_cfg_t *cfg) { const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 79220b0f6..43a4c6929 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -123,6 +123,7 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } #if CONFIG_VP8_DECODER const DecodeParam kVP8InvalidFileTests[] = { { 1, "invalid-bug-1443.ivf" }, + { 1, "invalid-token-partition.ivf" }, }; VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, diff --git a/test/resize_test.cc b/test/resize_test.cc index 39952074b..5f80af6fb 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -278,10 +278,10 @@ class ResizeTest } virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - ASSERT_NE(static_cast<int>(pkt->data.frame.width), 0); - ASSERT_NE(static_cast<int>(pkt->data.frame.height), 0); - encode_frame_width_.push_back(pkt->data.frame.width); - encode_frame_height_.push_back(pkt->data.frame.height); + ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); } unsigned int GetFrameWidth(size_t idx) const { @@ -485,10 +485,10 @@ class ResizeRealtimeTest } virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - ASSERT_NE(static_cast<int>(pkt->data.frame.width), 0); - ASSERT_NE(static_cast<int>(pkt->data.frame.height), 0); - encode_frame_width_.push_back(pkt->data.frame.width); - encode_frame_height_.push_back(pkt->data.frame.height); + ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); } unsigned int GetMismatchFrames() { return mismatch_nframes_; } diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 9c407c649..e9d2efaa5 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -112,8 +112,9 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, SumSquaresTest, ::testing::Values(make_tuple( - &vpx_sum_squares_2d_i16_c, - 
&vpx_sum_squares_2d_i16_msa))); +INSTANTIATE_TEST_CASE_P( + MSA, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_msa))); #endif // HAVE_MSA } // namespace diff --git a/test/test-data.mk b/test/test-data.mk index f405e4ef1..7ca11bc9c 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -734,6 +734,8 @@ endif # CONFIG_VP9_HIGHBITDEPTH # Invalid files for testing libvpx error checking. LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 99b4e1e46..3a23ff5db 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -852,5 +852,7 @@ e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf +90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res 17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index 30641ae8c..3405e4566 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -61,7 +61,6 @@ int main(int argc, char **argv) { #if !CONFIG_SHARED // Shared library builds don't support whitebox tests // that exercise internal symbols. - #if CONFIG_VP8 vp8_rtcd(); #endif // CONFIG_VP8 diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 955f567ce..d0cbfbbda 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -59,7 +59,7 @@ const TestVideoParam kTestVectors[] = { // Encoding modes tested const libvpx_test::TestMode kEncodingModeVectors[] = { ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime, + ::libvpx_test::kRealTime }; // Speed settings tested diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index 1030204ae..5dd2ddb47 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -22,7 +22,7 @@ namespace { // Encoding modes const libvpx_test::TestMode kEncodingModeVectors[] = { ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime, + ::libvpx_test::kRealTime }; // Encoding speeds diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 8713ac655..f2234f045 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -532,6 +532,14 @@ INSTANTIATE_TEST_CASE_P( false))); #endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH +#if ARCH_X86_64 && HAVE_AVX2 +INSTANTIATE_TEST_CASE_P( + AVX2, VP9QuantizeTest, + ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, + 16, true))); +#endif // ARCH_X86_64 && HAVE_AVX2 + + // TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc index 5d7d38e89..f3e7f0a0e 100644 --- a/test/vp9_scale_test.cc +++ b/test/vp9_scale_test.cc @@ -47,7 +47,7 @@ class ScaleTest : public VpxScaleBase, scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); } - void RunTest() { + void RunTest(INTERP_FILTER filter_type) { static const int kNumSizesToTest = 20; static const int kNumScaleFactorsToTest = 4; static const int kSizesToTest[] = { @@ -55,50 +55,48 @@ class ScaleTest : public VpxScaleBase, 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 }; static const int kScaleFactors[] = { 1, 2, 3, 4 }; - for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { - for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { - for (int h = 0; h < kNumSizesToTest; ++h) { - const int src_height = kSizesToTest[h]; - for (int w = 0; w < kNumSizesToTest; ++w) { - const int src_width = kSizesToTest[w]; - for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; - ++sf_up_idx) { - const int sf_up = kScaleFactors[sf_up_idx]; - for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; - ++sf_down_idx) { - const int sf_down = kScaleFactors[sf_down_idx]; - const int dst_width = src_width * sf_up / sf_down; - const int dst_height = src_height * sf_up / sf_down; - if (sf_up == sf_down && sf_up != 1) { - continue; - } - // I420 frame width and height must be even. - if (!dst_width || !dst_height || dst_width & 1 || - dst_height & 1) { - continue; - } - // vpx_convolve8_c() has restriction on the step which cannot - // exceed 64 (ratio 1 to 4). - if (src_width > 4 * dst_width || src_height > 4 * dst_height) { - continue; - } - ASSERT_NO_FATAL_FAILURE(ResetScaleImages( - src_width, src_height, dst_width, dst_height)); - ReferenceScaleFrame(filter_type, phase_scaler); - ScaleFrame(filter_type, phase_scaler); - if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, - ref_img_.frame_size)) { - printf( - "filter_type = %d, phase_scaler = %d, src_width = %4d, " - "src_height = %4d, dst_width = %4d, dst_height = %4d, " - "scale factor = %d:%d\n", - filter_type, phase_scaler, src_width, src_height, - dst_width, dst_height, sf_down, sf_up); - PrintDiff(); - } - CompareImages(dst_img_); - DeallocScaleImages(); + for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { + for (int h = 0; h < kNumSizesToTest; ++h) { + const int src_height = kSizesToTest[h]; + for (int w = 0; w < kNumSizesToTest; ++w) { + const int src_width = kSizesToTest[w]; + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; + ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; } + // I420 frame width and height must be even. + if (!dst_width || !dst_height || dst_width & 1 || + dst_height & 1) { + continue; + } + // vpx_convolve8_c() has restriction on the step which cannot + // exceed 64 (ratio 1 to 4). 
+ if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height, + dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); } } } @@ -145,7 +143,10 @@ class ScaleTest : public VpxScaleBase, ScaleFrameFunc scale_fn_; }; -TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } +TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); } +TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); } +TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); } +TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); } TEST_P(ScaleTest, DISABLED_Speed) { static const int kCountSpeedTestBlock = 100; diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 576f5e906..8eff3f11f 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -147,7 +147,6 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) { // ----------------------------------------------------------------------------- // Multi-threaded decode tests - #if CONFIG_WEBM_IO struct FileList { const char *name; diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 1a3aad16a..ddcc307eb 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -37,7 +37,9 @@ extern "C" { #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 -typedef struct { int r, c; } POS; +typedef struct { + int r, c; +} POS; #define PLANE_TYPE_Y_NO_DC 0 #define PLANE_TYPE_Y2 1 @@ -180,6 +182,9 @@ typedef struct { unsigned int low_res_ref_frames[MAX_REF_FRAMES]; // The video frame counter value for the key frame, for lowest resolution. unsigned int key_frame_counter_value; + // Flags to signal skipped encoding of previous and base layer stream. 
+ unsigned int skip_encoding_prev_stream; + unsigned int skip_encoding_base_stream; LOWER_RES_MB_INFO *mb_info; } LOWER_RES_FRAME_INFO; #endif diff --git a/vp8/common/extend.c b/vp8/common/extend.c index 2d67b516b..f4dbce2cd 100644 --- a/vp8/common/extend.c +++ b/vp8/common/extend.c @@ -20,8 +20,7 @@ static void copy_and_extend_plane(unsigned char *s, /* source */ int et, /* extend top border */ int el, /* extend left border */ int eb, /* extend bottom border */ - int er /* extend right border */ - ) { + int er) { /* extend right border */ int i; unsigned char *src_ptr1, *src_ptr2; unsigned char *dest_ptr1, *dest_ptr2; diff --git a/vp8/common/mips/mmi/idct_blk_mmi.c b/vp8/common/mips/mmi/idct_blk_mmi.c index f6020ab46..3f6907174 100644 --- a/vp8/common/mips/mmi/idct_blk_mmi.c +++ b/vp8/common/mips/mmi/idct_blk_mmi.c @@ -12,7 +12,7 @@ #include "vpx_mem/vpx_mem.h" void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, - int stride, int8_t *eobs) { + int stride, char *eobs) { int i, j; for (i = 0; i < 4; i++) { @@ -33,8 +33,7 @@ void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, } void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, - uint8_t *dstv, int stride, - int8_t *eobs) { + uint8_t *dstv, int stride, char *eobs) { int i, j; for (i = 0; i < 2; i++) { diff --git a/vp8/common/modecont.c b/vp8/common/modecont.c index d6ad9bb99..bab410374 100644 --- a/vp8/common/modecont.c +++ b/vp8/common/modecont.c @@ -11,28 +11,16 @@ #include "entropy.h" const int vp8_mode_contexts[6][4] = { - { - /* 0 */ - 7, 1, 1, 143, - }, - { - /* 1 */ - 14, 18, 14, 107, - }, - { - /* 2 */ - 135, 64, 57, 68, - }, - { - /* 3 */ - 60, 56, 128, 65, - }, - { - /* 4 */ - 159, 134, 128, 34, - }, - { - /* 5 */ - 234, 188, 128, 28, - }, + { /* 0 */ + 7, 1, 1, 143 }, + { /* 1 */ + 14, 18, 14, 107 }, + { /* 2 */ + 135, 64, 57, 68 }, + { /* 3 */ + 60, 56, 128, 65 }, + { /* 4 */ + 159, 134, 128, 34 }, + { /* 5 */ + 234, 188, 128, 28 }, }; diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c index de59b3894..de836f19d 100644 --- a/vp8/common/x86/vp8_asm_stubs.c +++ b/vp8/common/x86/vp8_asm_stubs.c @@ -95,9 +95,7 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned short, FData2[24 * 24]); /* Temp data buffer used in filtering */ @@ -236,9 +234,7 @@ extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr, void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); if (xoffset) { diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h index 04c027cd7..9c529f266 100644 --- a/vp8/decoder/dboolhuff.h +++ b/vp8/decoder/dboolhuff.h @@ -76,7 +76,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { } { - register int shift = vp8_norm[range]; + const int shift = vp8_norm[range]; range <<= shift; value <<= shift; count -= shift; diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 077bd3da2..8bfd3cea3 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -674,7 +674,7 @@ static unsigned int read_partition_size(VP8D_COMP *pbi, static int read_is_valid(const unsigned char
*start, size_t len, const unsigned char *end) { - return (start + len > start && start + len <= end); + return len != 0 && end > start && len <= (size_t)(end - start); } static unsigned int read_available_partition_size( diff --git a/vp8/decoder/ec_types.h b/vp8/decoder/ec_types.h index 0ab08b649..8da561f47 100644 --- a/vp8/decoder/ec_types.h +++ b/vp8/decoder/ec_types.h @@ -34,7 +34,9 @@ typedef struct { /* Structure used to hold all the overlaps of a macroblock. The overlaps of a * macroblock is further divided into block overlaps. */ -typedef struct { B_OVERLAP overlaps[16]; } MB_OVERLAP; +typedef struct { + B_OVERLAP overlaps[16]; +} MB_OVERLAP; /* Structure for keeping track of motion vectors and which reference frame they * refer to. Used for motion vector interpolation. diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index 5ecacdbb9..3689258f1 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -31,7 +31,9 @@ typedef struct { void *ptr2; } DECODETHREAD_DATA; -typedef struct { MACROBLOCKD mbd; } MB_ROW_DEC; +typedef struct { + MACROBLOCKD mbd; +} MB_ROW_DEC; typedef struct { int enabled; diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index d0213f75c..aadc8dc71 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -739,24 +739,21 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_yabove_row[i], - vpx_memalign( - 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); + CHECK_MEM_ERROR(pbi->mt_yabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (width + (VP8BORDERINPIXELS << 1)))); CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_uabove_row[i], - vpx_memalign(16, - sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR(pbi->mt_uabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_vabove_row[i], - vpx_memalign(16, - sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR(pbi->mt_vabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); /* Allocate memory for left_col buffers. 
*/ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h index 1593f235e..3c5775f09 100644 --- a/vp8/encoder/boolhuff.h +++ b/vp8/encoder/boolhuff.h @@ -61,7 +61,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { int count = br->count; unsigned int range = br->range; unsigned int lowvalue = br->lowvalue; - register int shift; + int shift; #ifdef VP8_ENTROPY_STATS #if defined(SECTIONBITS_OUTPUT) diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 70f924341..4ea991e52 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -989,11 +989,11 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, bits_per_mb_at_this_q = vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; - bits_per_mb_at_this_q = (int)(.5 + - err_correction_factor * speed_correction * - cpi->twopass.est_max_qcorrection_factor * - cpi->twopass.section_max_qfactor * - (double)bits_per_mb_at_this_q); + bits_per_mb_at_this_q = + (int)(.5 + err_correction_factor * speed_correction * + cpi->twopass.est_max_qcorrection_factor * + cpi->twopass.section_max_qfactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1086,9 +1086,8 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; bits_per_mb_at_this_q = - (int)(.5 + - err_correction_factor * speed_correction * clip_iifactor * - (double)bits_per_mb_at_this_q); + (int)(.5 + err_correction_factor * speed_correction * clip_iifactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1273,9 +1272,8 @@ void vp8_init_second_pass(VP8_COMP *cpi) { * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. 
*/ - vp8_new_framerate(cpi, - 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / + cpi->twopass.total_stats.duration); cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * @@ -1739,10 +1737,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Dont break out very close to a key frame */ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && - (!flash_detected) && ((mv_ratio_accumulator > 100.0) || - (abs_mv_in_out_accumulator > 3.0) || - (mv_in_out_accumulator < -2.0) || - ((boost_score - old_boost_score) < 2.0)))) { + (!flash_detected) && + ((mv_ratio_accumulator > 100.0) || + (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0) || + ((boost_score - old_boost_score) < 2.0)))) { boost_score = old_boost_score; break; } @@ -1815,8 +1814,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (next_frame.pcnt_inter > 0.75) && ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) && - (cpi->gfu_boost > 100) && (cpi->twopass.gf_decay_rate <= - (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) + (cpi->gfu_boost > 100) && + (cpi->twopass.gf_decay_rate <= + (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) #endif { int Boost; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 3c92be32f..2da940199 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2862,7 +2862,6 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) fclose(yframe); } #endif -/* return of 0 means drop frame */ #if !CONFIG_REALTIME_ONLY /* Function to test for conditions that indeicate we should loop @@ -3364,11 +3363,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; if (cpi->oxcf.mr_encoder_id) { - // TODO(marpan): This constraint shouldn't be needed, as we would like - // to allow for key frame setting (forced or periodic) defined per - // spatial layer. For now, keep this in. - cm->frame_type = low_res_frame_info->frame_type; - // Check if lower resolution is available for motion vector reuse. if (cm->frame_type != KEY_FRAME) { cpi->mr_low_res_mv_avail = 1; @@ -3393,7 +3387,16 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]); */ } + // Disable motion vector reuse (i.e., disable any usage of the low_res) + // if the previous lower stream is skipped/disabled. + if (low_res_frame_info->skip_encoding_prev_stream) { + cpi->mr_low_res_mv_avail = 0; + } } + // This stream is not skipped (i.e., it's being encoded), so set this skip + // flag to 0. This is needed for the next stream (i.e., which is the next + // frame to be encoded). + low_res_frame_info->skip_encoding_prev_stream = 0; // On a key frame: For the lowest resolution, keep track of the key frame // counter value. For the higher resolutions, reset the current video @@ -4782,8 +4785,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->temporal_pattern_counter++; } -/* reset to normal state now that we are done. 
*/ - #if 0 { char filename[512]; @@ -4999,10 +5000,13 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, // be received for that high layer, which will yield an incorrect // frame rate (from time-stamp adjustment in above calculation). if (cpi->oxcf.mr_encoder_id) { - cpi->ref_framerate = low_res_frame_info->low_res_framerate; + if (!low_res_frame_info->skip_encoding_base_stream) + cpi->ref_framerate = low_res_frame_info->low_res_framerate; } else { // Keep track of frame rate for lowest resolution. low_res_frame_info->low_res_framerate = cpi->ref_framerate; + // The base stream is being encoded so set skip flag to 0. + low_res_frame_info->skip_encoding_base_stream = 0; } } #endif diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index ab9b8fdc8..fc833bccc 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -1052,9 +1052,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { * overflow when values are large */ projected_size_based_on_q = - (int)(((.5 + - rate_correction_factor * - vp8_bits_per_mb[cpi->common.frame_type][Q]) * + (int)(((.5 + rate_correction_factor * + vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS)); diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index e210b4410..fa5dde15d 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -770,9 +770,9 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, vp8_quantize_mbuv(x); rate_to = rd_cost_mbuv(x); - this_rate = rate_to + - x->intra_uv_mode_cost[xd->frame_type] - [xd->mode_info_context->mbmi.uv_mode]; + this_rate = + rate_to + x->intra_uv_mode_cost[xd->frame_type] + [xd->mode_info_context->mbmi.uv_mode]; this_distortion = vp8_mbuverror(x) / 4; diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h index dadbbe3f8..e76699092 100644 --- a/vp8/encoder/treewriter.h +++ b/vp8/encoder/treewriter.h @@ -56,8 +56,7 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2], static void vp8_treed_write(vp8_writer *const w, vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ vp8_tree_index i = 0; do { @@ -73,8 +72,7 @@ static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t, } static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ int c = 0; vp8_tree_index i = 0; diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 8f0a782bc..d3e200594 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -802,7 +802,20 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, unsigned long deadline) { vpx_codec_err_t res = VPX_CODEC_OK; - if (!ctx->cfg.rc_target_bitrate) return res; + if (!ctx->cfg.rc_target_bitrate) { +#if CONFIG_MULTI_RES_ENCODING + if (!ctx->cpi) return VPX_CODEC_ERROR; + if (ctx->cpi->oxcf.mr_total_resolutions > 1) { + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info; + if (!low_res_frame_info) return VPX_CODEC_ERROR; + low_res_frame_info->skip_encoding_prev_stream = 1; + if (ctx->cpi->oxcf.mr_encoder_id == 0) + low_res_frame_info->skip_encoding_base_stream = 1; + } +#endif + return res; + } if (img) res = validate_img(ctx, img); @@ -902,8 +915,8 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, (unsigned long)((delta * ctx->cfg.g_timebase.den + round) / 
ctx->cfg.g_timebase.num / 10000000); pkt.data.frame.flags = lib_flags << 16; - pkt.data.frame.width = cpi->common.Width; - pkt.data.frame.height = cpi->common.Height; + pkt.data.frame.width[0] = cpi->common.Width; + pkt.data.frame.height[0] = cpi->common.Height; if (lib_flags & FRAMEFLAGS_KEY) { pkt.data.frame.flags |= VPX_FRAME_IS_KEY; @@ -1261,6 +1274,9 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = { vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */ - vp8e_set_config, NULL, vp8e_get_preview, vp8e_mr_alloc_mem, + vp8e_set_config, + NULL, + vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; diff --git a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c new file mode 100644 index 000000000..46284238d --- /dev/null +++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void highbd_iadst4(int32x4_t *const io) { + const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 }; + const int32x4_t sinpi = vld1q_s32(sinpis); + int32x4_t s[8]; + + s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0); + s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1); + s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0); + s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1); + s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0); + s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1); + s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1); + s[7] = vsubq_s32(io[0], io[2]); + s[7] = vaddq_s32(s[7], io[3]); + + s[0] = vaddq_s32(s[0], s[3]); + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[4]); + s[1] = vsubq_s32(s[1], s[6]); + s[3] = s[2]; + s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0); + + io[0] = vaddq_s32(s[0], s[3]); + io[1] = vaddq_s32(s[1], s[3]); + io[2] = s[2]; + io[3] = vaddq_s32(s[0], s[1]); + io[3] = vsubq_s32(io[3], s[3]); + io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS); + io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS); + io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS); + io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS); +} + +void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t a[2]; + int32x4_t c[4]; + + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + + if (bd == 8) { + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_s16_4x4q(&a[0], &a[1]); + + switch (tx_type) { + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); 
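+        // The bd8 kernel returns the second output vector with its two
+        // halves swapped, so the vcombine() above restores row order.
+        // Every branch of this switch follows the separable 2-D pattern:
+        // a 1-D transform over rows, a transpose, then the 1-D transform
+        // over columns.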
+ transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + } + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + + case DCT_ADST: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + } + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max); +} diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index 025254c3f..4f0a90f21 100644 --- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,206 +14,63 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { - int32x4_t q8s32, q9s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - - d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); - d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); - - q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); - q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); - q0x2s32 = vtrnq_s32(q8s32, q9s32); - - *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); - *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); -} - -static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16) { - *d0s16 = vdup_n_s16(cospi_8_64); - *d1s16 = vdup_n_s16(cospi_16_64); - *d2s16 = vdup_n_s16(cospi_24_64); -} - -static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16) { - *d3s16 = vdup_n_s16(sinpi_1_9); - *d4s16 = vdup_n_s16(sinpi_2_9); - *q3s16 = vdupq_n_s16(sinpi_3_9); - *d5s16 = vdup_n_s16(sinpi_4_9); -} - -static INLINE void IDCT4x4_1D(int16x4_t *d0s16, 
int16x4_t *d1s16, - int16x4_t *d2s16, int16x8_t *q8s16, - int16x8_t *q9s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - int32x4_t q10s32, q13s32, q14s32, q15s32; - int16x8_t q13s16, q14s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, *d2s16); - q10s32 = vmull_s16(d17s16, *d0s16); - q13s32 = vmull_s16(d23s16, *d1s16); - q14s32 = vmull_s16(d24s16, *d1s16); - q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); - q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - - d26s16 = vrshrn_n_s32(q13s32, 14); - d27s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d28s16 = vrshrn_n_s32(q10s32, 14); - - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - *q8s16 = vaddq_s16(q13s16, q14s16); - *q9s16 = vsubq_s16(q13s16, q14s16); - *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp -} - -static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16, - int16x8_t *q8s16, int16x8_t *q9s16) { - int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d6s16 = vget_low_s16(*q3s16); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - q10s32 = vmull_s16(*d3s16, d16s16); - q11s32 = vmull_s16(*d4s16, d16s16); - q12s32 = vmull_s16(d6s16, d17s16); - q13s32 = vmull_s16(*d5s16, d18s16); - q14s32 = vmull_s16(*d3s16, d18s16); - q15s32 = vmovl_s16(d16s16); - q15s32 = vaddw_s16(q15s32, d19s16); - q8s32 = vmull_s16(*d4s16, d19s16); - q15s32 = vsubw_s16(q15s32, d18s16); - q9s32 = vmull_s16(*d5s16, d19s16); - - q10s32 = vaddq_s32(q10s32, q13s32); - q10s32 = vaddq_s32(q10s32, q8s32); - q11s32 = vsubq_s32(q11s32, q14s32); - q8s32 = vdupq_n_s32(sinpi_3_9); - q11s32 = vsubq_s32(q11s32, q9s32); - q15s32 = vmulq_s32(q15s32, q8s32); - - q13s32 = vaddq_s32(q10s32, q12s32); - q10s32 = vaddq_s32(q10s32, q11s32); - q14s32 = vaddq_s32(q11s32, q12s32); - q10s32 = vsubq_s32(q10s32, q12s32); - - d16s16 = vrshrn_n_s32(q13s32, 14); - d17s16 = vrshrn_n_s32(q14s32, 14); - d18s16 = vrshrn_n_s32(q15s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - - *q8s16 = vcombine_s16(d16s16, d17s16); - *q9s16 = vcombine_s16(d18s16, d19s16); -} - void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - uint8x8_t d26u8, d27u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; - uint32x2_t d26u32, d27u32; - int16x8_t q3s16, q8s16, q9s16; - uint16x8_t q8u16, q9u16; - - d26u32 = d27u32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s[2], d[2]; + uint16x8_t sum[2]; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - TRANSPOSE4X4(&q8s16, &q9s16); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_s16_4x4q(&a[0], &a[1]); switch (tx_type) { - case 0: // idct_idct is not supported. 
Fall back to C - vp9_iht4x4_16_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate constants - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - case 2: // idct_iadst - // generate constantsyy - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; - case 3: // iadst_iadst - // generate constants - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += stride; - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + s[0] = load_u8(dest, stride); + s[1] = load_u8(dest + 2 * stride, stride); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); + store_u8(dest, stride, d[0]); + store_u8(dest + 2 * stride, stride, d[1]); } diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 1c739861c..19163bc87 100644 --- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ 
b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -14,527 +14,199 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" -static int16_t cospi_2_64 = 16305; -static int16_t cospi_4_64 = 16069; -static int16_t cospi_6_64 = 15679; -static int16_t cospi_8_64 = 15137; -static int16_t cospi_10_64 = 14449; -static int16_t cospi_12_64 = 13623; -static int16_t cospi_14_64 = 12665; -static int16_t cospi_16_64 = 11585; -static int16_t cospi_18_64 = 10394; -static int16_t cospi_20_64 = 9102; -static int16_t cospi_22_64 = 7723; -static int16_t cospi_24_64 = 6270; -static int16_t cospi_26_64 = 4756; -static int16_t cospi_28_64 = 3196; -static int16_t cospi_30_64 = 1606; - -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vrshrn_n_s32(q2s32, 14); - d9s16 = vrshrn_n_s32(q3s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vrshrn_n_s32(q2s32, 14); - d15s16 = vrshrn_n_s32(q3s32, 14); - d12s16 = vrshrn_n_s32(q9s32, 14); - d13s16 = vrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = 
vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vrshrn_n_s32(q2s32, 14); - d19s16 = vrshrn_n_s32(q3s32, 14); - d22s16 = vrshrn_n_s32(q13s32, 14); - d23s16 = vrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vrshrn_n_s32(q2s32, 14); - d27s16 = vrshrn_n_s32(q3s32, 14); - d30s16 = vrshrn_n_s32(q8s32, 14); - d31s16 = vrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vrshrn_n_s32(q9s32, 14); - d11s16 = vrshrn_n_s32(q10s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); +static INLINE void iadst_half_butterfly_neon(int16x8_t *const x, + const int16x4_t c) { + const int16x8_t sum = vaddq_s16(x[0], x[1]); + const int16x8_t sub = vsubq_s16(x[0], x[1]); + int32x4_t t0[2], t1[2]; + + t0[0] = vmull_lane_s16(vget_low_s16(sum), c, 0); + t0[1] = vmull_lane_s16(vget_high_s16(sum), c, 0); + t1[0] = vmull_lane_s16(vget_low_s16(sub), c, 0); + t1[1] = vmull_lane_s16(vget_high_s16(sub), c, 0); + x[0] = dct_const_round_shift_low_8(t0); + x[1] = dct_const_round_shift_low_8(t1); } -static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q2s16, q4s16, q5s16, q6s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; - int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = 
vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - d14s16 = vdup_n_s16(cospi_2_64); - d15s16 = vdup_n_s16(cospi_30_64); - - q1s32 = vmull_s16(d30s16, d14s16); - q2s32 = vmull_s16(d31s16, d14s16); - q3s32 = vmull_s16(d30s16, d15s16); - q4s32 = vmull_s16(d31s16, d15s16); - - d30s16 = vdup_n_s16(cospi_18_64); - d31s16 = vdup_n_s16(cospi_14_64); - - q1s32 = vmlal_s16(q1s32, d16s16, d15s16); - q2s32 = vmlal_s16(q2s32, d17s16, d15s16); - q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); - q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); - - q5s32 = vmull_s16(d22s16, d30s16); - q6s32 = vmull_s16(d23s16, d30s16); - q7s32 = vmull_s16(d22s16, d31s16); - q8s32 = vmull_s16(d23s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d24s16, d31s16); - q6s32 = vmlal_s16(q6s32, d25s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); - q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); - - q11s32 = vaddq_s32(q1s32, q5s32); - q12s32 = vaddq_s32(q2s32, q6s32); - q1s32 = vsubq_s32(q1s32, q5s32); - q2s32 = vsubq_s32(q2s32, q6s32); - - d22s16 = vrshrn_n_s32(q11s32, 14); - d23s16 = vrshrn_n_s32(q12s32, 14); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q12s32 = vaddq_s32(q3s32, q7s32); - q15s32 = vaddq_s32(q4s32, q8s32); - q3s32 = vsubq_s32(q3s32, q7s32); - q4s32 = vsubq_s32(q4s32, q8s32); - - d2s16 = vrshrn_n_s32(q1s32, 14); - d3s16 = vrshrn_n_s32(q2s32, 14); - d24s16 = vrshrn_n_s32(q12s32, 14); - d25s16 = vrshrn_n_s32(q15s32, 14); - d6s16 = vrshrn_n_s32(q3s32, 14); - d7s16 = vrshrn_n_s32(q4s32, 14); - *q12s16 = vcombine_s16(d24s16, d25s16); - - d0s16 = vdup_n_s16(cospi_10_64); - d1s16 = vdup_n_s16(cospi_22_64); - q4s32 = vmull_s16(d26s16, d0s16); - q5s32 = vmull_s16(d27s16, d0s16); - q2s32 = vmull_s16(d26s16, d1s16); - q6s32 = vmull_s16(d27s16, d1s16); - - d30s16 = vdup_n_s16(cospi_26_64); - d31s16 = vdup_n_s16(cospi_6_64); - - q4s32 = vmlal_s16(q4s32, d20s16, d1s16); - q5s32 = vmlal_s16(q5s32, d21s16, d1s16); - q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); - q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); - - q0s32 = vmull_s16(d18s16, d30s16); - q13s32 = vmull_s16(d19s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d31s16); - q13s32 = vmlal_s16(q13s32, d29s16, d31s16); - - q10s32 = vmull_s16(d18s16, d31s16); - q9s32 = vmull_s16(d19s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); - q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); - - q14s32 = vaddq_s32(q2s32, q10s32); - q15s32 = vaddq_s32(q6s32, q9s32); - q2s32 = vsubq_s32(q2s32, q10s32); - q6s32 = vsubq_s32(q6s32, q9s32); - - d28s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q6s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - q9s32 = vaddq_s32(q4s32, q0s32); - q10s32 = vaddq_s32(q5s32, q13s32); - q4s32 = vsubq_s32(q4s32, q0s32); - q5s32 = vsubq_s32(q5s32, q13s32); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - d18s16 = vrshrn_n_s32(q9s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - d8s16 = vrshrn_n_s32(q4s32, 14); - d9s16 = vrshrn_n_s32(q5s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q5s32 = vmull_s16(d2s16, d30s16); - q6s32 = vmull_s16(d3s16, d30s16); - q7s32 = vmull_s16(d2s16, d31s16); - q0s32 = vmull_s16(d3s16, d31s16); 
- - q5s32 = vmlal_s16(q5s32, d6s16, d31s16); - q6s32 = vmlal_s16(q6s32, d7s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); - - q1s32 = vmull_s16(d4s16, d30s16); - q3s32 = vmull_s16(d5s16, d30s16); - q10s32 = vmull_s16(d4s16, d31s16); - q2s32 = vmull_s16(d5s16, d31s16); - - q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d9s16, d31s16); - q10s32 = vmlal_s16(q10s32, d8s16, d30s16); - q2s32 = vmlal_s16(q2s32, d9s16, d30s16); - - *q8s16 = vaddq_s16(*q11s16, *q9s16); - *q11s16 = vsubq_s16(*q11s16, *q9s16); - q4s16 = vaddq_s16(*q12s16, *q14s16); - *q12s16 = vsubq_s16(*q12s16, *q14s16); - - q14s32 = vaddq_s32(q5s32, q1s32); - q15s32 = vaddq_s32(q6s32, q3s32); - q5s32 = vsubq_s32(q5s32, q1s32); - q6s32 = vsubq_s32(q6s32, q3s32); - - d18s16 = vrshrn_n_s32(q14s32, 14); - d19s16 = vrshrn_n_s32(q15s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q1s32 = vaddq_s32(q7s32, q10s32); - q3s32 = vaddq_s32(q0s32, q2s32); - q7s32 = vsubq_s32(q7s32, q10s32); - q0s32 = vsubq_s32(q0s32, q2s32); - - d28s16 = vrshrn_n_s32(q1s32, 14); - d29s16 = vrshrn_n_s32(q3s32, 14); - d14s16 = vrshrn_n_s32(q7s32, 14); - d15s16 = vrshrn_n_s32(q0s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - d30s16 = vdup_n_s16(cospi_16_64); - - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - q2s32 = vmull_s16(d22s16, d30s16); - q3s32 = vmull_s16(d23s16, d30s16); - q13s32 = vmull_s16(d22s16, d30s16); - q1s32 = vmull_s16(d23s16, d30s16); +static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); +} - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - q2s32 = vmlal_s16(q2s32, d24s16, d30s16); - q3s32 = vmlal_s16(q3s32, d25s16, d30s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); - q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); +static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); +} - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q3s32, 14); - d24s16 = vrshrn_n_s32(q13s32, 14); - d25s16 = vrshrn_n_s32(q1s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - *q12s16 = vcombine_s16(d24s16, d25s16); +static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = 
vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); +} - q13s32 = vmull_s16(d10s16, d30s16); - q1s32 = vmull_s16(d11s16, d30s16); - q11s32 = vmull_s16(d10s16, d30s16); - q0s32 = vmull_s16(d11s16, d30s16); +static INLINE int16x8_t add_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; - q13s32 = vmlal_s16(q13s32, d14s16, d30s16); - q1s32 = vmlal_s16(q1s32, d15s16, d30s16); - q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); + sum[0] = vaddq_s32(in0[0], in1[0]); + sum[1] = vaddq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} - d20s16 = vrshrn_n_s32(q13s32, 14); - d21s16 = vrshrn_n_s32(q1s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q0s32, 14); - *q10s16 = vcombine_s16(d20s16, d21s16); - q6s16 = vcombine_s16(d12s16, d13s16); +static INLINE int16x8_t sub_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; - q5s16 = vdupq_n_s16(0); + sum[0] = vsubq_s32(in0[0], in1[0]); + sum[1] = vsubq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} - *q9s16 = vsubq_s16(q5s16, *q9s16); - *q11s16 = vsubq_s16(q5s16, q2s16); - *q13s16 = vsubq_s16(q5s16, q6s16); - *q15s16 = vsubq_s16(q5s16, q4s16); +static INLINE void iadst8(int16x8_t *const io) { + const int16x4_t c0 = + create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int16x4_t c1 = + create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int16x4_t c2 = + create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int16x8_t x[8], t[4]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + + x[0] = io[7]; + x[1] = io[0]; + x[2] = io[5]; + x[3] = io[2]; + x[4] = io[3]; + x[5] = io[4]; + x[6] = io[1]; + x[7] = io[6]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7); + + x[0] = add_dct_const_round_shift_low_8(s0, s4); + x[1] = add_dct_const_round_shift_low_8(s1, s5); + x[2] = add_dct_const_round_shift_low_8(s2, s6); + x[3] = add_dct_const_round_shift_low_8(s3, s7); + x[4] = sub_dct_const_round_shift_low_8(s0, s4); + x[5] = sub_dct_const_round_shift_low_8(s1, s5); + x[6] = sub_dct_const_round_shift_low_8(s2, s6); + x[7] = sub_dct_const_round_shift_low_8(s3, s7); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + + // stage 3 + iadst_half_butterfly_neon(x + 2, c2); + iadst_half_butterfly_neon(x + 6, c2); + + io[0] = x[0]; + io[1] = vnegq_s16(x[4]); + io[2] = x[6]; + io[3] = vnegq_s16(x[2]); + io[4] = x[3]; + io[5] = vnegq_s16(x[7]); + io[6] = x[5]; + io[7] = 
vnegq_s16(x[1]); } void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - int i; - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 8 * 2); - q11s16 = vld1q_s16(input + 8 * 3); - q12s16 = vld1q_s16(input + 8 * 4); - q13s16 = vld1q_s16(input + 8 * 5); - q14s16 = vld1q_s16(input + 8 * 6); - q15s16 = vld1q_s16(input + 8 * 7); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a[8]; + + a[0] = load_tran_low_to_s16q(input + 0 * 8); + a[1] = load_tran_low_to_s16q(input + 1 * 8); + a[2] = load_tran_low_to_s16q(input + 2 * 8); + a[3] = load_tran_low_to_s16q(input + 3 * 8); + a[4] = load_tran_low_to_s16q(input + 4 * 8); + a[5] = load_tran_low_to_s16q(input + 5 * 8); + a[6] = load_tran_low_to_s16q(input + 6 * 8); + a[7] = load_tran_low_to_s16q(input + 7 * 8); + + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); switch (tx_type) { - case 0: // idct_idct is not supported. Fall back to C - vp9_iht8x8_64_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // first transform rows - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - case 2: // idct_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // then transform columns - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case ADST_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; - case 3: // iadst_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_ADST: + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); 
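+      // Both passes are ADST here: iadst8() on the rows, a transpose,
+      // then iadst8() again on the columns.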
+ iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - for (d1 = d2 = dest, i = 0; i < 2; i++) { - if (i != 0) { - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - } - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = - vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = - vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += stride; - } + idct8x8_add8x8_neon(a, dest, stride); } diff --git a/vp9/common/arm/neon/vp9_iht_neon.h b/vp9/common/arm/neon/vp9_iht_neon.h new file mode 100644 index 000000000..08daa1a4a --- /dev/null +++ b/vp9/common/arm/neon/vp9_iht_neon.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ +#define VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ + +#include <arm_neon.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void iadst4(int16x8_t *const io) { + const int32x4_t c3 = vdupq_n_s32(sinpi_3_9); + int16x4_t x[4]; + int32x4_t s[8], output[4]; + const int16x4_t c = + create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9); + + x[0] = vget_low_s16(io[0]); + x[1] = vget_low_s16(io[1]); + x[2] = vget_high_s16(io[0]); + x[3] = vget_high_s16(io[1]); + + s[0] = vmull_lane_s16(x[0], c, 0); + s[1] = vmull_lane_s16(x[0], c, 1); + s[2] = vmull_lane_s16(x[1], c, 2); + s[3] = vmull_lane_s16(x[2], c, 3); + s[4] = vmull_lane_s16(x[2], c, 0); + s[5] = vmull_lane_s16(x[3], c, 1); + s[6] = vmull_lane_s16(x[3], c, 3); + s[7] = vaddl_s16(x[0], x[3]); + s[7] = vsubw_s16(s[7], x[2]); + + s[0] = vaddq_s32(s[0], s[3]); + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[4]); + s[1] = vsubq_s32(s[1], s[6]); + s[3] = s[2]; + s[2] = vmulq_s32(c3, s[7]); + + output[0] = vaddq_s32(s[0], s[3]); + output[1] = vaddq_s32(s[1], s[3]); + output[2] = s[2]; + output[3] = vaddq_s32(s[0], s[1]); + output[3] = vsubq_s32(output[3], s[3]); + dct_const_round_shift_low_8_dual(output, &io[0], &io[1]); +} + +#endif // VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index a575bda72..430b917b8 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -42,6 +42,7 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254, 177, 153, 140, 133, 130, 129 }; #endif +/* clang-format off */ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -85,6 +86,7 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; +/* clang-format on */ const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index 1da491166..0ab502592 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -137,7 +137,6 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly - #define COEFF_PROB_MODELS 255 #define UNCONSTRAINED_NODES 3 diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index 47cd63e94..48cad3318 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -186,16 +186,19 @@ const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = { 93, 24, 99 }, // a split, l not split { 85, 119, 44 }, // l split, a not split { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 { 149, 53, 53 }, // a/l both not split { 94, 20, 48 }, // a split, l not split { 83, 53, 24 }, // l split, a not split { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 { 150, 40, 39 }, // a/l both not split { 78, 12, 26 }, // a split, l not split { 67, 33, 11 }, // l split, a not split { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 { 174, 35, 49 }, // a/l both not split { 68, 11, 27 }, // a split, l not 
split diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index a18a290cf..b6f052d08 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -22,9 +22,7 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { 18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, }; -const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { - -0, -1, -}; +const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 }; const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, 4, -2, -3 }; diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index c7c343aed..da9180b71 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -1174,7 +1174,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, } // Disable filtering on the leftmost column - border_mask = ~(mi_col == 0); + border_mask = ~(mi_col == 0 ? 1 : 0); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { highbd_filter_selectively_vert( diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c index a7ddc0b95..15c7e6376 100644 --- a/vp9/common/vp9_pred_common.c +++ b/vp9/common/vp9_pred_common.c @@ -229,9 +229,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { else pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); } else { - pred_context = 1 + - 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || - edge_mi->ref_frame[1] == GOLDEN_FRAME); + pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); } } else { // inter/inter const int above_has_second = has_second_ref(above_mi); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index dd6120266..d048857dd 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -67,13 +67,13 @@ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *outp if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { # Note that there are more specializations appended when # CONFIG_VP9_HIGHBITDEPTH is off. - specialize qw/vp9_iht4x4_16_add sse2/; - specialize qw/vp9_iht8x8_64_add sse2/; + specialize qw/vp9_iht4x4_16_add neon sse2/; + specialize qw/vp9_iht8x8_64_add neon sse2/; specialize qw/vp9_iht16x16_256_add sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. - specialize qw/vp9_iht4x4_16_add neon dspr2 msa/; - specialize qw/vp9_iht8x8_64_add neon dspr2 msa/; + specialize qw/vp9_iht4x4_16_add dspr2 msa/; + specialize qw/vp9_iht8x8_64_add dspr2 msa/; specialize qw/vp9_iht16x16_256_add dspr2 msa/; } } @@ -97,13 +97,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. 
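# Each specialize() call lists the SIMD variants available for a prototype;
# the generated run-time setup then picks the best one the CPU supports,
# roughly (a sketch of the generated dispatch, not the literal output):
#   vp9_highbd_iht4x4_16_add = vp9_highbd_iht4x4_16_add_c;
#   if (flags & HAS_NEON) vp9_highbd_iht4x4_16_add = vp9_highbd_iht4x4_16_add_neon;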
add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { - specialize qw/vp9_highbd_iht4x4_16_add sse4_1/; - } add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; + + if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/; + specialize qw/vp9_highbd_iht8x8_64_add sse4_1/; + specialize qw/vp9_highbd_iht16x16_256_add sse4_1/; + } } # @@ -126,7 +129,7 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_ add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp neon sse2 avx2/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64"; diff --git a/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c new file mode 100644 index 000000000..57b79a732 --- /dev/null +++ b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2], + s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2], + x10[2], x11[2], x12[2], x13[2], x14[2], x15[2]; + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5); + highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7); + highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9); + highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14, + s15); + + x0[0] = _mm_add_epi64(s0[0], s8[0]); + x0[1] = _mm_add_epi64(s0[1], s8[1]); + x1[0] = _mm_add_epi64(s1[0], s9[0]); + x1[1] = _mm_add_epi64(s1[1], s9[1]); + x2[0] = _mm_add_epi64(s2[0], s10[0]); + x2[1] = _mm_add_epi64(s2[1], s10[1]); + x3[0] = _mm_add_epi64(s3[0], s11[0]); + x3[1] = _mm_add_epi64(s3[1], s11[1]); + x4[0] = _mm_add_epi64(s4[0], s12[0]); + x4[1] = _mm_add_epi64(s4[1], s12[1]); + x5[0] = _mm_add_epi64(s5[0], s13[0]); + x5[1] = _mm_add_epi64(s5[1], s13[1]); + x6[0] = _mm_add_epi64(s6[0], s14[0]); + x6[1] = _mm_add_epi64(s6[1], s14[1]); + x7[0] = _mm_add_epi64(s7[0], s15[0]); + x7[1] = _mm_add_epi64(s7[1], s15[1]); + x8[0] = _mm_sub_epi64(s0[0], s8[0]); + x8[1] = _mm_sub_epi64(s0[1], s8[1]); + x9[0] = _mm_sub_epi64(s1[0], s9[0]); + x9[1] = _mm_sub_epi64(s1[1], s9[1]); + x10[0] = _mm_sub_epi64(s2[0], s10[0]); + x10[1] = _mm_sub_epi64(s2[1], s10[1]); + x11[0] = _mm_sub_epi64(s3[0], s11[0]); + x11[1] = _mm_sub_epi64(s3[1], s11[1]); + x12[0] = _mm_sub_epi64(s4[0], s12[0]); + x12[1] = _mm_sub_epi64(s4[1], s12[1]); + x13[0] = _mm_sub_epi64(s5[0], s13[0]); + x13[1] = _mm_sub_epi64(s5[1], s13[1]); + 
x14[0] = _mm_sub_epi64(s6[0], s14[0]); + x14[1] = _mm_sub_epi64(s6[1], s14[1]); + x15[0] = _mm_sub_epi64(s7[0], s15[0]); + x15[1] = _mm_sub_epi64(s7[1], s15[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x0[0] = pack_4(x0[0], x0[1]); + x1[0] = pack_4(x1[0], x1[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 2 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + s4[0] = x4[0]; + s5[0] = x5[0]; + s6[0] = x6[0]; + s7[0] = x7[0]; + x0[0] = _mm_add_epi32(s0[0], s4[0]); + x1[0] = _mm_add_epi32(s1[0], s5[0]); + x2[0] = _mm_add_epi32(s2[0], s6[0]); + x3[0] = _mm_add_epi32(s3[0], s7[0]); + x4[0] = _mm_sub_epi32(s0[0], s4[0]); + x5[0] = _mm_sub_epi32(s1[0], s5[0]); + x6[0] = _mm_sub_epi32(s2[0], s6[0]); + x7[0] = _mm_sub_epi32(s3[0], s7[0]); + + highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9); + highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13, + s12); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15, + s14); + + x8[0] = _mm_add_epi64(s8[0], s12[0]); + x8[1] = _mm_add_epi64(s8[1], s12[1]); + x9[0] = _mm_add_epi64(s9[0], s13[0]); + x9[1] = _mm_add_epi64(s9[1], s13[1]); + x10[0] = _mm_add_epi64(s10[0], s14[0]); + x10[1] = _mm_add_epi64(s10[1], s14[1]); + x11[0] = _mm_add_epi64(s11[0], s15[0]); + x11[1] = _mm_add_epi64(s11[1], s15[1]); + x12[0] = _mm_sub_epi64(s8[0], s12[0]); + x12[1] = _mm_sub_epi64(s8[1], s12[1]); + x13[0] = _mm_sub_epi64(s9[0], s13[0]); + x13[1] = _mm_sub_epi64(s9[1], s13[1]); + x14[0] = _mm_sub_epi64(s10[0], s14[0]); + x14[1] = 
_mm_sub_epi64(s10[1], s14[1]); + x15[0] = _mm_sub_epi64(s11[0], s15[0]); + x15[1] = _mm_sub_epi64(s11[1], s15[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 3 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + s8[0] = x8[0]; + s9[0] = x9[0]; + s10[0] = x10[0]; + s11[0] = x11[0]; + highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15, + s14); + + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = _mm_add_epi32(s8[0], s10[0]); + x9[0] = _mm_add_epi32(s9[0], s11[0]); + x10[0] = _mm_sub_epi32(s8[0], s10[0]); + x11[0] = _mm_sub_epi32(s9[0], s11[0]); + x12[0] = _mm_add_epi64(s12[0], s14[0]); + x12[1] = _mm_add_epi64(s12[1], s14[1]); + x13[0] = _mm_add_epi64(s13[0], s15[0]); + x13[1] = _mm_add_epi64(s13[1], s15[1]); + x14[0] = _mm_sub_epi64(s12[0], s14[0]); + x14[1] = _mm_sub_epi64(s12[1], s14[1]); + x15[0] = _mm_sub_epi64(s13[0], s15[0]); + x15[1] = _mm_sub_epi64(s13[1], s15[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); 
+ x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 4 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x7[0], x6[0]); + s7[0] = _mm_sub_epi32(x7[0], x6[0]); + s10[0] = _mm_add_epi32(x11[0], x10[0]); + s11[0] = _mm_sub_epi32(x11[0], x10[0]); + s14[0] = _mm_add_epi32(x14[0], x15[0]); + s15[0] = _mm_sub_epi32(x14[0], x15[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10); + highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11); + highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14); + highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x10[0] = dct_const_round_shift_64bit(s10[0]); + x10[1] = dct_const_round_shift_64bit(s10[1]); + x11[0] = dct_const_round_shift_64bit(s11[0]); + x11[1] = dct_const_round_shift_64bit(s11[1]); + x14[0] = dct_const_round_shift_64bit(s14[0]); + x14[1] = dct_const_round_shift_64bit(s14[1]); + x15[0] = dct_const_round_shift_64bit(s15[0]); + x15[1] = dct_const_round_shift_64bit(s15[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]); + io[2] = x12[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[4] = x6[0]; + io[5] = x14[0]; + io[6] = x10[0]; + io[7] = x2[0]; + io[8] = x3[0]; + io[9] = x11[0]; + io[10] = x15[0]; + io[11] = x7[0]; + io[12] = x5[0]; + io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]); + io[14] = x9[0]; + io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct16_8col(in, in); + } else { + vpx_iadst16_8col_sse2(in); + } + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct16_8col(out, out); + } else { + vpx_iadst16_8col_sse2(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct16_4col_sse4_1(in); + } else { + highbd_iadst16_4col_sse4_1(in); 
+ } + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct16_4col_sse4_1(out); + } else { + highbd_iadst16_4col_sse4_1(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c new file mode 100644 index 000000000..7d949b6db --- /dev/null +++ b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst8_sse4_1(__m128i *const io) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5); + x0[0] = _mm_add_epi64(s0[0], s4[0]); + x0[1] = _mm_add_epi64(s0[1], s4[1]); + x1[0] = _mm_add_epi64(s1[0], s5[0]); + x1[1] = _mm_add_epi64(s1[1], s5[1]); + x4[0] = _mm_sub_epi64(s0[0], s4[0]); + x4[1] = _mm_sub_epi64(s0[1], s4[1]); + x5[0] = _mm_sub_epi64(s1[0], s5[0]); + x5[1] = _mm_sub_epi64(s1[1], s5[1]); + + highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7); + x2[0] = _mm_add_epi64(s2[0], s6[0]); + x2[1] = _mm_add_epi64(s2[1], s6[1]); + x3[0] = _mm_add_epi64(s3[0], s7[0]); + 
x3[1] = _mm_add_epi64(s3[1], s7[1]); + x6[0] = _mm_sub_epi64(s2[0], s6[0]); + x6[1] = _mm_sub_epi64(s2[1], s6[1]); + x7[0] = _mm_sub_epi64(s3[0], s7[0]); + x7[1] = _mm_sub_epi64(s3[1], s7[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + s0[0] = pack_4(x0[0], x0[1]); // s0 = x0; + s1[0] = pack_4(x1[0], x1[1]); // s1 = x1; + s2[0] = pack_4(x2[0], x2[1]); // s2 = x2; + s3[0] = pack_4(x3[0], x3[1]); // s3 = x3; + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 2 + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 3 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x6[0], x7[0]); + s7[0] = _mm_sub_epi32(x6[0], x7[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[2] = x6[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]); + io[4] = x3[0]; + io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]); + io[6] = x5[0]; + io[7] = 
_mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + highbd_iadst8_sse4_1(&io[8]); + } + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_iadst8_sse4_1(&io[8]); + } + highbd_idct8x8_final_round(io); + } + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 6996260e2..ad693718c 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -10,8 +10,6 @@ #include "./vp9_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" -#include "vpx_ports/mem.h" void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { @@ -22,23 +20,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[1] = load_input_data8(input + 8); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct4_sse2(in); idct4_sse2(in); break; - case 1: // ADST_DCT + case ADST_DCT: idct4_sse2(in); iadst4_sse2(in); break; - case 2: // DCT_ADST + 
case DCT_ADST: iadst4_sse2(in); idct4_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst4_sse2(in); iadst4_sse2(in); break; - default: assert(0); break; } // Final round and shift @@ -67,23 +65,23 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { - case 0: // DCT_DCT - idct8_sse2(in); - idct8_sse2(in); + case DCT_DCT: + vpx_idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 1: // ADST_DCT - idct8_sse2(in); + case ADST_DCT: + vpx_idct8_sse2(in); iadst8_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst8_sse2(in); - idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst8_sse2(in); iadst8_sse2(in); break; - default: assert(0); break; } // Final rounding and shift @@ -201,23 +199,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, load_buffer_8x16(input, in1); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 1: // ADST_DCT + case ADST_DCT: idct16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - case 2: // DCT_ADST + case DCT_ADST: iadst16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - default: assert(0); break; } write_buffer_8x16(dest, in0, stride); diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index 2f2f0055a..ed43de701 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -464,10 +464,6 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5); } } - if (cpi->svc.spatial_layer_id > 0) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 12; - } if (cpi->oxcf.rc_mode == VPX_VBR) { // To be adjusted for VBR mode, e.g., based on gf period and boost. // For now use smaller qp-delta (than CBR), no second boosted seg, and diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c index 2f7e54433..52a81afb5 100644 --- a/vp9/encoder/vp9_context_tree.c +++ b/vp9/encoder/vp9_context_tree.c @@ -12,7 +12,10 @@ #include "vp9/encoder/vp9_encoder.h" static const BLOCK_SIZE square[] = { - BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, + BLOCK_8X8, + BLOCK_16X16, + BLOCK_32X32, + BLOCK_64X64, }; static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 682477df1..6517fb358 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1513,9 +1513,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } } - if (is_key_frame || (low_res && - vt.split[i].split[j].part_variances.none.variance > - threshold_4x4avg)) { + if (is_key_frame || + (low_res && vt.split[i].split[j].part_variances.none.variance > + threshold_4x4avg)) { force_split[split_index] = 0; // Go down to 4x4 down-sampling for variance. variance4x4downsample[i2 + j] = 1; @@ -3403,9 +3403,10 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // Rate and distortion based partition search termination clause. 
if (!cpi->sf.ml_partition_search_early_termination && - !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { + !x->e_mbd.lossless && + ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr))) { do_rect = 0; } } @@ -4620,8 +4621,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); - CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * - sizeof(*cpi->tile_data))); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; for (tile_row = 0; tile_row < tile_rows; ++tile_row) diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index f3c17f255..970077d89 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -50,7 +50,8 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { } static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 10, 6 }, { 8, 5 }, + { 10, 6 }, + { 8, 5 }, }; // 'num' can be negative, but 'shift' must be non-negative. @@ -200,9 +201,9 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const int band_next = band_translate[i + 1]; const int token_next = (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; - unsigned int( - *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - token_costs + band_next; + unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS] + [ENTROPY_TOKENS] = + token_costs + band_next; token_cache[rc] = vp9_pt_energy_class[t0]; ctx_next = get_coef_context(nb, token_cache, i + 1); token_tree_sel_next = (x == 0); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 1e91d0155..fd3889dda 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -65,12 +65,12 @@ #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 -#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv - // for altref computation. -#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision - // mv. Choose a very high value for - // now so that HIGH_PRECISION is always - // chosen. +// Whether to use high precision mv for altref computation. +#define ALTREF_HIGH_PRECISION_MV 1 + +// Q threshold for high precision mv. Choose a very high value for now so that +// HIGH_PRECISION is always chosen. +#define HIGH_PRECISION_MV_QTHRESH 200 #define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold #define FRAME_RATE_FACTOR 8 @@ -547,6 +547,74 @@ static void apply_active_map(VP9_COMP *cpi) { } } +static void apply_roi_map(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *const seg = &cm->seg; + vpx_roi_map_t *roi = &cpi->roi; + const int *delta_q = roi->delta_q; + const int *delta_lf = roi->delta_lf; + const int *skip = roi->skip; + int ref_frame[8]; + int internal_delta_q[MAX_SEGMENTS]; + int i; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + + // TODO(jianj): Investigate why ROI is not working in speed < 5 or in + // non-realtime mode.
+ if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return; + if (!roi->enabled) return; + + memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame)); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + // Select delta coding method. + seg->abs_delta = SEGMENT_DELTADATA; + + memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols)); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + // Translate the external delta q values to internal values. + internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i])); + if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i]; + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); + if (internal_delta_q[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]); + } + if (delta_lf[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); + vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]); + } + if (skip[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_SKIP); + vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]); + } + if (ref_frame[i] >= 0) { + int valid_ref = 1; + // ALTREF is not used as reference for nonrd_pickmode with 0 lag. + if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode) + valid_ref = 0; + // If GOLDEN is selected, make sure it's set as reference. + if (ref_frame[i] == GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { + valid_ref = 0; + } + // GOLDEN was updated in the previous encoded frame, so GOLDEN and LAST + // are the same reference. + if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0) + ref_frame[i] = LAST_FRAME; + if (valid_ref) { + vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]); + } + } + } + roi->enabled = 1; +} + static void init_level_info(Vp9LevelInfo *level_info) { Vp9LevelStats *const level_stats = &level_info->level_stats; Vp9LevelSpec *const level_spec = &level_info->level_spec; @@ -557,6 +625,13 @@ static void init_level_info(Vp9LevelInfo *level_info) { level_spec->min_altref_distance = INT_MAX; } +static int check_seg_range(int seg_data[8], int range) { + return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range || + abs(seg_data[2]) > range || abs(seg_data[3]) > range || + abs(seg_data[4]) > range || abs(seg_data[5]) > range || + abs(seg_data[6]) > range || abs(seg_data[7]) > range); +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { int i; const Vp9LevelSpec *this_level; @@ -583,6 +658,61 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; } +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]) { + VP9_COMMON *cm = &cpi->common; + vpx_roi_map_t *roi = &cpi->roi; + const int range = 63; + const int ref_frame_range = 3; // Alt-ref + const int skip_range = 1; + const int frame_rows = cpi->common.mi_rows; + const int frame_cols = cpi->common.mi_cols; + + // Check that the number of rows and columns match. + if (frame_rows != (int)rows || frame_cols != (int)cols) { + return -1; + } + + if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || + !check_seg_range(ref_frame, ref_frame_range) || + !check_seg_range(skip, skip_range)) + return -1; + + // Also disable segmentation if no deltas are specified.
+ if (!map || + (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] | + delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] | + delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] | + delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] | + skip[5] | skip[6] | skip[7]) && + (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 && + ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 && + ref_frame[6] == -1 && ref_frame[7] == -1))) { + vp9_disable_segmentation(&cm->seg); + cpi->roi.enabled = 0; + return 0; + } + + if (roi->roi_map) { + vpx_free(roi->roi_map); + roi->roi_map = NULL; + } + CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols)); + + // Copy to ROI structure in the compressor. + memcpy(roi->roi_map, map, rows * cols); + memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0])); + memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0])); + memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0])); + memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0])); + roi->enabled = 1; + roi->rows = rows; + roi->cols = cols; + + return 0; +} + int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols) { if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { @@ -817,6 +947,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->active_map.map); cpi->active_map.map = NULL; + vpx_free(cpi->roi.roi_map); + cpi->roi.roi_map = NULL; + vpx_free(cpi->consec_zero_mv); cpi->consec_zero_mv = NULL; @@ -1121,8 +1254,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a - // target of 1/4x1/4. + // target of 1/4x1/4. number_spatial_layers must be greater than 2.
+ if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + cpi->svc.number_spatial_layers > 2) { cpi->svc.scaled_temp_is_alloc = 1; if (vpx_realloc_frame_buffer( &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1, @@ -2017,8 +2151,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); - CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR( + cm, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); @@ -3630,6 +3765,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // it may be pretty bad for rate-control, // and I should handle it somehow vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else if (cpi->roi.enabled && cm->frame_type != KEY_FRAME) { + apply_roi_map(cpi); } apply_active_map(cpi); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index d723d93cb..2989af35e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -723,6 +723,8 @@ typedef struct VP9_COMP { uint8_t *count_arf_frame_usage; uint8_t *count_lastgolden_frame_usage; + + vpx_roi_map_t roi; } VP9_COMP; void vp9_initialize_enc(void); @@ -868,9 +870,8 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { #if CONFIG_VP9_TEMPORAL_DENOISING static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { - return (!cpi->use_svc || - (cpi->use_svc && - cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); + return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= + cpi->svc.first_layer_denoise)); } #endif @@ -938,6 +939,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width, VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]); + void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_set_row_mt(VP9_COMP *cpi); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 4093b5529..f4fda0965 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -731,9 +731,8 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, // Exclude any image dead zone if (fp_acc_data->image_data_start_row > 0) { fp_acc_data->intra_skip_count = - VPXMAX(0, - fp_acc_data->intra_skip_count - - (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + VPXMAX(0, fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); } fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; @@ -2238,9 +2237,6 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, } gf_group->arf_update_idx[0] = arf_buffer_indices[0]; gf_group->arf_ref_idx[0] = arf_buffer_indices[0]; - - // Step over the golden frame / overlay frame - if (EOF == input_stats(twopass, &frame_stats)) return; } // Deduct the boost bits for arf (or gf if it is not a key frame) @@ -2285,7 +2281,8 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // Define middle frame mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; - normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); + normal_frames = + rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); if (normal_frames > 1) normal_frame_bits = 
(int)(total_group_bits / normal_frames); else @@ -2551,9 +2548,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Break out conditions. - if ( - // Break at active_max_gf_interval unless almost totally static. - ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || + // Break at maximum of active_max_gf_interval unless almost totally static. + if (((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && + (i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || ( // Don't break out with a very short interval. (i >= active_min_gf_interval) && @@ -2573,8 +2570,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && - (i >= rc->min_gf_interval)) { + if ((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && allow_alt_ref && + (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { const int forward_frames = (rc->frames_to_key - i >= i - 1) ? i - 1 : VPXMAX(0, rc->frames_to_key - i); @@ -2602,7 +2599,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #endif // Set the interval until the next gf. - rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); + rc->baseline_gf_interval = + (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) + ? (i - (is_key_frame || rc->source_alt_ref_pending)) + : i; // Only encode alt reference frame in temporal base layer. So // baseline_gf_interval should be multiple of a temporal layer group @@ -2700,13 +2700,24 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #endif } +// Intra / Inter threshold very low +#define VERY_LOW_II 1.5 +// For clean slide transitions we expect a sharp single-frame spike in error. +#define ERROR_SPIKE 5.0 + // Slide show transition detection. // Tests for case where there is very low error either side of the current frame // but much higher just for this frame. This can help detect key frames in // slide shows even where the slides are pictures of different sizes. +// Also requires that intra and inter errors are very similar to help eliminate +// harmful false positives. // It will not help if the transition is a fade or other multi-frame effect. -static int slide_transition(double this_err, double last_err, double next_err) { - return (this_err > (last_err * 5.0)) && (this_err > (next_err * 5.0)); +static int slide_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) && + (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) && + (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE)); } // Threshold for use of the lagging second reference frame.
High second ref @@ -2753,8 +2764,7 @@ static int test_candidate_kf(TWO_PASS *twopass, if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || - (slide_transition(this_frame->coded_error, last_frame->coded_error, - next_frame->coded_error)) || + (slide_transition(this_frame, last_frame, next_frame)) || ((pcnt_intra > MIN_INTRA_LEVEL) && (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && ((this_frame->intra_error / @@ -3019,8 +3029,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double zm_factor; // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + // For the first frame in a kf group the second ref indicator is invalid. + if (i > 0) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } else { + zero_motion_accumulator = + next_frame.pcnt_inter - next_frame.pcnt_motion; + } // Factor 0.75-1.25 based on how much of frame is static. zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); @@ -3056,10 +3072,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Apply various clamps for min and max boost - rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + // Special case for static / slide show content but don't apply + // if the kf group is very short. + if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { + rc->kf_boost = VPXMAX((rc->frames_to_key * 100), MAX_KF_TOT_BOOST); + } else { + // Apply various clamps for min and max boost + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + } // Work out how many bits to allocate for the key frame itself.
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 000ecd779..aa497e3da 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -120,12 +120,12 @@ typedef enum { typedef struct { unsigned char index; unsigned char first_inter_index; - RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; - FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; - int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; + RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1]; + FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1]; + unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1]; + unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; + unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; + int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1]; } GF_GROUP; typedef struct { diff --git a/vp9/encoder/vp9_mbgraph.h b/vp9/encoder/vp9_mbgraph.h index df2fb98ef..c3af972bc 100644 --- a/vp9/encoder/vp9_mbgraph.h +++ b/vp9/encoder/vp9_mbgraph.h @@ -25,7 +25,9 @@ typedef struct { } ref[MAX_REF_FRAMES]; } MBGRAPH_MB_STATS; -typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; struct VP9_COMP; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 44f01be25..37406c232 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1785,7 +1785,10 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { } static const MV search_pos[4] = { - { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, }; unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, @@ -1876,7 +1879,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, { const uint8_t *const pos[4] = { - ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, }; cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index f2f323a28..212c260fa 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1488,7 +1488,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int skip_ref_find_pred[4] = { 0 }; unsigned int sse_zeromv_normalized = UINT_MAX; unsigned int best_sse_sofar = UINT_MAX; - unsigned int thresh_svc_skip_golden = 500; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; @@ -1496,11 +1495,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #endif INTERP_FILTER filter_gf_svc = EIGHTTAP; MV_REFERENCE_FRAME best_second_ref_frame = NONE; + const struct segmentation *const seg = &cm->seg; int comp_modes = 0; int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; int flag_svc_subpel = 0; int svc_mv_col = 0; int svc_mv_row = 0; + unsigned int thresh_svc_skip_golden = 500; + // Lower the skip threshold if lower spatial layer is better quality relative + // to current layer. 
+ if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex > 150 && + cm->base_qindex > cpi->svc.lower_layer_qindex + 15) + thresh_svc_skip_golden = 100; + // Increase skip threshold if lower spatial layer is lower quality relative + // to current layer. + else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex < 140 && + cm->base_qindex < cpi->svc.lower_layer_qindex - 20) + thresh_svc_skip_golden = 1000; init_ref_frame_cost(cm, xd, ref_frame_cost); @@ -1638,6 +1649,16 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) comp_modes = 2; + // If the segment reference frame feature is enabled and it's set to GOLDEN + // reference, then make sure we don't skip checking GOLDEN, this is to + // prevent possibility of not picking any mode. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) { + usable_ref_frame = GOLDEN_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 0; + thresh_svc_skip_golden = 0; + } + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, @@ -1699,6 +1720,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (ref_frame > usable_ref_frame) continue; if (skip_ref_find_pred[ref_frame]) continue; + // If the segment reference frame feature is enabled then do nothing if the + // current ref frame is not allowed. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) { force_gf_mv = 1; // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), @@ -1713,7 +1740,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (comp_pred) { - const struct segmentation *const seg = &cm->seg; if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -1785,29 +1811,34 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (sf->reference_masking && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == LAST_FRAME)) { - if (usable_ref_frame < ALTREF_FRAME) { - if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { - i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) - if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) - ref_frame_skip_mask |= (1 << ref_frame); + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we end + // up unable to pick any mode. + if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) { + if (sf->reference_masking && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? 
GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & flag_list[i])) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & flag_list[ref1]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & flag_list[ref2]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) + ref_frame_skip_mask |= (1 << ref_frame); } - } else if (!cpi->rc.is_src_frame_alt_ref && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == ALTREF_FRAME)) { - int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; - int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) - ref_frame_skip_mask |= (1 << ref_frame); } + if (ref_frame_skip_mask & (1 << ref_frame)) continue; } - if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. for (i = 0; i < MAX_MB_PLANE; i++) { @@ -2202,9 +2233,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // For spatial enhancemanent layer: perform intra prediction only if base // layer is chosen as the reference. Always perform intra prediction if - // LAST is the only reference or is_key_frame is set. + // LAST is the only reference, or is_key_frame is set, or on base + // temporal layer. if (cpi->svc.spatial_layer_id) { perform_intra_pred = + cpi->svc.temporal_layer_id == 0 || cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame || !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && @@ -2214,6 +2247,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.is_src_frame_alt_ref) perform_intra_pred = 0; + + // If the segment reference frame feature is enabled and set then + // skip the intra prediction. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0) + perform_intra_pred = 0; + // Perform intra prediction search, if the best SAD is above a certain // threshold. if (best_rdc.rdcost == INT64_MAX || diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 4a7ccc4e5..2e4c9d2bf 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -31,10 +31,13 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_ratectrl.h" -// Max rate target for 1080P and below encodes under normal circumstances -// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +// Max rate per frame for 1080P and below encodes if no level requirement is given. +// For larger formats limit to MAX_MB_RATE bits per MB. +// 4Mbits is derived from the level requirement for level 4 (1080P 30) which +// requires that HW can sustain a rate of 16Mbits over a 4-frame group. +// If a lower level requirement is specified then this may override this value.
#define MAX_MB_RATE 250 -#define MAXRATE_1080P 2025000 +#define MAXRATE_1080P 4000000 #define DEFAULT_KF_BOOST 2000 #define DEFAULT_GF_BOOST 2000 @@ -1100,6 +1103,9 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, // Baseline value derived from cpi->active_worst_quality and kf boost. active_best_quality = get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); + if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { + active_best_quality /= 4; + } // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -1490,6 +1496,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0; rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; + if (cpi->use_svc && + cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) + cpi->svc.lower_layer_qindex = cm->base_qindex; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -1584,9 +1593,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { // Adjust boost and af_ratio based on avg_frame_low_motion, which varies // between 0 and 100 (stationary, 100% zero/small motion). rc->gfu_boost = - VPXMAX(500, - DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); @@ -1861,13 +1869,8 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( cpi->framerate, rc->min_gf_interval); - // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; - - if (is_altref_enabled(cpi)) { - if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; - } + // Extended max interval for genuinely static scenes like slide shows. + rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; @@ -1911,12 +1914,12 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); // A maximum bitrate for a frame is defined. - // The baseline for this aligns with HW implementations that - // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits - // per 16x16 MB (averaged over a frame). However this limit is extended if - // a very high rate is given on the command line or the the rate cannnot - // be acheived because of a user specificed max q (e.g. when the user - // specifies lossless encode. + // However this limit is extended if a very high rate is given on the command + // line or the rate cannot be achieved because of a user specified max q + // (e.g. when the user specifies lossless encode). + // + // If a level is specified that requires a lower maximum rate then the level + // value takes precedence.
vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / 100); diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index c1b210677..3a40e0138 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -34,6 +34,14 @@ extern "C" { #define FRAME_OVERHEAD_BITS 200 +// Threshold used to define a KF group as static (e.g. a slide show). +// Essentially this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. +#define STATIC_KF_GROUP_THRESH 99 + +// The maximum duration of a GF group that is static (for example a slide show). +#define MAX_STATIC_GF_GROUP_LENGTH 250 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 2ba6378c5..90f06720b 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -59,7 +59,9 @@ typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; -typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION; +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; +} REF_DEFINITION; struct rdcost_block_args { const VP9_COMP *cpi; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index eb39bab25..389d48f21 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -45,8 +45,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->ext_lst_fb_idx[sl] = 0; svc->ext_gld_fb_idx[sl] = 1; svc->ext_alt_fb_idx[sl] = 2; - svc->downsample_filter_type[sl] = EIGHTTAP; - svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter. + svc->downsample_filter_type[sl] = BILINEAR; + svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. 
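// Editor's note on the default-filter change above: assuming the scaler's
// usual 16-phase (1/16-pel) convention, phase 8 places the BILINEAR taps
// halfway between source pixels, so the kernel reduces to a plain two-tap
// average, which is what the "averaging filter" comment refers to.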
} if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { @@ -155,6 +155,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, int sl, tl, layer = 0, spatial_layer_target; float bitrate_alloc = 1.0; + cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode; + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { @@ -547,6 +549,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { if (!spatial_id) { cpi->ref_frame_flags = VP9_LAST_FLAG; } else { + if (spatial_id == cpi->svc.number_spatial_layers - 1) + cpi->ext_refresh_alt_ref_frame = 0; cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; } } diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 8b708e271..87686fe59 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -103,6 +103,8 @@ typedef struct SVC { int first_layer_denoise; int skip_enhancement_layer; + + int lower_layer_qindex; } SVC; struct VP9_COMP; diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index dbd243ac1..293cdcd67 100644 --- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -170,13 +170,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, fadst4_sse2(in); write_buffer_4x4(output, in); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_4x4(input, in, stride); fadst4_sse2(in); fadst4_sse2(in); write_buffer_4x4(output, in); break; - default: assert(0); break; } } @@ -1097,14 +1097,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_8x8(input, in, stride); fadst8_sse2(in); fadst8_sse2(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - default: assert(0); break; } } @@ -1963,13 +1963,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - default: assert(0); break; } } diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c new file mode 100644 index 000000000..4bebc34d6 --- /dev/null +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> // AVX2 + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +// Zero fill 8 positions in the output buffer. 
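// Editor's note: tran_low_t is 32 bits wide when CONFIG_VP9_HIGHBITDEPTH is
// set and 16 bits otherwise, so each call below clears the same
// 16-coefficient span either with two 256-bit stores (32-bit lanes) or with a
// single 256-bit store (16-bit lanes).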
+static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m256i zero = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(a), zero); + _mm256_storeu_si256((__m256i *)(a + 8), zero); +#else + _mm256_storeu_si256((__m256i *)(a), zero); +#endif +} + +static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, + __m256i *coeff256) { + const __m256i iscan = _mm256_loadu_si256(iscan_ptr); + const __m256i zero256 = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as + // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly. + const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8); + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256); +#else + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); +#endif + const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); + // Add one to convert from indices to counts + const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); + return _mm256_and_si256(iscan_plus_one, nzero_coeff0); +} + +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + __m128i eob; + __m256i round256, quant256, dequant256; + __m256i eob256, thr256; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + coeff_ptr += n_coeffs; + iscan_ptr += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + { + __m256i coeff256; + + // Setup global values + { + const __m128i round = _mm_load_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); + round256 = _mm256_castsi128_si256(round); + round256 = _mm256_permute4x64_epi64(round256, 0x54); + + quant256 = _mm256_castsi128_si256(quant); + quant256 = _mm256_permute4x64_epi64(quant256, 0x54); + + dequant256 = _mm256_castsi128_si256(dequant); + dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); + } + + { + __m256i qcoeff256; + __m256i qtmp256; + coeff256 = load_tran_low(coeff_ptr + n_coeffs); + qcoeff256 = _mm256_abs_epi16(coeff256); + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + } + + eob256 = scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256); + n_coeffs += 8 * 2; + } + + // remove dc constants + dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); + quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); + round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + + thr256 = _mm256_srai_epi16(dequant256, 1); + + // AC only loop + while (n_coeffs < 0) { + __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs); + __m256i qcoeff256 = _mm256_abs_epi16(coeff256); + int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); + + if (nzflag) { + __m256i qtmp256; + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = 
_mm256_sign_epi16(qtmp256, coeff256); + store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + eob256 = _mm256_max_epi16( + eob256, + scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256)); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + } + n_coeffs += 8 * 2; + } + + eob = _mm_max_epi16(_mm256_castsi256_si128(eob256), + _mm256_extracti128_si256(eob256, 1)); + + *eob_ptr = accumulate_eob(eob); +} diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 9819fb641..721e170bf 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -64,10 +64,13 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht_neon.h ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c @@ -78,10 +81,11 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c else +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht4x4_add_neon.c VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c endif $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 5bfe9aa05..40f7ab531 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1067,12 +1067,11 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, vpx_codec_frame_flags_t flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY || - (cpi->use_svc && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame)) + (cpi->use_svc && cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id] + .is_key_frame)) flags |= VPX_FRAME_IS_KEY; if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE; @@ -1234,6 +1233,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, ctx->pending_frame_magnitude |= size; cx_data += size; cx_data_sz -= size; + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = 
cpi->common.height; if (ctx->output_cx_pkt_cb.output_cx_pkt) { pkt.kind = VPX_CODEC_CX_FRAME_PKT; @@ -1260,8 +1261,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( timebase, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); - pkt.data.frame.width = cpi->common.width; - pkt.data.frame.height = cpi->common.height; + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; if (ctx->pending_cx_data) { if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; @@ -1340,9 +1341,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, @@ -1356,9 +1356,8 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, vp9_copy_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, @@ -1371,9 +1370,8 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, @@ -1383,9 +1381,8 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, if (config != NULL) { ctx->preview_ppcfg = *config; return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; #else (void)ctx; (void)args; @@ -1407,17 +1404,24 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) { yuvconfig2image(&ctx->preview_img, &sd, NULL); return &ctx->preview_img; - } else { - return NULL; } + return NULL; } static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, va_list args) { - (void)ctx; - (void)args; + vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); - // TODO(yaowu): Need to re-implement and test for VP9. 
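/* Editor's sketch of how an application might drive the re-implemented
 * control; the map buffer and delta values are illustrative assumptions, not
 * taken from the patch:
 *
 *   vpx_roi_map_t roi;
 *   memset(&roi, 0, sizeof(roi));
 *   roi.rows = (cfg.g_h + 7) / 8;  // VP9 maps one id per 8x8 block
 *   roi.cols = (cfg.g_w + 7) / 8;
 *   roi.roi_map = map;             // segment id 0-7 for each block
 *   roi.delta_q[1] = -10;          // e.g. boost quality in segment 1
 *   vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi);
 */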
+ if (data) { + vpx_roi_map_t *roi = (vpx_roi_map_t *)data; + + if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame)) { + return VPX_CODEC_OK; + } + return VPX_CODEC_INVALID_PARAM; + } return VPX_CODEC_INVALID_PARAM; } @@ -1429,11 +1433,10 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_set_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, @@ -1444,11 +1447,10 @@ static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_get_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, @@ -1460,9 +1462,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, (VPX_SCALING)mode->v_scaling_mode); return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { @@ -1608,7 +1609,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { // Setters { VP8_SET_REFERENCE, ctrl_set_reference }, { VP8_SET_POSTPROC, ctrl_set_previewpp }, - { VP8E_SET_ROI_MAP, ctrl_set_roi_map }, + { VP9E_SET_ROI_MAP, ctrl_set_roi_map }, { VP8E_SET_ACTIVEMAP, ctrl_set_active_map }, { VP8E_SET_SCALEMODE, ctrl_set_scale_mode }, { VP8E_SET_CPUUSED, ctrl_set_cpuused }, diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index d633ed142..6186d4614 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -103,6 +103,7 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 4390cf7c8..1cf2dca69 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -12,8 +12,11 @@ * \brief Provides the high level interface to wrap encoder algorithms. 
* */ +#include <assert.h> +#include <limits.h> +#include <stdlib.h> #include <string.h> +#include "vp8/common/blockd.h" #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -81,6 +84,8 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( int i; void *mem_loc = NULL; + if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; + if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) { for (i = 0; i < num_enc; i++) { vpx_codec_priv_enc_mr_cfg_t mr_cfg; @@ -89,28 +94,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || dsf->den > dsf->num) { res = VPX_CODEC_INVALID_PARAM; - break; + } else { + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc - 1 - i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + /* Force Key-frame synchronization. Namely, an encoder at a higher + * resolution always uses the same frame_type chosen by the + * lowest-resolution encoder. + */ + if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); } - mr_cfg.mr_low_res_mode_info = mem_loc; - mr_cfg.mr_total_resolutions = num_enc; - mr_cfg.mr_encoder_id = num_enc - 1 - i; - mr_cfg.mr_down_sampling_factor.num = dsf->num; - mr_cfg.mr_down_sampling_factor.den = dsf->den; - - /* Force Key-frame synchronization. Namely, encoder at higher * resolution always use the same frame_type chosen by the * lowest-resolution encoder. */ - if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; - - ctx->iface = iface; - ctx->name = iface->name; - ctx->priv = NULL; - ctx->init_flags = flags; - ctx->config.enc = cfg; - res = ctx->iface->init(ctx, &mr_cfg); - if (res) { const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; /* Destroy current ctx */ @@ -124,10 +128,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( vpx_codec_destroy(ctx); i--; } +#if CONFIG_MULTI_RES_ENCODING + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); +#endif + return SAVE_STATUS(ctx, res); } - if (res) break; - ctx++; cfg++; dsf++; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 6c67ec24f..261ed8614 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -125,7 +125,7 @@ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); enum vp8e_enc_control_id { /*!\brief Codec control function to pass an ROI map to encoder. * - * Supported in codecs: VP8, VP9 + * Supported in codecs: VP8 */ VP8E_SET_ROI_MAP = 8, @@ -423,6 +423,12 @@ enum vp8e_enc_control_id { */ VP9E_SET_SVC, + /*!\brief Codec control function to pass an ROI map to encoder. + * + * Supported in codecs: VP9 + */ + VP9E_SET_ROI_MAP, + /*!\brief Codec control function to set parameters for SVC. * \note Parameters contain min_q, max_q, scaling factor for each of the * SVC layers. @@ -643,16 +649,20 @@ typedef enum vp9e_temporal_layering_mode { */ typedef struct vpx_roi_map { - /*! An id between 0 and 3 for each 16x16 region within a frame. */ + /*! Whether ROI is enabled. */ + uint8_t enabled; + /*! An id between 0-3 (0-7 for VP9) for each 16x16 (8x8 for VP9) + * region within a frame. */ unsigned char *roi_map; unsigned int rows; /**< Number of rows. */ unsigned int cols; /**< Number of columns.
*/ - // TODO(paulwilkins): broken for VP9 which has 8 segments - // q and loop filter deltas for each segment - // (see MAX_MB_SEGMENTS) - int delta_q[4]; /**< Quantizer deltas. */ - int delta_lf[4]; /**< Loop filter deltas. */ - /*! Static breakout threshold for each segment. */ + /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */ + int delta_q[8]; /**< Quantizer deltas. */ + int delta_lf[8]; /**< Loop filter deltas. */ + /*! The skip and ref_frame segments are only used in VP9. */ + int skip[8]; /**< Skip this block. */ + int ref_frame[8]; /**< Reference frame for this block. */ + /*! Static breakout threshold for each segment. Only used in VP8. */ unsigned int static_threshold[4]; } vpx_roi_map_t; @@ -749,6 +759,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) #define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) #define VPX_CTRL_VP8E_SET_ROI_MAP +VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) +#define VPX_CTRL_VP9E_SET_ROI_MAP VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP8E_SET_ACTIVEMAP VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h index 41c53e48d..398c67022 100644 --- a/vpx/vp8dx.h +++ b/vpx/vp8dx.h @@ -179,6 +179,8 @@ VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER #define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index de964eb46..f9a2ed374 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -63,7 +63,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (7 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (8 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -182,8 +182,10 @@ typedef struct vpx_codec_cx_pkt { * Only applicable when "output partition" mode is enabled. First * partition has id 0.*/ int partition_id; - unsigned int width; /**< frame width */ - unsigned int height; /**< frame height */ + /*!\brief Width and height of frames in this packet. VP8 will only use the + * first entry. */ + unsigned int width[VPX_SS_MAX_LAYERS]; /**< frame width */ + unsigned int height[VPX_SS_MAX_LAYERS]; /**< frame height */ } frame; /**< data for compressed frame packet */ vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h index ad70cdd57..ac17c529d 100644 --- a/vpx/vpx_frame_buffer.h +++ b/vpx/vpx_frame_buffer.h @@ -57,7 +57,7 @@ typedef struct vpx_codec_frame_buffer { * return 0. On any failure the callback must return a value less than 0.
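* Editor's sketch of a conforming callback; the function name is an
* illustrative assumption, not part of this header:
*
*   int example_get_frame_buffer(void *priv, size_t min_size,
*                                vpx_codec_frame_buffer_t *fb) {
*     (void)priv;
*     fb->data = (uint8_t *)malloc(min_size);
*     if (fb->data == NULL) return -1;
*     fb->size = min_size;
*     fb->priv = NULL;
*     return 0;
*   }
*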
* * \param[in] priv Callback's private data - * \param[in] new_size Size in bytes needed by the buffer + * \param[in] min_size Size in bytes needed by the buffer * \param[in,out] fb Pointer to vpx_codec_frame_buffer_t */ typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, diff --git a/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/vpx_dsp/arm/highbd_idct16x16_add_neon.c index 5358839b5..3fa2f9e28 100644 --- a/vpx_dsp/arm/highbd_idct16x16_add_neon.c +++ b/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -14,58 +14,33 @@ #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t, - int32x4x2_t *const d0, - int32x4x2_t *const d1) { - int32x2x2_t t32[4]; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS); - t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS); - t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS); - t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS); - d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); - d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); - d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]); - d1->val[1] = vcombine_s32(t32[3].val[0], t32[3].val[1]); +static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) { + int32x2x2_t t32; + + t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS); + t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS); + return vcombine_s32(t32.val[0], t32.val[1]); } -static INLINE void highbd_idct16x16_add_wrap_low_4x2(const int64x2x2_t *const t, - int32x4_t *const d0, - int32x4_t *const d1) { - int32x2x2_t t32[2]; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - *d0 = vcombine_s32(t32[0].val[0], t32[0].val[1]); - *d1 = vcombine_s32(t32[1].val[0], t32[1].val[1]); +static INLINE void dct_const_round_shift_high_4_dual( + const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) { + *d0 = dct_const_round_shift_high_4(in[0]); + *d1 = dct_const_round_shift_high_4(in[1]); } static INLINE int32x4x2_t -highbd_idct16x16_add_wrap_low_8x1(const int64x2x2_t *const t) { - int32x2x2_t t32[2]; - int32x4x2_t d; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - d.val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); - d.val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); - return d; +dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = dct_const_round_shift_high_4(in[0]); + out.val[1] = dct_const_round_shift_high_4(in[1]); + return out; } -static INLINE int32x4_t highbd_idct16x16_add_wrap_low_4x1(const int64x2x2_t t) { - int32x2x2_t t32; - - t32.val[0] = vrshrn_n_s64(t.val[0], DCT_CONST_BITS); - t32.val[1] = vrshrn_n_s64(t.val[1], DCT_CONST_BITS); - return vcombine_s32(t32.val[0], t32.val[1]); +static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in, + int32x4x2_t *const d0, + 
int32x4x2_t *const d1) { + *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0); + *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2); } static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, @@ -107,7 +82,7 @@ static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, vget_low_s32(cospi_2_30_10_22), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_2_30_10_22), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, @@ -149,7 +124,7 @@ static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, vget_low_s32(cospi_4_12_20N_28), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_4_12_20N_28), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, @@ -191,7 +166,7 @@ static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, vget_low_s32(cospi_6_26N_14_18N), 1); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_6_26N_14_18N), 1); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, @@ -233,7 +208,7 @@ static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, vget_high_s32(cospi_2_30_10_22), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_2_30_10_22), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, @@ -275,7 +250,7 @@ static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, vget_high_s32(cospi_4_12_20N_28), 0); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_4_12_20N_28), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, @@ -317,7 +292,7 @@ static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, vget_high_s32(cospi_6_26N_14_18N), 1); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_6_26N_14_18N), 1); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_q_kernel( @@ -386,7 +361,7 @@ static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0, int64x2x2_t t[4]; highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, @@ -397,7 +372,7 @@ static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, int64x2x2_t t[2]; highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, @@ -412,7 +387,7 @@ static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]); t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]); t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void 
highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, @@ -425,7 +400,7 @@ static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]); t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, @@ -459,7 +434,7 @@ static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, vget_high_s32(cospi_0_8_16_24), 0); t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_0_8_16_24), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, @@ -481,7 +456,7 @@ static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, vget_high_s32(cospi_0_8_16_24), 0); t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 0); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct16x16_add_stage7_dual( @@ -815,7 +790,7 @@ static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s, t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0); t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0); t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0); - return highbd_idct16x16_add_wrap_low_8x1(t); + return dct_const_round_shift_high_4x2_int64x2x2(t); } static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, @@ -824,7 +799,7 @@ static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0); t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0); - return highbd_idct16x16_add_wrap_low_4x1(t); + return dct_const_round_shift_high_4(t); } static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, @@ -835,7 +810,7 @@ static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1); t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1); t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1); - return highbd_idct16x16_add_wrap_low_8x1(t); + return dct_const_round_shift_high_4x2_int64x2x2(t); } static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, @@ -844,7 +819,7 @@ static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1); t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1); - return highbd_idct16x16_add_wrap_low_4x1(t); + return dct_const_round_shift_high_4(t); } static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, diff --git a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c index 96a55c472..5b36f7336 100644 --- a/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -124,83 +124,77 @@ static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1, vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS)); } -static INLINE void load_s32x4q_dual( - const int32_t *in, int32x4x2_t *const s0, int32x4x2_t *const s1, - int32x4x2_t *const s2, int32x4x2_t *const s3, int32x4x2_t *const s4, - int32x4x2_t *const s5, int32x4x2_t *const s6, int32x4x2_t *const s7) { - s0->val[0] = vld1q_s32(in); - s0->val[1] = vld1q_s32(in 
+ 4); +static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) { + s[0].val[0] = vld1q_s32(in); + s[0].val[1] = vld1q_s32(in + 4); in += 32; - s1->val[0] = vld1q_s32(in); - s1->val[1] = vld1q_s32(in + 4); + s[1].val[0] = vld1q_s32(in); + s[1].val[1] = vld1q_s32(in + 4); in += 32; - s2->val[0] = vld1q_s32(in); - s2->val[1] = vld1q_s32(in + 4); + s[2].val[0] = vld1q_s32(in); + s[2].val[1] = vld1q_s32(in + 4); in += 32; - s3->val[0] = vld1q_s32(in); - s3->val[1] = vld1q_s32(in + 4); + s[3].val[0] = vld1q_s32(in); + s[3].val[1] = vld1q_s32(in + 4); in += 32; - s4->val[0] = vld1q_s32(in); - s4->val[1] = vld1q_s32(in + 4); + s[4].val[0] = vld1q_s32(in); + s[4].val[1] = vld1q_s32(in + 4); in += 32; - s5->val[0] = vld1q_s32(in); - s5->val[1] = vld1q_s32(in + 4); + s[5].val[0] = vld1q_s32(in); + s[5].val[1] = vld1q_s32(in + 4); in += 32; - s6->val[0] = vld1q_s32(in); - s6->val[1] = vld1q_s32(in + 4); + s[6].val[0] = vld1q_s32(in); + s[6].val[1] = vld1q_s32(in + 4); in += 32; - s7->val[0] = vld1q_s32(in); - s7->val[1] = vld1q_s32(in + 4); + s[7].val[0] = vld1q_s32(in); + s[7].val[1] = vld1q_s32(in + 4); } -static INLINE void transpose_and_store_s32_8x8(int32x4x2_t a0, int32x4x2_t a1, - int32x4x2_t a2, int32x4x2_t a3, - int32x4x2_t a4, int32x4x2_t a5, - int32x4x2_t a6, int32x4x2_t a7, +static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a, int32_t **out) { - transpose_s32_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); - vst1q_s32(*out, a0.val[0]); + vst1q_s32(*out, a[0].val[0]); *out += 4; - vst1q_s32(*out, a0.val[1]); + vst1q_s32(*out, a[0].val[1]); *out += 4; - vst1q_s32(*out, a1.val[0]); + vst1q_s32(*out, a[1].val[0]); *out += 4; - vst1q_s32(*out, a1.val[1]); + vst1q_s32(*out, a[1].val[1]); *out += 4; - vst1q_s32(*out, a2.val[0]); + vst1q_s32(*out, a[2].val[0]); *out += 4; - vst1q_s32(*out, a2.val[1]); + vst1q_s32(*out, a[2].val[1]); *out += 4; - vst1q_s32(*out, a3.val[0]); + vst1q_s32(*out, a[3].val[0]); *out += 4; - vst1q_s32(*out, a3.val[1]); + vst1q_s32(*out, a[3].val[1]); *out += 4; - vst1q_s32(*out, a4.val[0]); + vst1q_s32(*out, a[4].val[0]); *out += 4; - vst1q_s32(*out, a4.val[1]); + vst1q_s32(*out, a[4].val[1]); *out += 4; - vst1q_s32(*out, a5.val[0]); + vst1q_s32(*out, a[5].val[0]); *out += 4; - vst1q_s32(*out, a5.val[1]); + vst1q_s32(*out, a[5].val[1]); *out += 4; - vst1q_s32(*out, a6.val[0]); + vst1q_s32(*out, a[6].val[0]); *out += 4; - vst1q_s32(*out, a6.val[1]); + vst1q_s32(*out, a[6].val[1]); *out += 4; - vst1q_s32(*out, a7.val[0]); + vst1q_s32(*out, a[7].val[0]); *out += 4; - vst1q_s32(*out, a7.val[1]); + vst1q_s32(*out, a[7].val[1]); *out += 4; } static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) { int i; - int32x4x2_t s0, s1, s2, s3, s4, s5, s6, s7; + int32x4x2_t s[8]; for (i = 0; i < 4; i++, input += 8) { - load_s32x4q_dual(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - transpose_and_store_s32_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + load_s32x4q_dual(input, s); + transpose_and_store_s32_8x8(s, &t_buf); } } diff --git a/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/vpx_dsp/arm/highbd_idct4x4_add_neon.c index 1418a75a1..7be1dad1d 100644 --- a/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -11,27 +11,10 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void 
highbd_idct4x4_1_add_kernel1(uint16_t **dest, - const int stride, - const int16x8_t res, - const int16x8_t max) { - const uint16x4_t a0 = vld1_u16(*dest); - const uint16x4_t a1 = vld1_u16(*dest + stride); - const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); - // Note: In some profile tests, res is quite close to +/-32767. - // We use saturating addition. - const int16x8_t b = vqaddq_s16(res, a); - const int16x8_t c = vminq_s16(b, max); - const uint16x8_t d = vqshluq_n_s16(c, 0); - vst1_u16(*dest, vget_low_u16(d)); - *dest += stride; - vst1_u16(*dest, vget_high_u16(d)); - *dest += stride; -} - // res is in reverse row order static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, const int stride, @@ -65,109 +48,42 @@ void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); } -static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); - b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); - b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); - b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); - b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); - b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); - b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); - b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); - b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); - b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - -static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); - c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); - c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); - c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); - c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); - c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); - c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); - c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); - c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); - c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); - c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); - c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); - c4 = vsubq_s64(c4, c8); - c5 = vsubq_s64(c5, c9); - c6 = vaddq_s64(c6, c10); - c7 = vaddq_s64(c7, c11); - b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), - vrshrn_n_s64(c1, DCT_CONST_BITS)); - b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), - vrshrn_n_s64(c3, DCT_CONST_BITS)); - b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), - vrshrn_n_s64(c5, DCT_CONST_BITS)); - b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), - vrshrn_n_s64(c7, DCT_CONST_BITS)); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - void 
vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - int32x4_t c0 = vld1q_s32(input); - int32x4_t c1 = vld1q_s32(input + 4); - int32x4_t c2 = vld1q_s32(input + 8); - int32x4_t c3 = vld1q_s32(input + 12); - int16x8_t a0, a1; + int16x8_t a[2]; + int32x4_t c[4]; - if (bd == 8) { - const int16x4_t cospis = vld1_s16(kCospi); + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + if (bd == 8) { // Rows - a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); - a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); } else { const int32x4_t cospis = vld1q_s32(kCospi32); if (bd == 10) { - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); } else { - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); } - a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); - a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[3], 4), vqrshrn_n_s32(c[2], 4)); } - highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); - highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a[1], max); } diff --git a/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/vpx_dsp/arm/highbd_idct8x8_add_neon.c index dd90134a6..e51e574cc 100644 --- a/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -127,7 +127,7 @@ static INLINE void idct8x8_12_half1d_bd12( int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h; + int32x2_t input1l, input1h, input3l, input3h; int32x2_t step1l[2], step1h[2]; int32x4_t step1[8], step2[8]; int64x2_t t64[8]; @@ -136,23 +136,23 @@ static INLINE void idct8x8_12_half1d_bd12( transpose_s32_4x4(io0, io1, io2, io3); // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); step1l[0] = vget_low_s32(*io0); step1h[0] = vget_high_s32(*io0); step1l[1] = vget_low_s32(*io2); step1h[1] = vget_high_s32(*io2); - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, 
vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); @@ -222,9 +222,7 @@ static INLINE void idct8x8_12_half1d_bd12( *io7 = vsubq_s32(step1[0], step2[7]); } -static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint16_t *dest, +static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest, const int stride, const int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); const uint16_t *dst = dest; @@ -248,14 +246,14 @@ static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, dst += stride; d7 = vld1q_u16(dst); - d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0)); - d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1)); - d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2)); - d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3)); - d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4)); - d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5)); - d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6)); - d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7)); + d0_s16 = vqaddq_s16(a[0], vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7)); d0_s16 = vminq_s16(d0_s16, max); d1_s16 = vminq_s16(d1_s16, max); @@ -293,11 +291,13 @@ static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 8); - int32x4_t a2 = vld1q_s32(input + 16); - int32x4_t a3 = vld1q_s32(input + 24); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 8); + a[2] = vld1q_s32(input + 16); + a[3] = vld1q_s32(input + 24); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); @@ -305,54 +305,52 @@ void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t b0 = vmovn_s32(a0); - int16x4_t b1 = vmovn_s32(a1); - int16x4_t b2 = vmovn_s32(a2); - int16x4_t b3 = vmovn_s32(a3); - int16x4_t b4, b5, b6, b7; - - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4, 
- &b5, &b6, &b7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5, - b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7); - c0 = vrshrq_n_s16(c0, 5); - c1 = vrshrq_n_s16(c1, 5); - c2 = vrshrq_n_s16(c2, 5); - c3 = vrshrq_n_s16(c3, 5); - c4 = vrshrq_n_s16(c4, 5); - c5 = vrshrq_n_s16(c5, 5); - c6 = vrshrq_n_s16(c6, 5); - c7 = vrshrq_n_s16(c7, 5); + int16x4_t b[8]; + + b[0] = vmovn_s32(a[0]); + b[1] = vmovn_s32(a[1]); + b[2] = vmovn_s32(a[2]); + b[3] = vmovn_s32(a[3]); + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c); + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 - int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15; if (bd == 10) { - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } else { - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); + highbd_add8x8(c, dest, stride, bd); } static INLINE void idct8x8_64_half1d_bd10( @@ -428,8 +426,8 @@ static INLINE void idct8x8_64_half1d_bd12( 
int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, - input_7l, input_7h; + int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; int32x2_t step1l[4], step1h[4]; int32x4_t step1[8], step2[8]; int64x2_t t64[8]; @@ -438,14 +436,14 @@ static INLINE void idct8x8_64_half1d_bd12( transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); - input_5l = vget_low_s32(*io5); - input_5h = vget_high_s32(*io5); - input_7l = vget_low_s32(*io7); - input_7h = vget_high_s32(*io7); + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); + input5l = vget_low_s32(*io5); + input5h = vget_high_s32(*io5); + input7l = vget_low_s32(*io7); + input7h = vget_high_s32(*io7); step1l[0] = vget_low_s32(*io0); step1h[0] = vget_high_s32(*io0); step1l[1] = vget_low_s32(*io2); @@ -455,22 +453,22 @@ static INLINE void idct8x8_64_half1d_bd12( step1l[3] = vget_low_s32(*io6); step1h[3] = vget_high_s32(*io6); - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); - t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0); - t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0); - t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1); - t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1); - t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0); - t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); - t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); - t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1); t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); t32[2] = 
vrshrn_n_s64(t64[2], DCT_CONST_BITS); @@ -553,79 +551,83 @@ static INLINE void idct8x8_64_half1d_bd12( void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 4); - int32x4_t a2 = vld1q_s32(input + 8); - int32x4_t a3 = vld1q_s32(input + 12); - int32x4_t a4 = vld1q_s32(input + 16); - int32x4_t a5 = vld1q_s32(input + 20); - int32x4_t a6 = vld1q_s32(input + 24); - int32x4_t a7 = vld1q_s32(input + 28); - int32x4_t a8 = vld1q_s32(input + 32); - int32x4_t a9 = vld1q_s32(input + 36); - int32x4_t a10 = vld1q_s32(input + 40); - int32x4_t a11 = vld1q_s32(input + 44); - int32x4_t a12 = vld1q_s32(input + 48); - int32x4_t a13 = vld1q_s32(input + 52); - int32x4_t a14 = vld1q_s32(input + 56); - int32x4_t a15 = vld1q_s32(input + 60); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1)); - int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3)); - int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5)); - int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7)); - int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9)); - int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11)); - int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13)); - int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15)); - - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - - c0 = vrshrq_n_s16(b0, 5); - c1 = vrshrq_n_s16(b1, 5); - c2 = vrshrq_n_s16(b2, 5); - c3 = vrshrq_n_s16(b3, 5); - c4 = vrshrq_n_s16(b4, 5); - c5 = vrshrq_n_s16(b5, 5); - c6 = vrshrq_n_s16(b6, 5); - c7 = vrshrq_n_s16(b7, 5); + int16x8_t b[8]; + + b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); + + idct8x8_64_1d_bd8(cospis0, cospis1, b); + idct8x8_64_1d_bd8(cospis0, cospis1, b); + + c[0] = vrshrq_n_s16(b[0], 5); + c[1] = vrshrq_n_s16(b[1], 5); + c[2] = vrshrq_n_s16(b[2], 5); + c[3] = vrshrq_n_s16(b[3], 5); + c[4] = vrshrq_n_s16(b[4], 5); + c[5] = vrshrq_n_s16(b[5], 5); + c[6] = vrshrq_n_s16(b[6], 5); + c[7] = vrshrq_n_s16(b[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { - idct8x8_64_half1d_bd10(cospis0, cospis1, 
&a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - &a7, &a15); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } else { - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - &a7, &a15); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); - c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); + highbd_add8x8(c, dest, stride, bd); } diff --git a/vpx_dsp/arm/highbd_idct_neon.h b/vpx_dsp/arm/highbd_idct_neon.h new file mode 100644 index 000000000..92fcb7f3a --- /dev/null +++ b/vpx_dsp/arm/highbd_idct_neon.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ +#define VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); + b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); + b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); + b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + int64x2_t c[12]; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1); + c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1); + c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1); + c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1); + c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1); + c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1); + c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1); + c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1); + c[4] = vsubq_s64(c[4], c[8]); + c[5] = vsubq_s64(c[5], c[9]); + c[6] = vaddq_s64(c[6], c[10]); + c[7] = vaddq_s64(c[7], c[11]); + b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS), + vrshrn_n_s64(c[5], DCT_CONST_BITS)); + b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS), + vrshrn_n_s64(c[7], DCT_CONST_BITS)); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +#endif // VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ diff --git a/vpx_dsp/arm/idct32x32_135_add_neon.c b/vpx_dsp/arm/idct32x32_135_add_neon.c index 
021211bc9..057731ad9 100644 --- a/vpx_dsp/arm/idct32x32_135_add_neon.c +++ b/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -650,14 +650,10 @@ void vpx_idct32_16_neon(const int16_t *const input, void *const output, highbd_add_and_store_bd8(out, output, stride); } else { uint8_t *const outputT = (uint8_t *)output; - add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], outputT, stride); - add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], outputT + (8 * stride), stride); - add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], outputT + (16 * stride), stride); - add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], outputT + (24 * stride), stride); + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); } } diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c index f3c336fa3..f570547e4 100644 --- a/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -490,14 +490,10 @@ void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, highbd_add_and_store_bd8(out, output, stride); } else { uint8_t *const outputT = (uint8_t *)output; - add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], outputT, stride); - add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], outputT + (8 * stride), stride); - add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], outputT + (16 * stride), stride); - add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], outputT + (24 * stride), stride); + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); } } diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c index 673a36840..8192ee4cf 100644 --- a/vpx_dsp/arm/idct4x4_add_neon.c +++ b/vpx_dsp/arm/idct4x4_add_neon.c @@ -19,44 +19,41 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const uint8_t *dst = dest; - const int16x4_t cospis = vld1_s16(kCospi); - uint8x8_t dest01_u8; - uint32x2_t dest32_u32 = vdup_n_u32(0); - int16x8_t a0, a1; - uint8x8_t d01, d32; - uint16x8_t d01_u16, d32_u16; + uint32x2_t s32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s, d[2]; + uint16x8_t sum[2]; assert(!((intptr_t)dest % sizeof(uint32_t))); assert(!(stride % sizeof(uint32_t))); // Rows - a0 = load_tran_low_to_s16q(input); - a1 = load_tran_low_to_s16q(input + 8); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); - dest01_u8 = load_u8(dst, stride); + s = load_u8(dst, stride); dst += 2 * stride; // The elements 
are loaded in reverse order. - dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 1); dst += stride; - dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 0); - d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8); - d32_u16 = - vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); - d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); - d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), vreinterpret_u8_u32(s32)); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); - store_u8(dest, stride, d01); + store_u8(dest, stride, d[0]); dest += 2 * stride; // The elements are stored in reverse order. - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 1); dest += stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 0); } diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index 1121ade27..7471387e4 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -17,91 +17,25 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint8_t *dest, - const int stride) { - const uint8_t *dst = dest; - uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; - uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - - a0 = vrshrq_n_s16(a0, 5); - a1 = vrshrq_n_s16(a1, 5); - a2 = vrshrq_n_s16(a2, 5); - a3 = vrshrq_n_s16(a3, 5); - a4 = vrshrq_n_s16(a4, 5); - a5 = vrshrq_n_s16(a5, 5); - a6 = vrshrq_n_s16(a6, 5); - a7 = vrshrq_n_s16(a7, 5); - - d0 = vld1_u8(dst); - dst += stride; - d1 = vld1_u8(dst); - dst += stride; - d2 = vld1_u8(dst); - dst += stride; - d3 = vld1_u8(dst); - dst += stride; - d4 = vld1_u8(dst); - dst += stride; - d5 = vld1_u8(dst); - dst += stride; - d6 = vld1_u8(dst); - dst += stride; - d7 = vld1_u8(dst); - - d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0); - d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1); - d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2); - d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3); - d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4); - d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5); - d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6); - d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7); - - d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); - d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); - d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); - d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); - d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16)); - d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16)); - d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16)); - d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16)); - - vst1_u8(dest, d0); - dest += stride; - vst1_u8(dest, d1); - dest += stride; - vst1_u8(dest, d2); - dest += stride; - vst1_u8(dest, d3); - dest += stride; - vst1_u8(dest, d4); - dest += stride; - vst1_u8(dest, d5); - dest += stride; - vst1_u8(dest, d6); - dest += stride; - vst1_u8(dest, d7); -} - void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const int16x8_t cospis = 
vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t a0 = load_tran_low_to_s16q(input); - int16x8_t a1 = load_tran_low_to_s16q(input + 8); - int16x8_t a2 = load_tran_low_to_s16q(input + 16); - int16x8_t a3 = load_tran_low_to_s16q(input + 24); - int16x8_t a4 = load_tran_low_to_s16q(input + 32); - int16x8_t a5 = load_tran_low_to_s16q(input + 40); - int16x8_t a6 = load_tran_low_to_s16q(input + 48); - int16x8_t a7 = load_tran_low_to_s16q(input + 56); - - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride); + int16x8_t a[8]; + + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + a[2] = load_tran_low_to_s16q(input + 16); + a[3] = load_tran_low_to_s16q(input + 24); + a[4] = load_tran_low_to_s16q(input + 32); + a[5] = load_tran_low_to_s16q(input + 40); + a[6] = load_tran_low_to_s16q(input + 48); + a[7] = load_tran_low_to_s16q(input + 56); + + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_add8x8_neon(a, dest, stride); } void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, @@ -111,17 +45,15 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t a0, a1, a2, a3, a4, a5, a6, a7; - int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x4_t a[8]; + int16x8_t b[8]; - a0 = load_tran_low_to_s16d(input); - a1 = load_tran_low_to_s16d(input + 8); - a2 = load_tran_low_to_s16d(input + 16); - a3 = load_tran_low_to_s16d(input + 24); + a[0] = load_tran_low_to_s16d(input); + a[1] = load_tran_low_to_s16d(input + 8); + a[2] = load_tran_low_to_s16d(input + 16); + a[3] = load_tran_low_to_s16d(input + 24); - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, - &a5, &a6, &a7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6, - a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride); + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b); + idct8x8_add8x8_neon(b, dest, stride); } diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index 6ed02af5a..c4d3b4711 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -78,6 +78,28 @@ static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0, //------------------------------------------------------------------------------ +static INLINE int16x8_t dct_const_round_shift_low_8(const int32x4_t *const in) { + return vcombine_s16(vrshrn_n_s32(in[0], DCT_CONST_BITS), + vrshrn_n_s32(in[1], DCT_CONST_BITS)); +} + +static INLINE void dct_const_round_shift_low_8_dual(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t *const d1) { + *d0 = dct_const_round_shift_low_8(t32 + 0); + *d1 = dct_const_round_shift_low_8(t32 + 2); +} + +static INLINE int32x4x2_t +dct_const_round_shift_high_4x2(const int64x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[2], 
DCT_CONST_BITS), + vrshrn_n_s64(in[3], DCT_CONST_BITS)); + return out; +} + // Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, const int16_t a_const) { @@ -102,24 +124,24 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16( // input) this function can not use vaddq_s16. // In order to match existing behavior and intentionally out of range tests, // expand the addition up to 32 bits to prevent truncation. - int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } // Subtract b from a, then multiply by ab_const. Shift and narrow by // DCT_CONST_BITS. static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( const int16x8_t a, const int16x8_t b, const int16_t ab_const) { - int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } // Multiply a by a_const and b by b_const, then accumulate. 
Shift and narrow by @@ -127,12 +149,12 @@ static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( const int16x8_t a, const int16_t a_const, const int16x8_t b, const int16_t b_const) { - int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); - int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); - temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); - temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vmull_n_s16(vget_low_s16(a), a_const); + t[1] = vmull_n_s16(vget_high_s16(a), a_const); + t[0] = vmlal_n_s16(t[0], vget_low_s16(b), b_const); + t[1] = vmlal_n_s16(t[1], vget_high_s16(b), b_const); + return dct_const_round_shift_low_8(t); } //------------------------------------------------------------------------------ @@ -145,53 +167,43 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( static INLINE int32x4x2_t multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) { int64x2_t b[4]; - int32x4x2_t c; + b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); - c.val[0] = vcombine_s32(vrshrn_n_s64(b[0], DCT_CONST_BITS), - vrshrn_n_s64(b[1], DCT_CONST_BITS)); - c.val[1] = vcombine_s32(vrshrn_n_s64(b[2], DCT_CONST_BITS), - vrshrn_n_s64(b[3], DCT_CONST_BITS)); - return c; + return dct_const_round_shift_high_4x2(b); } // Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { - const int32x4_t temp_low = vaddq_s32(a.val[0], b.val[0]); - const int32x4_t temp_high = vaddq_s32(a.val[1], b.val[1]); + int32x4_t t[2]; int64x2_t c[4]; - int32x4x2_t d; - c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); - c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); - c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); - c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + + t[0] = vaddq_s32(a.val[0], b.val[0]); + t[1] = vaddq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); } // Subtract b from a, then multiply by ab_const. Shift and narrow by // DCT_CONST_BITS. 
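// Roughly, each output lane below computes the scalar expression
//   out[i] = (int32_t)(((int64_t)(a[i] - b[i]) * ab_const +
//                       (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
// (an illustrative sketch only; the rounding narrow is done by vrshrn_n_s64).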
static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { - const int32x4_t temp_low = vsubq_s32(a.val[0], b.val[0]); - const int32x4_t temp_high = vsubq_s32(a.val[1], b.val[1]); + int32x4_t t[2]; int64x2_t c[4]; - int32x4x2_t d; - c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); - c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); - c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); - c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + + t[0] = vsubq_s32(a.val[0], b.val[0]); + t[1] = vsubq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); } // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by @@ -200,7 +212,6 @@ static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b, const int32_t b_const) { int64x2_t c[4]; - int32x4x2_t d; c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); @@ -209,72 +220,66 @@ static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const); c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const); c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + return dct_const_round_shift_high_4x2(c); } // Shift the output down by 6 and add it to the destination buffer. 
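// Per pixel this amounts to
//   dst[i] = clip_pixel(dst[i] + ROUND_POWER_OF_TWO(a[i], 6));
// expressed with the library's scalar helpers: vrsraq_n_s16 fuses the
// rounding shift with the accumulate, and vqmovun_s16 supplies the final
// unsigned saturation.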
-static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, - const int16x8_t a2, const int16x8_t a3, - const int16x8_t a4, const int16x8_t a5, - const int16x8_t a6, const int16x8_t a7, - uint8_t *b, const int b_stride) { - uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; - b0 = vld1_u8(b); - b += b_stride; - b1 = vld1_u8(b); - b += b_stride; - b2 = vld1_u8(b); - b += b_stride; - b3 = vld1_u8(b); - b += b_stride; - b4 = vld1_u8(b); - b += b_stride; - b5 = vld1_u8(b); - b += b_stride; - b6 = vld1_u8(b); - b += b_stride; - b7 = vld1_u8(b); - b -= (7 * b_stride); +static INLINE void add_and_store_u8_s16(const int16x8_t *const a, uint8_t *d, + const int stride) { + uint8x8_t b[8]; + int16x8_t c[8]; + + b[0] = vld1_u8(d); + d += stride; + b[1] = vld1_u8(d); + d += stride; + b[2] = vld1_u8(d); + d += stride; + b[3] = vld1_u8(d); + d += stride; + b[4] = vld1_u8(d); + d += stride; + b[5] = vld1_u8(d); + d += stride; + b[6] = vld1_u8(d); + d += stride; + b[7] = vld1_u8(d); + d -= (7 * stride); // c = b + (a >> 6) - c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); - c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); - c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); - c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); - c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); - c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); - c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); - c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); - - b0 = vqmovun_s16(c0); - b1 = vqmovun_s16(c1); - b2 = vqmovun_s16(c2); - b3 = vqmovun_s16(c3); - b4 = vqmovun_s16(c4); - b5 = vqmovun_s16(c5); - b6 = vqmovun_s16(c6); - b7 = vqmovun_s16(c7); - - vst1_u8(b, b0); - b += b_stride; - vst1_u8(b, b1); - b += b_stride; - vst1_u8(b, b2); - b += b_stride; - vst1_u8(b, b3); - b += b_stride; - vst1_u8(b, b4); - b += b_stride; - vst1_u8(b, b5); - b += b_stride; - vst1_u8(b, b6); - b += b_stride; - vst1_u8(b, b7); + c[0] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[0])), a[0], 6); + c[1] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[1])), a[1], 6); + c[2] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[2])), a[2], 6); + c[3] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[3])), a[3], 6); + c[4] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[4])), a[4], 6); + c[5] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[5])), a[5], 6); + c[6] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[6])), a[6], 6); + c[7] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[7])), a[7], 6); + + b[0] = vqmovun_s16(c[0]); + b[1] = vqmovun_s16(c[1]); + b[2] = vqmovun_s16(c[2]); + b[3] = vqmovun_s16(c[3]); + b[4] = vqmovun_s16(c[4]); + b[5] = vqmovun_s16(c[5]); + b[6] = vqmovun_s16(c[6]); + b[7] = vqmovun_s16(c[7]); + + vst1_u8(d, b[0]); + d += stride; + vst1_u8(d, b[1]); + d += stride; + vst1_u8(d, b[2]); + d += stride; + vst1_u8(d, b[3]); + d += stride; + vst1_u8(d, b[4]); + d += stride; + vst1_u8(d, b[5]); + d += stride; + vst1_u8(d, b[6]); + d += stride; + vst1_u8(d, b[7]); } static INLINE uint8x16_t create_dcq(const int16_t dc) { @@ -283,56 +288,53 @@ static INLINE uint8x16_t create_dcq(const int16_t dc) { return vdupq_n_u8((uint8_t)t); } -static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, - int16x8_t *const a0, - int16x8_t *const a1) { - int16x4_t b0, b1, b2, b3; - int32x4_t c0, c1, c2, c3; - int16x8_t d0, d1; - - transpose_s16_4x4q(a0, a1); - b0 = vget_low_s16(*a0); - b1 = 
vget_high_s16(*a0); - b2 = vget_low_s16(*a1); - b3 = vget_high_s16(*a1); - c0 = vmull_lane_s16(b0, cospis, 2); - c2 = vmull_lane_s16(b1, cospis, 2); - c1 = vsubq_s32(c0, c2); - c0 = vaddq_s32(c0, c2); - c2 = vmull_lane_s16(b2, cospis, 3); - c3 = vmull_lane_s16(b2, cospis, 1); - c2 = vmlsl_lane_s16(c2, b3, cospis, 1); - c3 = vmlal_lane_s16(c3, b3, cospis, 3); - b0 = vrshrn_n_s32(c0, DCT_CONST_BITS); - b1 = vrshrn_n_s32(c1, DCT_CONST_BITS); - b2 = vrshrn_n_s32(c2, DCT_CONST_BITS); - b3 = vrshrn_n_s32(c3, DCT_CONST_BITS); - d0 = vcombine_s16(b0, b1); - d1 = vcombine_s16(b3, b2); - *a0 = vaddq_s16(d0, d1); - *a1 = vsubq_s16(d0, d1); -} - -static INLINE void idct8x8_12_pass1_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2, - int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5, - int16x4_t *const io6, int16x4_t *const io7) { +static INLINE void idct4x4_16_kernel_bd8(int16x8_t *const a) { + const int16x4_t cospis = vld1_s16(kCospi); + int16x4_t b[4]; + int32x4_t c[4]; + int16x8_t d[2]; + + b[0] = vget_low_s16(a[0]); + b[1] = vget_high_s16(a[0]); + b[2] = vget_low_s16(a[1]); + b[3] = vget_high_s16(a[1]); + c[0] = vmull_lane_s16(b[0], cospis, 2); + c[2] = vmull_lane_s16(b[1], cospis, 2); + c[1] = vsubq_s32(c[0], c[2]); + c[0] = vaddq_s32(c[0], c[2]); + c[3] = vmull_lane_s16(b[2], cospis, 3); + c[2] = vmull_lane_s16(b[2], cospis, 1); + c[3] = vmlsl_lane_s16(c[3], b[3], cospis, 1); + c[2] = vmlal_lane_s16(c[2], b[3], cospis, 3); + dct_const_round_shift_low_8_dual(c, &d[0], &d[1]); + a[0] = vaddq_s16(d[0], d[1]); + a[1] = vsubq_s16(d[0], d[1]); +} + +static INLINE void transpose_idct4x4_16_bd8(int16x8_t *const a) { + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); +} + +static INLINE void idct8x8_12_pass1_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + int16x4_t *const io) { int16x4_t step1[8], step2[8]; int32x4_t t32[2]; - transpose_s16_4x4d(io0, io1, io2, io3); + transpose_s16_4x4d(&io[0], &io[1], &io[2], &io[3]); // stage 1 - step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3); - step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2); - step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1); - step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0); + step1[4] = vqrdmulh_lane_s16(io[1], cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(io[3], cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(io[3], cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(io[1], cospisd1, 0); // stage 2 - step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2); - step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3); - step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1); + step2[1] = vqrdmulh_lane_s16(io[0], cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(io[2], cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(io[2], cospisd0, 1); step2[4] = vadd_s16(step1[4], step1[5]); step2[5] = vsub_s16(step1[4], step1[5]); @@ -352,32 +354,27 @@ static INLINE void idct8x8_12_pass1_bd8( step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); // stage 4 - *io0 = vadd_s16(step1[0], step2[7]); - *io1 = vadd_s16(step1[1], step1[6]); - *io2 = vadd_s16(step1[2], step1[5]); - *io3 = vadd_s16(step1[3], step2[4]); - *io4 = vsub_s16(step1[3], step2[4]); - *io5 = vsub_s16(step1[2], step1[5]); - *io6 = vsub_s16(step1[1], step1[6]); - *io7 = vsub_s16(step1[0], step2[7]); -} - -static INLINE void idct8x8_12_pass2_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - const int16x4_t input0, const int16x4_t input1, const 
int16x4_t input2, - const int16x4_t input3, const int16x4_t input4, const int16x4_t input5, - const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0, - int16x8_t *const output1, int16x8_t *const output2, - int16x8_t *const output3, int16x8_t *const output4, - int16x8_t *const output5, int16x8_t *const output6, - int16x8_t *const output7) { + io[0] = vadd_s16(step1[0], step2[7]); + io[1] = vadd_s16(step1[1], step1[6]); + io[2] = vadd_s16(step1[2], step1[5]); + io[3] = vadd_s16(step1[3], step2[4]); + io[4] = vsub_s16(step1[3], step2[4]); + io[5] = vsub_s16(step1[2], step1[5]); + io[6] = vsub_s16(step1[1], step1[6]); + io[7] = vsub_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + const int16x4_t *const input, + int16x8_t *const output) { int16x8_t in[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6, - input7, &in[0], &in[1], &in[2], &in[3]); + transpose_s16_4x8(input[0], input[1], input[2], input[3], input[4], input[5], + input[6], input[7], &in[0], &in[1], &in[2], &in[3]); // stage 1 step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); @@ -407,86 +404,64 @@ static INLINE void idct8x8_12_pass2_bd8( t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *output0 = vaddq_s16(step1[0], step2[7]); - *output1 = vaddq_s16(step1[1], step1[6]); - *output2 = vaddq_s16(step1[2], step1[5]); - *output3 = vaddq_s16(step1[3], step2[4]); - *output4 = vsubq_s16(step1[3], step2[4]); - *output5 = vsubq_s16(step1[2], step1[5]); - *output6 = vsubq_s16(step1[1], step1[6]); - *output7 = vsubq_s16(step1[0], step2[7]); -} - -static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, - const int16x4_t cospis1, - int16x8_t *const io0, int16x8_t *const io1, - int16x8_t *const io2, int16x8_t *const io3, - int16x8_t *const io4, int16x8_t *const io5, - int16x8_t *const io6, - int16x8_t *const io7) { - int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, - input_7l, input_7h; + output[0] = vaddq_s16(step1[0], step2[7]); + output[1] = vaddq_s16(step1[1], step1[6]); + output[2] = vaddq_s16(step1[2], step1[5]); + output[3] = vaddq_s16(step1[3], step2[4]); + output[4] = vsubq_s16(step1[3], step2[4]); + output[5] = vsubq_s16(step1[2], step1[5]); + output[6] = vsubq_s16(step1[1], step1[6]); + output[7] = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; int16x4_t step1l[4], step1h[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - - transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7); // stage 1 - input_1l = vget_low_s16(*io1); - input_1h = vget_high_s16(*io1); - input_3l = vget_low_s16(*io3); - input_3h = vget_high_s16(*io3); - input_5l = vget_low_s16(*io5); - input_5h = 
vget_high_s16(*io5); - input_7l = vget_low_s16(*io7); - input_7h = vget_high_s16(*io7); - step1l[0] = vget_low_s16(*io0); - step1h[0] = vget_high_s16(*io0); - step1l[1] = vget_low_s16(*io2); - step1h[1] = vget_high_s16(*io2); - step1l[2] = vget_low_s16(*io4); - step1h[2] = vget_high_s16(*io4); - step1l[3] = vget_low_s16(*io6); - step1h[3] = vget_high_s16(*io6); - - t32[0] = vmull_lane_s16(input_1l, cospis1, 3); - t32[1] = vmull_lane_s16(input_1h, cospis1, 3); - t32[2] = vmull_lane_s16(input_3l, cospis1, 2); - t32[3] = vmull_lane_s16(input_3h, cospis1, 2); - t32[4] = vmull_lane_s16(input_3l, cospis1, 1); - t32[5] = vmull_lane_s16(input_3h, cospis1, 1); - t32[6] = vmull_lane_s16(input_1l, cospis1, 0); - t32[7] = vmull_lane_s16(input_1h, cospis1, 0); - t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0); - t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0); - t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1); - t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1); - t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2); - t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2); - t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); - t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); - t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); - t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); - t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); - step1[4] = vcombine_s16(t16[0], t16[1]); - step1[5] = vcombine_s16(t16[2], t16[3]); - step1[6] = vcombine_s16(t16[4], t16[5]); - step1[7] = vcombine_s16(t16[6], t16[7]); + input1l = vget_low_s16(io[1]); + input1h = vget_high_s16(io[1]); + input3l = vget_low_s16(io[3]); + input3h = vget_high_s16(io[3]); + input5l = vget_low_s16(io[5]); + input5h = vget_high_s16(io[5]); + input7l = vget_low_s16(io[7]); + input7h = vget_high_s16(io[7]); + step1l[0] = vget_low_s16(io[0]); + step1h[0] = vget_high_s16(io[0]); + step1l[1] = vget_low_s16(io[2]); + step1h[1] = vget_high_s16(io[2]); + step1l[2] = vget_low_s16(io[4]); + step1h[2] = vget_high_s16(io[4]); + step1l[3] = vget_low_s16(io[6]); + step1h[3] = vget_high_s16(io[6]); + + t32[0] = vmull_lane_s16(input1l, cospis1, 3); + t32[1] = vmull_lane_s16(input1h, cospis1, 3); + t32[2] = vmull_lane_s16(input3l, cospis1, 2); + t32[3] = vmull_lane_s16(input3h, cospis1, 2); + t32[4] = vmull_lane_s16(input3l, cospis1, 1); + t32[5] = vmull_lane_s16(input3h, cospis1, 1); + t32[6] = vmull_lane_s16(input1l, cospis1, 0); + t32[7] = vmull_lane_s16(input1h, cospis1, 0); + t32[0] = vmlsl_lane_s16(t32[0], input7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input5h, cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input7h, cospis1, 3); + dct_const_round_shift_low_8_dual(&t32[0], &step1[4], &step1[5]); + dct_const_round_shift_low_8_dual(&t32[4], &step1[6], &step1[7]); // stage 2 t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); @@ -503,18 +478,8 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); 
t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); - t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); - t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); - t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); - step2[0] = vcombine_s16(t16[0], t16[1]); - step2[1] = vcombine_s16(t16[2], t16[3]); - step2[2] = vcombine_s16(t16[4], t16[5]); - step2[3] = vcombine_s16(t16[6], t16[7]); + dct_const_round_shift_low_8_dual(&t32[0], &step2[0], &step2[1]); + dct_const_round_shift_low_8_dual(&t32[4], &step2[2], &step2[3]); step2[4] = vaddq_s16(step1[4], step1[5]); step2[5] = vsubq_s16(step1[4], step1[5]); @@ -533,35 +498,25 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *io0 = vaddq_s16(step1[0], step2[7]); - *io1 = vaddq_s16(step1[1], step1[6]); - *io2 = vaddq_s16(step1[2], step1[5]); - *io3 = vaddq_s16(step1[3], step2[4]); - *io4 = vsubq_s16(step1[3], step2[4]); - *io5 = vsubq_s16(step1[2], step1[5]); - *io6 = vsubq_s16(step1[1], step1[6]); - *io7 = vsubq_s16(step1[0], step2[7]); + io[0] = vaddq_s16(step1[0], step2[7]); + io[1] = vaddq_s16(step1[1], step1[6]); + io[2] = vaddq_s16(step1[2], step1[5]); + io[3] = vaddq_s16(step1[3], step2[4]); + io[4] = vsubq_s16(step1[3], step2[4]); + io[5] = vsubq_s16(step1[2], step1[5]); + io[6] = vsubq_s16(step1[1], step1[6]); + io[7] = vsubq_s16(step1[0], step2[7]); } -static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32, - int16x8_t *const d0, - int16x8_t *const d1) { - int16x4_t t16[4]; - - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - *d0 = vcombine_s16(t16[0], t16[1]); - *d1 = vcombine_s16(t16[2], t16[3]); +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6], + &io[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io); } static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, @@ -584,7 +539,7 @@ static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, int32x4_t t32[4]; idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, @@ -596,7 +551,7 @@ static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); t32[2] = vnegq_s32(t32[2]); t32[3] = vnegq_s32(t32[3]); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void 
idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, @@ -611,7 +566,7 @@ static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, @@ -627,7 +582,7 @@ static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, @@ -643,7 +598,7 @@ static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, @@ -659,7 +614,7 @@ static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, @@ -675,7 +630,7 @@ static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, @@ -691,7 +646,7 @@ static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, @@ -707,7 +662,7 @@ static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct16x16_add_stage7(const int16x8_t *const step2, @@ -786,73 +741,94 @@ static INLINE void idct16x16_store_pass1(const int16x8_t *const out, 
vst1q_s16(output, out[15]); } -static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest, - const int stride) { - uint8x8_t d = vld1_u8(*dest); - uint16x8_t q; - - res = vrshrq_n_s16(res, 6); - q = vaddw_u8(vreinterpretq_u16_s16(res), d); - d = vqmovun_s16(vreinterpretq_s16_u16(q)); +static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 5); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); vst1_u8(*dest, d); *dest += stride; } -static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max, - uint16_t **dest, const int stride) { - uint16x8_t d = vld1q_u16(*dest); +static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest, + const int stride) { + idct8x8_add8x1(out[0], &dest, stride); + idct8x8_add8x1(out[1], &dest, stride); + idct8x8_add8x1(out[2], &dest, stride); + idct8x8_add8x1(out[3], &dest, stride); + idct8x8_add8x1(out[4], &dest, stride); + idct8x8_add8x1(out[5], &dest, stride); + idct8x8_add8x1(out[6], &dest, stride); + idct8x8_add8x1(out[7], &dest, stride); +} - res = vqaddq_s16(res, vreinterpretq_s16_u16(d)); - res = vminq_s16(res, max); - d = vqshluq_n_s16(res, 0); +static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 6); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void highbd_idct16x16_add8x1(const int16x8_t a, + const int16x8_t max, + uint16_t **const dest, + const int stride) { + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res0 = vqaddq_s16(a, vreinterpretq_s16_u16(s)); + const int16x8_t res1 = vminq_s16(res0, max); + const uint16x8_t d = vqshluq_n_s16(res1, 0); vst1q_u16(*dest, d); *dest += stride; } -static INLINE void highbd_idct16x16_add8x1_bd8(int16x8_t res, uint16_t **dest, +static INLINE void highbd_idct16x16_add8x1_bd8(const int16x8_t a, + uint16_t **const dest, const int stride) { - uint16x8_t d = vld1q_u16(*dest); - - res = vrsraq_n_s16(vreinterpretq_s16_u16(d), res, 6); - d = vmovl_u8(vqmovun_s16(res)); + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), a, 6); + const uint16x8_t d = vmovl_u8(vqmovun_s16(res)); vst1q_u16(*dest, d); *dest += stride; } static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a, - uint16_t *out, const int b_stride) { - highbd_idct16x16_add8x1_bd8(a[0], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[1], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[2], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[3], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[4], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[5], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[6], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[7], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[8], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[9], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[10], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[11], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[12], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[13], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[14], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[15], &out, b_stride); - 
highbd_idct16x16_add8x1_bd8(a[16], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[17], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[18], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[19], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[20], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[21], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[22], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[23], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[24], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[25], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[26], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[27], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[28], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[29], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[30], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[31], &out, b_stride); + uint16_t *out, const int stride) { + highbd_idct16x16_add8x1_bd8(a[0], &out, stride); + highbd_idct16x16_add8x1_bd8(a[1], &out, stride); + highbd_idct16x16_add8x1_bd8(a[2], &out, stride); + highbd_idct16x16_add8x1_bd8(a[3], &out, stride); + highbd_idct16x16_add8x1_bd8(a[4], &out, stride); + highbd_idct16x16_add8x1_bd8(a[5], &out, stride); + highbd_idct16x16_add8x1_bd8(a[6], &out, stride); + highbd_idct16x16_add8x1_bd8(a[7], &out, stride); + highbd_idct16x16_add8x1_bd8(a[8], &out, stride); + highbd_idct16x16_add8x1_bd8(a[9], &out, stride); + highbd_idct16x16_add8x1_bd8(a[10], &out, stride); + highbd_idct16x16_add8x1_bd8(a[11], &out, stride); + highbd_idct16x16_add8x1_bd8(a[12], &out, stride); + highbd_idct16x16_add8x1_bd8(a[13], &out, stride); + highbd_idct16x16_add8x1_bd8(a[14], &out, stride); + highbd_idct16x16_add8x1_bd8(a[15], &out, stride); + highbd_idct16x16_add8x1_bd8(a[16], &out, stride); + highbd_idct16x16_add8x1_bd8(a[17], &out, stride); + highbd_idct16x16_add8x1_bd8(a[18], &out, stride); + highbd_idct16x16_add8x1_bd8(a[19], &out, stride); + highbd_idct16x16_add8x1_bd8(a[20], &out, stride); + highbd_idct16x16_add8x1_bd8(a[21], &out, stride); + highbd_idct16x16_add8x1_bd8(a[22], &out, stride); + highbd_idct16x16_add8x1_bd8(a[23], &out, stride); + highbd_idct16x16_add8x1_bd8(a[24], &out, stride); + highbd_idct16x16_add8x1_bd8(a[25], &out, stride); + highbd_idct16x16_add8x1_bd8(a[26], &out, stride); + highbd_idct16x16_add8x1_bd8(a[27], &out, stride); + highbd_idct16x16_add8x1_bd8(a[28], &out, stride); + highbd_idct16x16_add8x1_bd8(a[29], &out, stride); + highbd_idct16x16_add8x1_bd8(a[30], &out, stride); + highbd_idct16x16_add8x1_bd8(a[31], &out, stride); } static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index fb1fa6b68..38e275834 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -667,8 +667,6 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d135_store_32x2(&dst, stride, row_0, row_1, row_2); } -// ----------------------------------------------------------------------------- - #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 4efad5333..ea0962954 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -19,6 +19,13 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + return vcreate_s16((uint16_t)c0 | ((uint16_t)c1 << 16) | + ((int64_t)(uint16_t)c2 
<< 32) | + ((int64_t)(uint16_t)c3 << 48)); +} + // Helper functions used to load tran_low_t into int16, narrowing if necessary. static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/bitreader.h b/vpx_dsp/bitreader.h index 6ee2a5863..16272ae3a 100644 --- a/vpx_dsp/bitreader.h +++ b/vpx_dsp/bitreader.h @@ -94,7 +94,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { } { - register int shift = vpx_norm[range]; + const int shift = vpx_norm[range]; range <<= shift; value <<= shift; count -= shift; diff --git a/vpx_dsp/bitwriter.h b/vpx_dsp/bitwriter.h index 41040cf93..e63092a1a 100644 --- a/vpx_dsp/bitwriter.h +++ b/vpx_dsp/bitwriter.h @@ -35,7 +35,7 @@ static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { int count = br->count; unsigned int range = br->range; unsigned int lowvalue = br->lowvalue; - register int shift; + int shift; split = 1 + (((range - 1) * probability) >> 8); diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h index 13137659f..e86fd3996 100644 --- a/vpx_dsp/inv_txfm.h +++ b/vpx_dsp/inv_txfm.h @@ -76,7 +76,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { // bd of 10 uses trans_low with 18bits, need to remove 14bits // bd of 12 uses trans_low with 20bits, need to remove 12bits // bd of x uses trans_low with 8+x bits, need to remove 24-x bits - #define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) #if CONFIG_VP9_HIGHBITDEPTH #define HIGHBD_WRAPLOW(x, bd) \ diff --git a/vpx_dsp/mips/deblock_msa.c b/vpx_dsp/mips/deblock_msa.c index aafa272fb..9ef04836a 100644 --- a/vpx_dsp/mips/deblock_msa.c +++ b/vpx_dsp/mips/deblock_msa.c @@ -14,38 +14,37 @@ extern const int16_t vpx_rv[]; -#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \ - out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, \ - out15) \ - { \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ - ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out8, out10); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out12, out14); \ - out0 = (v16u8)temp6; \ - out2 = (v16u8)temp7; \ - out4 = (v16u8)temp8; \ - out6 = (v16u8)temp9; \ - out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ - out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ - out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ - out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ - out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ +#define VPX_TRANSPOSE8x16_UB_UB( \ + in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, 
\ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ + ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out8, out10); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out12, out14); \ + out0 = (v16u8)temp6; \ + out2 = (v16u8)temp7; \ + out4 = (v16u8)temp8; \ + out6 = (v16u8)temp9; \ + out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ + out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ + out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ + out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ + out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ + out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ } #define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \ diff --git a/vpx_dsp/ppc/inv_txfm_vsx.c b/vpx_dsp/ppc/inv_txfm_vsx.c index d43a9fd18..f095cb0a4 100644 --- a/vpx_dsp/ppc/inv_txfm_vsx.c +++ b/vpx_dsp/ppc/inv_txfm_vsx.c @@ -109,6 +109,7 @@ static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; int32x4_t temp1, temp2, temp3, temp4; int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, @@ -152,8 +153,8 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, output_v = vec_packsu(tmp16_0, tmp16_1); vec_vsx_st(output_v, 0, tmp_dest); - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; } #define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 3b1a873cd..a4a6fa084 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -242,6 +242,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct_neon.h DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c diff --git a/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/vpx_dsp/x86/fwd_txfm_impl_sse2.h index f9abaecf2..fd28d0d55 100644 --- a/vpx_dsp/x86/fwd_txfm_impl_sse2.h +++ b/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -778,6 +778,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { return; } #endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us // into 32 bits. { @@ -834,6 +835,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { return; } #endif // DCT_HIGH_BIT_DEPTH + // Interleave to do the multiply by constants which gets us // into 32 bits. 
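// (The interleaved halves feed _mm_madd_epi16, which multiplies the 16-bit
// coefficients by paired cosine constants and sums adjacent products into
// 32-bit lanes ahead of the DCT_CONST_BITS rounding shift.)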
{ diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index 7e75d5d10..ef94522a3 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -192,8 +192,6 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, // ----------------------------------------------------------------------------- // Horizontal and vertical filtering -#define CONV8_ROUNDING_BITS (7) - static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; @@ -210,6 +208,8 @@ static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; +#define CONV8_ROUNDING_BITS (7) + // ----------------------------------------------------------------------------- // Horizontal Filtering diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c index de097c66a..7898ee12c 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -53,7 +53,7 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, out[15] = in[15]; } -static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) { __m128i step1[16], step2[16]; // stage 2 @@ -233,7 +233,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, in = all[i]; highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); - highbd_idct16_4col(in); + vpx_highbd_idct16_4col_sse4_1(in); input += 4 * 16; } @@ -243,7 +243,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, transpose_32bit_4x4(all[1] + i, out + 4); transpose_32bit_4x4(all[2] + i, out + 8); transpose_32bit_4x4(all[3] + i, out + 12); - highbd_idct16_4col(out); + vpx_highbd_idct16_4col_sse4_1(out); for (j = 0; j < 16; ++j) { highbd_write_buffer_4(dest + j * stride, out[j], bd); diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index 909a6b794..bb7a510e1 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -124,8 +124,8 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c index ae391b2c0..8b2e3d241 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse4.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -17,7 +17,7 @@ #include "vpx_dsp/x86/inv_txfm_ssse3.h" #include "vpx_dsp/x86/transpose_sse2.h" -static void highbd_idct8x8_half1d(__m128i *const io) { +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) { __m128i step1[8], step2[8]; transpose_32bit_4x4x2(io, io); @@ -126,13 +126,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = 
_mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); @@ -142,7 +142,7 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); temp[0] = io[4]; temp[1] = io[5]; @@ -152,13 +152,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[5] = io[9]; io[6] = io[10]; io[7] = io[11]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = temp[0]; io[9] = temp[1]; io[10] = temp[2]; io[11] = temp[3]; - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); highbd_idct8x8_final_round(io); } diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/vpx_dsp/x86/highbd_inv_txfm_sse4.h index 435934f1b..5a7fd1d39 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse4.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -106,4 +106,7 @@ static INLINE void highbd_idct4_sse4_1(__m128i *const io) { io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] } +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io); +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/); + #endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index d9a6932e0..e1f9657df 100644 --- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -91,7 +91,7 @@ SECTION .text %define filter_idx_shift 5 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, \ @@ -99,19 +99,20 @@ SECTION .text sec, sec_stride, height, sse %define sec_str sec_strideq %else - cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height dword heightm %define sec_str sec_stridemp @@ -130,8 +131,9 @@ SECTION .text LOAD_IF_USED 0, 1 ; load eax, ecx back %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, \ - sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + dst, dst_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height heightd ; Store bilin_filter and pw_8 location in stack @@ -150,22 +152,16 @@ SECTION .text %endif %else %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, 
y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse %define block_height dword heightm %define sec_str sec_stridemp - %endif %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, sse + x_offset, y_offset, \ + dst, dst_stride, height, sse %define block_height heightd %endif @@ -284,14 +280,14 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -308,7 +304,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -511,14 +507,14 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -535,7 +531,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -633,14 +629,14 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -657,7 +653,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -732,14 +728,14 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -756,7 +752,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -859,8 +855,8 @@ SECTION .text .x_nonhalf_y_nonhalf: ; loading filter - this is same as in 8-bit depth -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 shl y_offsetd, filter_idx_shift @@ -869,7 +865,7 @@ SECTION .text mova m9, [bilin_filter+x_offsetq+16] mova m10, [bilin_filter+y_offsetq] mova m11, [bilin_filter+y_offsetq+16] - mova m12, [pw_8] + mova m12, 
[GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -897,7 +893,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif ; end of load filter diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index 6b1837df5..4b02da966 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -165,7 +165,7 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, // 2-D for (i = 0; i < 2; i++) { - idct8_sse2(in); + vpx_idct8_sse2(in); } write_buffer_8x8(in, dest, stride); @@ -221,7 +221,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, recon_and_store_8_dual(dest, dc_value, stride); } -void idct8_sse2(__m128i *const in) { +void vpx_idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() transpose_16bit_8x8(in, in); @@ -514,7 +514,7 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, } } -static void iadst16_8col(__m128i *const in) { +void vpx_iadst16_8col_sse2(__m128i *const in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -874,8 +874,8 @@ void idct16_sse2(__m128i *const in0, __m128i *const in1) { void iadst16_sse2(__m128i *const in0, __m128i *const in1) { transpose_16bit_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); + vpx_iadst16_8col_sse2(in0); + vpx_iadst16_8col_sse2(in1); } // Group the coefficient calculation into smaller functions to prevent stack diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h index 5cd5098f1..d573f66c9 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.h +++ b/vpx_dsp/x86/inv_txfm_sse2.h @@ -697,10 +697,11 @@ static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( } void idct4_sse2(__m128i *const in); -void idct8_sse2(__m128i *const in); +void vpx_idct8_sse2(__m128i *const in); void idct16_sse2(__m128i *const in0, __m128i *const in1); void iadst4_sse2(__m128i *const in); void iadst8_sse2(__m128i *const in); +void vpx_iadst16_8col_sse2(__m128i *const in); void iadst16_sse2(__m128i *const in0, __m128i *const in1); void idct32_1024_8x32(const __m128i *const in, __m128i *const out); void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); diff --git a/vpx_dsp/x86/quantize_x86.h b/vpx_dsp/x86/quantize_x86.h index 34928fbb5..0e07a2ac5 100644 --- a/vpx_dsp/x86/quantize_x86.h +++ b/vpx_dsp/x86/quantize_x86.h @@ -12,7 +12,6 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, diff --git a/vpx_dsp/x86/subpel_variance_sse2.asm b/vpx_dsp/x86/subpel_variance_sse2.asm index cee4468c1..d938c1da4 100644 --- a/vpx_dsp/x86/subpel_variance_sse2.asm +++ b/vpx_dsp/x86/subpel_variance_sse2.asm @@ -114,27 +114,26 @@ SECTION .text ; 11, not 13, if the registers are ordered correctly. 
May make a minor speed ; difference on Win64 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse %define sec_str sec_strideq %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse, \ + g_bilin_filter, g_pw_8 %define block_height dword heightm %define sec_str sec_stridemp @@ -152,9 +151,9 @@ SECTION .text LOAD_IF_USED 0, 1 ; load eax, ecx back %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse, g_bilin_filter, g_pw_8 %define block_height heightd ;Store bilin_filter and pw_8 location in stack @@ -173,25 +172,18 @@ SECTION .text %endif %else %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse %define block_height dword heightm %define sec_str sec_stridemp - %endif %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse %define block_height heightd %endif - %define bilin_filter bilin_filter_m %endif %endif @@ -371,8 +363,8 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -380,7 +372,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -397,7 +389,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -694,8 +686,8 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -703,7 +695,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] 
%endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -720,7 +712,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -852,8 +844,8 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -861,7 +853,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -878,7 +870,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -994,8 +986,8 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -1003,7 +995,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -1020,7 +1012,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1192,8 +1184,8 @@ SECTION .text STORE_AND_RET %1 .x_nonhalf_y_nonhalf: -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift shl y_offsetd, filter_idx_shift @@ -1206,7 +1198,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m11, [bilin_filter+y_offsetq+16] %endif - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -1234,7 +1226,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif diff --git a/vpx_ports/mem_ops.h b/vpx_ports/mem_ops.h index 343f27577..b0608591f 100644 --- a/vpx_ports/mem_ops.h +++ b/vpx_ports/mem_ops.h @@ -224,5 +224,4 @@ static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { mem[3] = (MAU_T)((val >> 24) & 0xff); } /* clang-format on */ - #endif // VPX_PORTS_MEM_OPS_H_ diff --git a/vpx_util/vpx_atomics.h b/vpx_util/vpx_atomics.h index b8cf80dae..504691065 100644 --- a/vpx_util/vpx_atomics.h +++ b/vpx_util/vpx_atomics.h @@ -68,7 +68,9 @@ extern "C" { // on any platform (to discourage programmer errors by setting values directly). // This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT // (NOT memset) and accessed through vpx_atomic_ functions. 
-typedef struct vpx_atomic_int { volatile int value; } vpx_atomic_int; +typedef struct vpx_atomic_int { + volatile int value; +} vpx_atomic_int; #define VPX_ATOMIC_INIT(num) \ { num } diff --git a/vpxenc.c b/vpxenc.c --- a/vpxenc.c +++ b/vpxenc.c @@ -2215,9 +2215,9 @@ int main(int argc, const char **argv_) { if (!global.quiet) { FOREACH_STREAM(fprintf( - stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 - "b/f %7" PRId64 "b/s" - " %7" PRId64 " %s (%.2f fps)\033[K\n", + stderr, + "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 "b/f %7" PRId64 + "b/s %7" PRId64 " %s (%.2f fps)\033[K\n", pass + 1, global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes, seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0, diff --git a/y4minput.c b/y4minput.c index e2df7a299..bf349c059 100644 --- a/y4minput.c +++ b/y4minput.c @@ -139,7 +139,6 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { Conversions which require both horizontal and vertical filtering could have these steps pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ - #define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) #define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) #define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c)))
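
For reference, the WRAPLOW idiom touched in the vpx_dsp/inv_txfm.h hunk above pairs a 16-bit left shift with an arithmetic right shift, which keeps the low 16 bits of a 32-bit transform intermediate and sign-extends bit 15. The standalone sketch below is not part of the patch: wrap_low() and its test values are illustrative only, and the in-tree macro additionally routes its argument through check_range(). Per the comment in that hunk, the HIGHBD_WRAPLOW variant instead shifts by 24 - bd so a bd-bit build keeps bd + 8 bits rather than 16.

#include <assert.h>
#include <stdint.h>

/* Illustrative stand-in for WRAPLOW(x): wrap a 32-bit intermediate into the
 * int16_t range by keeping the low 16 bits and sign-extending bit 15. The
 * left shift here is done on an unsigned type to sidestep signed-overflow
 * undefined behavior; the in-tree macro shifts the signed value directly. */
static int32_t wrap_low(int32_t x) {
  return (int32_t)((uint32_t)x << 16) >> 16;
}

int main(void) {
  assert(wrap_low(32767) == 32767);   /* already fits in 16 bits */
  assert(wrap_low(32768) == -32768);  /* wraps exactly like int16_t overflow */
  assert(wrap_low(-32769) == 32767);
  assert(wrap_low(40000) == 40000 - 65536);
  return 0;
}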