diff options
Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/vp9_blockd.h | 19 | ||||
-rw-r--r-- | vp9/common/vp9_convolve.c | 21 | ||||
-rw-r--r-- | vp9/common/vp9_debugmodes.c | 4 | ||||
-rw-r--r-- | vp9/common/vp9_entropy.c | 619 | ||||
-rw-r--r-- | vp9/common/vp9_findnearmv.h | 12 | ||||
-rw-r--r-- | vp9/common/vp9_idct.h | 8 | ||||
-rw-r--r-- | vp9/common/vp9_idctllm.c | 889 | ||||
-rw-r--r-- | vp9/common/vp9_invtrans.c | 9 | ||||
-rw-r--r-- | vp9/common/vp9_onyxc_int.h | 2 | ||||
-rw-r--r-- | vp9/common/vp9_reconinter.c | 160 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 7 | ||||
-rw-r--r-- | vp9/common/vp9_tile_common.c | 43 | ||||
-rw-r--r-- | vp9/common/vp9_tile_common.h | 25 | ||||
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 39 | ||||
-rw-r--r-- | vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 239 |
15 files changed, 558 insertions, 1538 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index b0c1bfa08..82678d6b6 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -47,18 +47,6 @@ void vpx_log(const char *format, ...); #define MAX_MV_REFS 9 #define MAX_MV_REF_CANDIDATES 4 -#if CONFIG_DWTDCTHYBRID -#define DWT_MAX_LENGTH 64 -#define DWT_TYPE 26 // 26/53/97 -#define DWT_PRECISION_BITS 2 -#define DWT_PRECISION_RND ((1 << DWT_PRECISION_BITS) / 2) - -#define DWTDCT16X16 0 -#define DWTDCT16X16_LEAN 1 -#define DWTDCT8X8 2 -#define DWTDCT_TYPE DWTDCT16X16_LEAN -#endif - typedef struct { int r, c; } POS; @@ -218,10 +206,7 @@ union b_mode_info { B_PREDICTION_MODE context; #endif } as_mode; - struct { - int_mv first; - int_mv second; - } as_mv; + int_mv as_mv[2]; // first, second inter predictor motion vectors }; typedef enum { @@ -425,7 +410,7 @@ typedef struct macroblockd { #define ACTIVE_HT8 300 -#define ACTIVE_HT16 300 +#define ACTIVE_HT16 0 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index f21f1d84e..b87c410df 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -7,12 +7,15 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include "vp9/common/vp9_convolve.h" + #include <assert.h> #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" #define VP9_FILTER_WEIGHT 128 #define VP9_FILTER_SHIFT 7 @@ -293,9 +296,21 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - convolve_avg_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h, 8); + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 16 * 16); + assert(w <= 16); + assert(h <= 16); + + vp9_convolve8(src, src_stride, + temp, 16, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + vp9_convolve_avg(temp, 16, + dst, dst_stride, + NULL, 0, /* These unused parameter should be removed! */ + NULL, 0, /* These unused parameter should be removed! */ + w, h); } void vp9_convolve_copy(const uint8_t *src, int src_stride, diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c index 5ea7736b7..1953d60c6 100644 --- a/vp9/common/vp9_debugmodes.c +++ b/vp9/common/vp9_debugmodes.c @@ -129,8 +129,8 @@ void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, mb_index = (b_row >> 2) * (cols + 1) + (b_col >> 2); bindex = (b_row & 3) * 4 + (b_col & 3); fprintf(mvs, "%3d:%-3d ", - mi[mb_index].bmi[bindex].as_mv.first.as_mv.row, - mi[mb_index].bmi[bindex].as_mv.first.as_mv.col); + mi[mb_index].bmi[bindex].as_mv[0].as_mv.row, + mi[mb_index].bmi[bindex].as_mv[0].as_mv.col); } diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index 352e17c0c..03f89ac87 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -143,624 +143,6 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_16x16[256]) = { 237, 252, 253, 238, 223, 239, 254, 255, }; -#if CONFIG_DWTDCTHYBRID - -#if DWTDCT_TYPE == DWTDCT16X16_LEAN -DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { - 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6, - 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, -}; - -DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { - 0, 1, 32, 64, 33, 2, 3, 34, - 65, 96, 128, 97, 66, 35, 4, 5, - 36, 67, 98, 129, 160, 192, 161, 130, - 99, 68, 37, 6, 7, 38, 69, 100, - 131, 162, 193, 224, 256, 225, 194, 163, - 132, 101, 70, 39, 8, 9, 40, 71, - 102, 133, 164, 195, 226, 257, 288, 320, - 289, 258, 227, 196, 165, 134, 103, 72, - 41, 10, 11, 42, 73, 104, 135, 166, - 197, 228, 259, 290, 321, 352, 384, 353, - 322, 291, 260, 229, 198, 167, 136, 105, - 74, 43, 12, 13, 44, 75, 106, 137, - 168, 199, 230, 261, 292, 323, 354, 385, - 416, 448, 417, 386, 355, 324, 293, 262, - 231, 200, 169, 138, 107, 76, 45, 14, - 15, 46, 77, 108, 139, 170, 201, 232, - 263, 294, 325, 356, 387, 418, 449, 480, - 481, 450, 419, 388, 357, 326, 295, 264, - 233, 202, 171, 140, 109, 78, 47, 79, - 110, 141, 172, 203, 234, 265, 296, 327, - 358, 389, 420, 451, 482, 483, 452, 421, - 390, 359, 328, 297, 266, 235, 204, 173, - 142, 111, 143, 174, 205, 236, 267, 298, - 329, 360, 391, 422, 453, 484, 485, 454, - 423, 392, 361, 330, 299, 268, 237, 206, - 175, 207, 238, 269, 300, 331, 362, 393, - 424, 455, 486, 487, 456, 425, 394, 363, - 332, 301, 270, 239, 271, 302, 333, 364, - 395, 426, 457, 488, 489, 458, 427, 396, - 365, 334, 303, 335, 366, 397, 428, 459, - 490, 491, 460, 429, 398, 367, 399, 430, - 461, 492, 493, 462, 431, 463, 494, 495, - - 16, 512, 528, 17, 513, 529, 48, 544, - 560, 80, 576, 592, 49, 545, 561, 18, - 514, 530, 19, 515, 531, 50, 546, 562, - 81, 577, 593, 112, 608, 624, 144, 640, - 656, 113, 609, 625, 82, 578, 594, 51, - 547, 563, 20, 516, 532, 21, 517, 533, - 52, 548, 564, 83, 579, 595, 114, 610, - 626, 145, 641, 657, 176, 672, 688, 208, - 704, 720, 177, 673, 689, 146, 642, 658, - 115, 611, 627, 84, 580, 596, 53, 549, - 565, 22, 518, 534, 23, 519, 535, 54, - 550, 566, 85, 581, 597, 116, 612, 628, - 147, 643, 659, 178, 674, 690, 209, 705, - 721, 240, 736, 752, 272, 768, 784, 241, - 737, 753, 210, 706, 722, 179, 675, 691, - 148, 644, 660, 117, 613, 629, 86, 582, - 598, 55, 551, 567, 24, 520, 536, 25, - 521, 537, 56, 552, 568, 87, 583, 599, - 118, 614, 630, 149, 645, 661, 180, 676, - 692, 211, 707, 723, 242, 738, 754, 273, - 769, 785, 304, 800, 816, 336, 832, 848, - 305, 801, 817, 274, 770, 786, 243, 739, - 755, 212, 708, 724, 181, 677, 693, 150, - 646, 662, 119, 615, 631, 88, 584, 600, - 57, 553, 569, 26, 522, 538, 27, 523, - 539, 58, 554, 570, 89, 585, 601, 120, - 616, 632, 151, 647, 663, 182, 678, 694, - 213, 709, 725, 244, 740, 756, 275, 771, - 787, 306, 802, 818, 337, 833, 849, 368, - 864, 880, 400, 896, 912, 369, 865, 881, - 338, 834, 850, 307, 803, 819, 276, 772, - 788, 245, 741, 757, 214, 710, 726, 183, - - 679, 695, 152, 648, 664, 121, 617, 633, - 90, 586, 602, 59, 555, 571, 28, 524, - 540, 29, 525, 541, 60, 556, 572, 91, - 587, 603, 122, 618, 634, 153, 649, 665, - 184, 680, 696, 215, 711, 727, 246, 742, - 758, 277, 773, 789, 308, 804, 820, 339, - 835, 851, 370, 866, 882, 401, 897, 913, - 432, 928, 944, 464, 960, 976, 433, 929, - 945, 402, 898, 914, 371, 867, 883, 340, - 836, 852, 309, 805, 821, 278, 774, 790, - 247, 743, 759, 216, 712, 728, 185, 681, - 697, 154, 650, 666, 123, 619, 635, 92, - 588, 604, 61, 557, 573, 30, 526, 542, - 31, 527, 543, 62, 558, 574, 93, 589, - 605, 124, 620, 636, 155, 651, 667, 186, - 682, 698, 217, 713, 729, 248, 744, 760, - 279, 775, 791, 310, 806, 822, 341, 837, - 853, 372, 868, 884, 403, 899, 915, 434, - 930, 946, 465, 961, 977, 496, 992, 1008, - 497, 993, 1009, 466, 962, 978, 435, 931, - 947, 404, 900, 916, 373, 869, 885, 342, - 838, 854, 311, 807, 823, 280, 776, 792, - 249, 745, 761, 218, 714, 730, 187, 683, - 699, 156, 652, 668, 125, 621, 637, 94, - 590, 606, 63, 559, 575, 95, 591, 607, - 126, 622, 638, 157, 653, 669, 188, 684, - 700, 219, 715, 731, 250, 746, 762, 281, - 777, 793, 312, 808, 824, 343, 839, 855, - 374, 870, 886, 405, 901, 917, 436, 932, - 948, 467, 963, 979, 498, 994, 1010, 499, - 995, 1011, 468, 964, 980, 437, 933, 949, - 406, 902, 918, 375, 871, 887, 344, 840, - - 856, 313, 809, 825, 282, 778, 794, 251, - 747, 763, 220, 716, 732, 189, 685, 701, - 158, 654, 670, 127, 623, 639, 159, 655, - 671, 190, 686, 702, 221, 717, 733, 252, - 748, 764, 283, 779, 795, 314, 810, 826, - 345, 841, 857, 376, 872, 888, 407, 903, - 919, 438, 934, 950, 469, 965, 981, 500, - 996, 1012, 501, 997, 1013, 470, 966, 982, - 439, 935, 951, 408, 904, 920, 377, 873, - 889, 346, 842, 858, 315, 811, 827, 284, - 780, 796, 253, 749, 765, 222, 718, 734, - 191, 687, 703, 223, 719, 735, 254, 750, - 766, 285, 781, 797, 316, 812, 828, 347, - 843, 859, 378, 874, 890, 409, 905, 921, - 440, 936, 952, 471, 967, 983, 502, 998, - 1014, 503, 999, 1015, 472, 968, 984, 441, - 937, 953, 410, 906, 922, 379, 875, 891, - 348, 844, 860, 317, 813, 829, 286, 782, - 798, 255, 751, 767, 287, 783, 799, 318, - 814, 830, 349, 845, 861, 380, 876, 892, - 411, 907, 923, 442, 938, 954, 473, 969, - 985, 504, 1000, 1016, 505, 1001, 1017, 474, - 970, 986, 443, 939, 955, 412, 908, 924, - 381, 877, 893, 350, 846, 862, 319, 815, - 831, 351, 847, 863, 382, 878, 894, 413, - 909, 925, 444, 940, 956, 475, 971, 987, - 506, 1002, 1018, 507, 1003, 1019, 476, 972, - 988, 445, 941, 957, 414, 910, 926, 383, - 879, 895, 415, 911, 927, 446, 942, 958, - 477, 973, 989, 508, 1004, 1020, 509, 1005, - 1021, 478, 974, 990, 447, 943, 959, 479, - 975, 991, 510, 1006, 1022, 511, 1007, 1023, -}; - -#elif DWTDCT_TYPE == DWTDCT16X16 - -DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { - 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, - 6, 6, 6, - 6, - 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, -}; - -DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { - 0, 1, 32, 64, 33, 2, 3, 34, - 65, 96, 128, 97, 66, 35, 4, - 16, 512, 528, - 5, - 36, 67, 98, 129, 160, 192, 161, 130, - 99, 68, 37, 6, 7, 38, 69, 100, - 131, 162, 193, 224, 256, 225, 194, 163, - 132, 101, 70, 39, 8, 9, 40, 71, - 102, 133, 164, 195, 226, 257, 288, 320, - 289, 258, 227, 196, 165, 134, 103, 72, - 41, 10, 11, 42, 73, 104, 135, 166, - 197, 228, 259, 290, 321, 352, 384, 353, - 322, 291, 260, 229, 198, 167, 136, 105, - 74, 43, 12, 13, 44, 75, 106, 137, - 168, 199, 230, 261, 292, 323, 354, 385, - 416, 448, 417, 386, 355, 324, 293, 262, - 231, 200, 169, 138, 107, 76, 45, 14, - 15, 46, 77, 108, 139, 170, 201, 232, - 263, 294, 325, 356, 387, 418, 449, 480, - 481, 450, 419, 388, 357, 326, 295, 264, - 233, 202, 171, 140, 109, 78, 47, 79, - 110, 141, 172, 203, 234, 265, 296, 327, - 358, 389, 420, 451, 482, 483, 452, 421, - 390, 359, 328, 297, 266, 235, 204, 173, - 142, 111, 143, 174, 205, 236, 267, 298, - 329, 360, 391, 422, 453, 484, 485, 454, - 423, 392, 361, 330, 299, 268, 237, 206, - 175, 207, 238, 269, 300, 331, 362, 393, - 424, 455, 486, 487, 456, 425, 394, 363, - 332, 301, 270, 239, 271, 302, 333, 364, - 395, 426, 457, 488, 489, 458, 427, 396, - 365, 334, 303, 335, 366, 397, 428, 459, - 490, 491, 460, 429, 398, 367, 399, 430, - 461, 492, 493, 462, 431, 463, 494, 495, - - 17, 513, 529, 48, 544, - 560, 80, 576, 592, 49, 545, 561, 18, - 514, 530, 19, 515, 531, 50, 546, 562, - 81, 577, 593, 112, 608, 624, 144, 640, - 656, 113, 609, 625, 82, 578, 594, 51, - 547, 563, 20, 516, 532, 21, 517, 533, - 52, 548, 564, 83, 579, 595, 114, 610, - 626, 145, 641, 657, 176, 672, 688, 208, - 704, 720, 177, 673, 689, 146, 642, 658, - 115, 611, 627, 84, 580, 596, 53, 549, - 565, 22, 518, 534, 23, 519, 535, 54, - 550, 566, 85, 581, 597, 116, 612, 628, - 147, 643, 659, 178, 674, 690, 209, 705, - 721, 240, 736, 752, 272, 768, 784, 241, - 737, 753, 210, 706, 722, 179, 675, 691, - 148, 644, 660, 117, 613, 629, 86, 582, - 598, 55, 551, 567, 24, 520, 536, 25, - 521, 537, 56, 552, 568, 87, 583, 599, - 118, 614, 630, 149, 645, 661, 180, 676, - 692, 211, 707, 723, 242, 738, 754, 273, - 769, 785, 304, 800, 816, 336, 832, 848, - 305, 801, 817, 274, 770, 786, 243, 739, - 755, 212, 708, 724, 181, 677, 693, 150, - 646, 662, 119, 615, 631, 88, 584, 600, - 57, 553, 569, 26, 522, 538, 27, 523, - 539, 58, 554, 570, 89, 585, 601, 120, - 616, 632, 151, 647, 663, 182, 678, 694, - 213, 709, 725, 244, 740, 756, 275, 771, - 787, 306, 802, 818, 337, 833, 849, 368, - 864, 880, 400, 896, 912, 369, 865, 881, - 338, 834, 850, 307, 803, 819, 276, 772, - 788, 245, 741, 757, 214, 710, 726, 183, - - 679, 695, 152, 648, 664, 121, 617, 633, - 90, 586, 602, 59, 555, 571, 28, 524, - 540, 29, 525, 541, 60, 556, 572, 91, - 587, 603, 122, 618, 634, 153, 649, 665, - 184, 680, 696, 215, 711, 727, 246, 742, - 758, 277, 773, 789, 308, 804, 820, 339, - 835, 851, 370, 866, 882, 401, 897, 913, - 432, 928, 944, 464, 960, 976, 433, 929, - 945, 402, 898, 914, 371, 867, 883, 340, - 836, 852, 309, 805, 821, 278, 774, 790, - 247, 743, 759, 216, 712, 728, 185, 681, - 697, 154, 650, 666, 123, 619, 635, 92, - 588, 604, 61, 557, 573, 30, 526, 542, - 31, 527, 543, 62, 558, 574, 93, 589, - 605, 124, 620, 636, 155, 651, 667, 186, - 682, 698, 217, 713, 729, 248, 744, 760, - 279, 775, 791, 310, 806, 822, 341, 837, - 853, 372, 868, 884, 403, 899, 915, 434, - 930, 946, 465, 961, 977, 496, 992, 1008, - 497, 993, 1009, 466, 962, 978, 435, 931, - 947, 404, 900, 916, 373, 869, 885, 342, - 838, 854, 311, 807, 823, 280, 776, 792, - 249, 745, 761, 218, 714, 730, 187, 683, - 699, 156, 652, 668, 125, 621, 637, 94, - 590, 606, 63, 559, 575, 95, 591, 607, - 126, 622, 638, 157, 653, 669, 188, 684, - 700, 219, 715, 731, 250, 746, 762, 281, - 777, 793, 312, 808, 824, 343, 839, 855, - 374, 870, 886, 405, 901, 917, 436, 932, - 948, 467, 963, 979, 498, 994, 1010, 499, - 995, 1011, 468, 964, 980, 437, 933, 949, - 406, 902, 918, 375, 871, 887, 344, 840, - - 856, 313, 809, 825, 282, 778, 794, 251, - 747, 763, 220, 716, 732, 189, 685, 701, - 158, 654, 670, 127, 623, 639, 159, 655, - 671, 190, 686, 702, 221, 717, 733, 252, - 748, 764, 283, 779, 795, 314, 810, 826, - 345, 841, 857, 376, 872, 888, 407, 903, - 919, 438, 934, 950, 469, 965, 981, 500, - 996, 1012, 501, 997, 1013, 470, 966, 982, - 439, 935, 951, 408, 904, 920, 377, 873, - 889, 346, 842, 858, 315, 811, 827, 284, - 780, 796, 253, 749, 765, 222, 718, 734, - 191, 687, 703, 223, 719, 735, 254, 750, - 766, 285, 781, 797, 316, 812, 828, 347, - 843, 859, 378, 874, 890, 409, 905, 921, - 440, 936, 952, 471, 967, 983, 502, 998, - 1014, 503, 999, 1015, 472, 968, 984, 441, - 937, 953, 410, 906, 922, 379, 875, 891, - 348, 844, 860, 317, 813, 829, 286, 782, - 798, 255, 751, 767, 287, 783, 799, 318, - 814, 830, 349, 845, 861, 380, 876, 892, - 411, 907, 923, 442, 938, 954, 473, 969, - 985, 504, 1000, 1016, 505, 1001, 1017, 474, - 970, 986, 443, 939, 955, 412, 908, 924, - 381, 877, 893, 350, 846, 862, 319, 815, - 831, 351, 847, 863, 382, 878, 894, 413, - 909, 925, 444, 940, 956, 475, 971, 987, - 506, 1002, 1018, 507, 1003, 1019, 476, 972, - 988, 445, 941, 957, 414, 910, 926, 383, - 879, 895, 415, 911, 927, 446, 942, 958, - 477, 973, 989, 508, 1004, 1020, 509, 1005, - 1021, 478, 974, 990, 447, 943, 959, 479, - 975, 991, 510, 1006, 1022, 511, 1007, 1023, -}; - -#elif DWTDCT_TYPE == DWTDCT8X8 - -DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { - 0, 1, 2, 3, 5, 4, 4, 5, - 5, 3, 6, 3, 5, 4, 6, 6, - 6, 5, 5, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, - - 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, - 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, -}; - -DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { - 0, 1, 32, 64, 33, 2, 3, 34, - 65, 96, 128, 97, 66, 35, 4, 5, - 36, 67, 98, 129, 160, 192, 161, 130, - 99, 68, 37, 6, 7, 38, 69, 100, - 131, 162, 193, 224, 225, 194, 163, 132, - 101, 70, 39, 71, 102, 133, 164, 195, - 226, 227, 196, 165, 134, 103, 135, 166, - 197, 228, 229, 198, 167, 199, 230, 231, - - 8, 256, 264, 9, 257, 265, 40, 288, 296, 72, 320, 328, - 41, 289, 297, 10, 258, 266, 11, 259, 267, 42, 290, 298, - 73, 321, 329, 104, 352, 360, 136, 384, 392, 105, 353, 361, - 74, 322, 330, 43, 291, 299, 12, 260, 268, 13, 261, 269, - 44, 292, 300, 75, 323, 331, 106, 354, 362, 137, 385, 393, - 168, 416, 424, 200, 448, 456, 169, 417, 425, 138, 386, 394, - 107, 355, 363, 76, 324, 332, 45, 293, 301, 14, 262, 270, - 15, 263, 271, 46, 294, 302, 77, 325, 333, 108, 356, 364, - 139, 387, 395, 170, 418, 426, 201, 449, 457, 232, 480, 488, - 233, 481, 489, 202, 450, 458, 171, 419, 427, 140, 388, 396, - 109, 357, 365, 78, 326, 334, 47, 295, 303, 79, 327, 335, - 110, 358, 366, 141, 389, 397, 172, 420, 428, 203, 451, 459, - 234, 482, 490, 235, 483, 491, 204, 452, 460, 173, 421, 429, - 142, 390, 398, 111, 359, 367, 143, 391, 399, 174, 422, 430, - 205, 453, 461, 236, 484, 492, 237, 485, 493, 206, 454, 462, - 175, 423, 431, 207, 455, 463, 238, 486, 494, 239, 487, 495, - - 16, 512, 528, 17, 513, 529, 18, 514, - 530, 19, 515, 531, 20, 516, 532, 21, - 517, 533, 22, 518, 534, 23, 519, 535, - 24, 520, 536, 25, 521, 537, 26, 522, - 538, 27, 523, 539, 28, 524, 540, 29, - 525, 541, 30, 526, 542, 31, 527, 543, - 48, 544, 560, 49, 545, 561, 50, 546, - 562, 51, 547, 563, 52, 548, 564, 53, - 549, 565, 54, 550, 566, 55, 551, 567, - 56, 552, 568, 57, 553, 569, 58, 554, - 570, 59, 555, 571, 60, 556, 572, 61, - 557, 573, 62, 558, 574, 63, 559, 575, - 80, 576, 592, 81, 577, 593, 82, 578, - 594, 83, 579, 595, 84, 580, 596, 85, - 581, 597, 86, 582, 598, 87, 583, 599, - 88, 584, 600, 89, 585, 601, 90, 586, - 602, 91, 587, 603, 92, 588, 604, 93, - 589, 605, 94, 590, 606, 95, 591, 607, - 112, 608, 624, 113, 609, 625, 114, 610, - 626, 115, 611, 627, 116, 612, 628, 117, - 613, 629, 118, 614, 630, 119, 615, 631, - 120, 616, 632, 121, 617, 633, 122, 618, - 634, 123, 619, 635, 124, 620, 636, 125, - 621, 637, 126, 622, 638, 127, 623, 639, - 144, 640, 656, 145, 641, 657, 146, 642, - 658, 147, 643, 659, 148, 644, 660, 149, - 645, 661, 150, 646, 662, 151, 647, 663, - 152, 648, 664, 153, 649, 665, 154, 650, - 666, 155, 651, 667, 156, 652, 668, 157, - 653, 669, 158, 654, 670, 159, 655, 671, - 176, 672, 688, 177, 673, 689, 178, 674, - 690, 179, 675, 691, 180, 676, 692, 181, - 677, 693, 182, 678, 694, 183, 679, 695, - 184, 680, 696, 185, 681, 697, 186, 682, - 698, 187, 683, 699, 188, 684, 700, 189, - 685, 701, 190, 686, 702, 191, 687, 703, - 208, 704, 720, 209, 705, 721, 210, 706, - 722, 211, 707, 723, 212, 708, 724, 213, - 709, 725, 214, 710, 726, 215, 711, 727, - 216, 712, 728, 217, 713, 729, 218, 714, - 730, 219, 715, 731, 220, 716, 732, 221, - 717, 733, 222, 718, 734, 223, 719, 735, - 240, 736, 752, 241, 737, 753, 242, 738, - 754, 243, 739, 755, 244, 740, 756, 245, - 741, 757, 246, 742, 758, 247, 743, 759, - 248, 744, 760, 249, 745, 761, 250, 746, - 762, 251, 747, 763, 252, 748, 764, 253, - 749, 765, 254, 750, 766, 255, 751, 767, - 272, 768, 784, 273, 769, 785, 274, 770, - 786, 275, 771, 787, 276, 772, 788, 277, - 773, 789, 278, 774, 790, 279, 775, 791, - 280, 776, 792, 281, 777, 793, 282, 778, - 794, 283, 779, 795, 284, 780, 796, 285, - 781, 797, 286, 782, 798, 287, 783, 799, - 304, 800, 816, 305, 801, 817, 306, 802, - 818, 307, 803, 819, 308, 804, 820, 309, - 805, 821, 310, 806, 822, 311, 807, 823, - 312, 808, 824, 313, 809, 825, 314, 810, - 826, 315, 811, 827, 316, 812, 828, 317, - 813, 829, 318, 814, 830, 319, 815, 831, - 336, 832, 848, 337, 833, 849, 338, 834, - 850, 339, 835, 851, 340, 836, 852, 341, - 837, 853, 342, 838, 854, 343, 839, 855, - 344, 840, 856, 345, 841, 857, 346, 842, - 858, 347, 843, 859, 348, 844, 860, 349, - 845, 861, 350, 846, 862, 351, 847, 863, - 368, 864, 880, 369, 865, 881, 370, 866, - 882, 371, 867, 883, 372, 868, 884, 373, - 869, 885, 374, 870, 886, 375, 871, 887, - 376, 872, 888, 377, 873, 889, 378, 874, - 890, 379, 875, 891, 380, 876, 892, 381, - 877, 893, 382, 878, 894, 383, 879, 895, - 400, 896, 912, 401, 897, 913, 402, 898, - 914, 403, 899, 915, 404, 900, 916, 405, - 901, 917, 406, 902, 918, 407, 903, 919, - 408, 904, 920, 409, 905, 921, 410, 906, - 922, 411, 907, 923, 412, 908, 924, 413, - 909, 925, 414, 910, 926, 415, 911, 927, - 432, 928, 944, 433, 929, 945, 434, 930, - 946, 435, 931, 947, 436, 932, 948, 437, - 933, 949, 438, 934, 950, 439, 935, 951, - 440, 936, 952, 441, 937, 953, 442, 938, - 954, 443, 939, 955, 444, 940, 956, 445, - 941, 957, 446, 942, 958, 447, 943, 959, - 464, 960, 976, 465, 961, 977, 466, 962, - 978, 467, 963, 979, 468, 964, 980, 469, - 965, 981, 470, 966, 982, 471, 967, 983, - 472, 968, 984, 473, 969, 985, 474, 970, - 986, 475, 971, 987, 476, 972, 988, 477, - 973, 989, 478, 974, 990, 479, 975, 991, - 496, 992, 1008, 497, 993, 1009, 498, 994, - 1010, 499, 995, 1011, 500, 996, 1012, 501, - 997, 1013, 502, 998, 1014, 503, 999, 1015, - 504, 1000, 1016, 505, 1001, 1017, 506, 1002, - 1018, 507, 1003, 1019, 508, 1004, 1020, 509, - 1005, 1021, 510, 1006, 1022, 511, 1007, 1023, -}; -#endif - -#else - DECLARE_ALIGNED(16, const int, vp9_coef_bands_32x32[1024]) = { 0, 1, 2, 3, 5, 4, 4, 5, 5, 3, 6, 3, 5, 4, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, @@ -865,7 +247,6 @@ DECLARE_ALIGNED(16, const int, vp9_default_zig_zag1d_32x32[1024]) = { 951, 920, 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890, 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767, 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893, 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895, 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023, }; -#endif // CONFIG_DWTDCTHYBRID /* Array indices are identical to previously-existing CONTEXT_NODE indices */ diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h index 74fce7aad..c42aab1a5 100644 --- a/vp9/common/vp9_findnearmv.h +++ b/vp9/common/vp9_findnearmv.h @@ -98,7 +98,7 @@ static int left_block_mv(const MACROBLOCKD *xd, b += 4; } - return (cur_mb->bmi + b - 1)->as_mv.first.as_int; + return (cur_mb->bmi + b - 1)->as_mv[0].as_int; } static int left_block_second_mv(const MACROBLOCKD *xd, @@ -117,8 +117,8 @@ static int left_block_second_mv(const MACROBLOCKD *xd, } return cur_mb->mbmi.second_ref_frame > 0 ? - (cur_mb->bmi + b - 1)->as_mv.second.as_int : - (cur_mb->bmi + b - 1)->as_mv.first.as_int; + (cur_mb->bmi + b - 1)->as_mv[1].as_int : + (cur_mb->bmi + b - 1)->as_mv[0].as_int; } static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { @@ -131,7 +131,7 @@ static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { b += 16; } - return (cur_mb->bmi + b - 4)->as_mv.first.as_int; + return (cur_mb->bmi + b - 4)->as_mv[0].as_int; } static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) { @@ -146,8 +146,8 @@ static int above_block_second_mv(const MODE_INFO *cur_mb, int b, int mi_stride) } return cur_mb->mbmi.second_ref_frame > 0 ? - (cur_mb->bmi + b - 4)->as_mv.second.as_int : - (cur_mb->bmi + b - 4)->as_mv.first.as_int; + (cur_mb->bmi + b - 4)->as_mv[1].as_int : + (cur_mb->bmi + b - 4)->as_mv[0].as_int; } static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 680a20627..01e8ea3c2 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -50,6 +50,14 @@ static const int cospi_29_64 = 2404; static const int cospi_30_64 = 1606; static const int cospi_31_64 = 804; +#if CONFIG_INTHT4X4 +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const int sinpi_1_9 = 5283; +static const int sinpi_2_9 = 9929; +static const int sinpi_3_9 = 13377; +static const int sinpi_4_9 = 15212; +#endif + static INLINE int dct_const_round_shift(int input) { int rv = (input + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; assert((rv <= INT16_MAX) && (rv >= INT16_MIN)); diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c index b27b34cf2..548805726 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idctllm.c @@ -494,7 +494,6 @@ void vp9_dc_only_inv_walsh_add_c(short input_dc, uint8_t *pred_ptr, } #endif - void idct4_1d(int16_t *input, int16_t *output) { int16_t step[4]; int temp1, temp2; @@ -651,6 +650,100 @@ void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) { } } +#if CONFIG_INTHT4X4 +static void iadst4_1d(int16_t *input, int16_t *output) { + int x0, x1, x2, x3; + int s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = input[0]; + x1 = input[1]; + x2 = input[2]; + x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} + +void vp9_short_iht4x4_c(int16_t *input, int16_t *output, + int pitch, TX_TYPE tx_type) { + int16_t out[16]; + int16_t *outptr = &out[0]; + const int short_pitch = pitch >> 1; + int i, j; + int16_t temp_in[4], temp_out[4]; + + void (*invr)(int16_t*, int16_t*); + void (*invc)(int16_t*, int16_t*); + + switch (tx_type) { + case ADST_ADST: + invc = &iadst4_1d; + invr = &iadst4_1d; + break; + case ADST_DCT: + invc = &iadst4_1d; + invr = &idct4_1d; + break; + case DCT_ADST: + invc = &idct4_1d; + invr = &iadst4_1d; + break; + case DCT_DCT: + invc = &idct4_1d; + invr = &idct4_1d; + break; + default: + assert(0); + } + + // inverse transform row vectors + for (i = 0; i < 4; ++i) { + invr(input, outptr); + input += 4; + outptr += 4; + } + + // inverse transform column vectors + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + invc(temp_in, temp_out); + for (j = 0; j < 4; ++j) + output[j * short_pitch + i] = (temp_out[j] + 8) >> 4; + } +} +#endif + #if CONFIG_INTHT static void iadst8_1d(int16_t *input, int16_t *output) { int x0, x1, x2, x3, x4, x5, x6, x7; @@ -733,7 +826,7 @@ static void iadst8_1d(int16_t *input, int16_t *output) { } void vp9_short_iht8x8_c(int16_t *input, int16_t *output, - TX_TYPE tx_type, int pitch) { + int pitch, TX_TYPE tx_type) { int16_t out[8 * 8]; int16_t *outptr = &out[0]; const int short_pitch = pitch >> 1; @@ -1059,8 +1152,6 @@ void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) { *output = (out + 32) >> 6; } - -#if !CONFIG_DWTDCTHYBRID void idct32_1d(int16_t *input, int16_t *output) { int16_t step1[32], step2[32]; int temp1, temp2; @@ -1428,7 +1519,6 @@ void idct32_1d(int16_t *input, int16_t *output) { output[31] = step1[0] - step1[31]; } - void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { int16_t out[32 * 32]; int16_t *outptr = &out[0]; @@ -1461,792 +1551,3 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { out = dct_const_round_shift(tmp); *output = (out + 32) >> 6; } - -#else // !CONFIG_DWTDCTHYBRID - -#if DWT_TYPE == 53 - -// Note: block length must be even for this implementation -static void synthesis_53_row(int length, int16_t *lowpass, int16_t *highpass, - int16_t *x) { - int16_t r, *a, *b; - int n; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ -= (r + (*b) + 1) >> 1; - r = *b++; - } - - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - *x++ = ((r = *a++) + 1) >> 1; - *x++ = *b++ + ((r + (*a) + 2) >> 2); - } - *x++ = ((r = *a) + 1) >> 1; - *x++ = *b + ((r + 1) >> 1); -} - -static void synthesis_53_col(int length, int16_t *lowpass, int16_t *highpass, - int16_t *x) { - int16_t r, *a, *b; - int n; - - n = length >> 1; - b = highpass; - a = lowpass; - r = *highpass; - while (n--) { - *a++ -= (r + (*b) + 1) >> 1; - r = *b++; - } - - n = length >> 1; - b = highpass; - a = lowpass; - while (--n) { - r = *a++; - *x++ = r; - *x++ = ((*b++) << 1) + ((r + (*a) + 1) >> 1); - } - *x++ = *a; - *x++ = ((*b) << 1) + *a; -} - -static void dyadic_synthesize_53(int levels, int width, int height, int16_t *c, - int pitch_c, int16_t *x, int pitch_x) { - int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; - short buffer[2 * DWT_MAX_LENGTH]; - - th[0] = hh; - tw[0] = hw; - for (i = 1; i <= levels; i++) { - th[i] = (th[i - 1] + 1) >> 1; - tw[i] = (tw[i - 1] + 1) >> 1; - } - for (lv = levels - 1; lv >= 0; lv--) { - nh = th[lv]; - nw = tw[lv]; - hh = th[lv + 1]; - hw = tw[lv + 1]; - if ((nh < 2) || (nw < 2)) continue; - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i] = c[i * pitch_c + j]; - synthesis_53_col(nh, buffer, buffer + hh, buffer + nh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i + nh]; - } - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer)); - synthesis_53_row(nw, buffer, buffer + hw, &c[i * pitch_c]); - } - } - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ? - ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) : - -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS); - } - } -} - -#elif DWT_TYPE == 26 - -// Note: block length must be even for this implementation -static void synthesis_26_row(int length, int16_t *lowpass, int16_t *highpass, - int16_t *x) { - int16_t r, s, *a, *b; - int i, n = length >> 1; - - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ += (r - a[1] + 4) >> 3; - r = *a++; - } - *b += (r - *a + 4) >> 3; - } - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - s = *b++; - r = *a++; - *x++ = (r + s + 1) >> 1; - *x++ = (r - s + 1) >> 1; - } -} - -static void synthesis_26_col(int length, int16_t *lowpass, int16_t *highpass, - int16_t *x) { - int16_t r, s, *a, *b; - int i, n = length >> 1; - - if (n >= 4) { - a = lowpass; - b = highpass; - r = *lowpass; - while (--n) { - *b++ += (r - a[1] + 4) >> 3; - r = *a++; - } - *b += (r - *a + 4) >> 3; - } - a = lowpass; - b = highpass; - for (i = length >> 1; i; i--) { - s = *b++; - r = *a++; - *x++ = r + s; - *x++ = r - s; - } -} - -static void dyadic_synthesize_26(int levels, int width, int height, int16_t *c, - int pitch_c, int16_t *x, int pitch_x) { - int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; - int16_t buffer[2 * DWT_MAX_LENGTH]; - - th[0] = hh; - tw[0] = hw; - for (i = 1; i <= levels; i++) { - th[i] = (th[i - 1] + 1) >> 1; - tw[i] = (tw[i - 1] + 1) >> 1; - } - for (lv = levels - 1; lv >= 0; lv--) { - nh = th[lv]; - nw = tw[lv]; - hh = th[lv + 1]; - hw = tw[lv + 1]; - if ((nh < 2) || (nw < 2)) continue; - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i] = c[i * pitch_c + j]; - synthesis_26_col(nh, buffer, buffer + hh, buffer + nh); - for (i = 0; i < nh; i++) - c[i * pitch_c + j] = buffer[i + nh]; - } - for (i = 0; i < nh; i++) { - memcpy(buffer, &c[i * pitch_c], nw * sizeof(*buffer)); - synthesis_26_row(nw, buffer, buffer + hw, &c[i * pitch_c]); - } - } - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - x[i * pitch_x + j] = c[i * pitch_c + j] >= 0 ? - ((c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS) : - -((-c[i * pitch_c + j] + DWT_PRECISION_RND) >> DWT_PRECISION_BITS); - } - } -} - -#elif DWT_TYPE == 97 - -static void synthesis_97(int length, double *lowpass, double *highpass, - double *x) { - static const double a_predict1 = -1.586134342; - static const double a_update1 = -0.05298011854; - static const double a_predict2 = 0.8829110762; - static const double a_update2 = 0.4435068522; - static const double s_low = 1.149604398; - static const double s_high = 1/1.149604398; - static const double inv_s_low = 1 / s_low; - static const double inv_s_high = 1 / s_high; - int i; - double y[DWT_MAX_LENGTH]; - // Undo pack and scale - for (i = 0; i < length / 2; i++) { - y[i * 2] = lowpass[i] * inv_s_low; - y[i * 2 + 1] = highpass[i] * inv_s_high; - } - memcpy(x, y, sizeof(*y) * length); - // Undo update 2 - for (i = 2; i < length; i += 2) { - x[i] -= a_update2 * (x[i-1] + x[i+1]); - } - x[0] -= 2 * a_update2 * x[1]; - // Undo predict 2 - for (i = 1; i < length - 2; i += 2) { - x[i] -= a_predict2 * (x[i - 1] + x[i + 1]); - } - x[length - 1] -= 2 * a_predict2 * x[length - 2]; - // Undo update 1 - for (i = 2; i < length; i += 2) { - x[i] -= a_update1 * (x[i - 1] + x[i + 1]); - } - x[0] -= 2 * a_update1 * x[1]; - // Undo predict 1 - for (i = 1; i < length - 2; i += 2) { - x[i] -= a_predict1 * (x[i - 1] + x[i + 1]); - } - x[length - 1] -= 2 * a_predict1 * x[length - 2]; -} - -static void dyadic_synthesize_97(int levels, int width, int height, int16_t *c, - int pitch_c, int16_t *x, int pitch_x) { - int th[16], tw[16], lv, i, j, nh, nw, hh = height, hw = width; - double buffer[2 * DWT_MAX_LENGTH]; - double y[DWT_MAX_LENGTH * DWT_MAX_LENGTH]; - - th[0] = hh; - tw[0] = hw; - for (i = 1; i <= levels; i++) { - th[i] = (th[i - 1] + 1) >> 1; - tw[i] = (tw[i - 1] + 1) >> 1; - } - for (lv = levels - 1; lv >= 0; lv--) { - nh = th[lv]; - nw = tw[lv]; - hh = th[lv + 1]; - hw = tw[lv + 1]; - if ((nh < 2) || (nw < 2)) continue; - for (j = 0; j < nw; j++) { - for (i = 0; i < nh; i++) - buffer[i] = c[i * pitch_c + j]; - synthesis_97(nh, buffer, buffer + hh, buffer + nh); - for (i = 0; i < nh; i++) - y[i * DWT_MAX_LENGTH + j] = buffer[i + nh]; - } - for (i = 0; i < nh; i++) { - memcpy(buffer, &y[i * DWT_MAX_LENGTH], nw * sizeof(*buffer)); - synthesis_97(nw, buffer, buffer + hw, &y[i * DWT_MAX_LENGTH]); - } - } - for (i = 0; i < height; i++) - for (j = 0; j < width; j++) - x[i * pitch_x + j] = round(y[i * DWT_MAX_LENGTH + j] / - (1 << DWT_PRECISION_BITS)); -} - -#endif // DWT_TYPE - -// TODO(debargha): Implement scaling differently so as not to have to use the -// floating point 16x16 dct -static void butterfly_16x16_idct_1d_f(double input[16], double output[16]) { - static const double C1 = 0.995184726672197; - static const double C2 = 0.98078528040323; - static const double C3 = 0.956940335732209; - static const double C4 = 0.923879532511287; - static const double C5 = 0.881921264348355; - static const double C6 = 0.831469612302545; - static const double C7 = 0.773010453362737; - static const double C8 = 0.707106781186548; - static const double C9 = 0.634393284163646; - static const double C10 = 0.555570233019602; - static const double C11 = 0.471396736825998; - static const double C12 = 0.38268343236509; - static const double C13 = 0.290284677254462; - static const double C14 = 0.195090322016128; - static const double C15 = 0.098017140329561; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double step[16]; - double intermediate[16]; - double temp1, temp2; - - - // step 1 and 2 - step[ 0] = input[0] + input[8]; - step[ 1] = input[0] - input[8]; - - temp1 = input[4]*C12; - temp2 = input[12]*C4; - - temp1 -= temp2; - temp1 *= C8; - - step[ 2] = 2*(temp1); - - temp1 = input[4]*C4; - temp2 = input[12]*C12; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - step[ 3] = 2*(temp1); - - temp1 = input[2]*C8; - temp1 = 2*(temp1); - temp2 = input[6] + input[10]; - - step[ 4] = temp1 + temp2; - step[ 5] = temp1 - temp2; - - temp1 = input[14]*C8; - temp1 = 2*(temp1); - temp2 = input[6] - input[10]; - - step[ 6] = temp2 - temp1; - step[ 7] = temp2 + temp1; - - // for odd input - temp1 = input[3]*C12; - temp2 = input[13]*C4; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - intermediate[ 8] = 2*(temp1); - - temp1 = input[3]*C4; - temp2 = input[13]*C12; - temp2 -= temp1; - temp2 = (temp2); - temp2 *= C8; - intermediate[ 9] = 2*(temp2); - - intermediate[10] = 2*(input[9]*C8); - intermediate[11] = input[15] - input[1]; - intermediate[12] = input[15] + input[1]; - intermediate[13] = 2*((input[7]*C8)); - - temp1 = input[11]*C12; - temp2 = input[5]*C4; - temp2 -= temp1; - temp2 = (temp2); - temp2 *= C8; - intermediate[14] = 2*(temp2); - - temp1 = input[11]*C4; - temp2 = input[5]*C12; - temp1 += temp2; - temp1 = (temp1); - temp1 *= C8; - intermediate[15] = 2*(temp1); - - step[ 8] = intermediate[ 8] + intermediate[14]; - step[ 9] = intermediate[ 9] + intermediate[15]; - step[10] = intermediate[10] + intermediate[11]; - step[11] = intermediate[10] - intermediate[11]; - step[12] = intermediate[12] + intermediate[13]; - step[13] = intermediate[12] - intermediate[13]; - step[14] = intermediate[ 8] - intermediate[14]; - step[15] = intermediate[ 9] - intermediate[15]; - - // step 3 - output[0] = step[ 0] + step[ 3]; - output[1] = step[ 1] + step[ 2]; - output[2] = step[ 1] - step[ 2]; - output[3] = step[ 0] - step[ 3]; - - temp1 = step[ 4]*C14; - temp2 = step[ 7]*C2; - temp1 -= temp2; - output[4] = (temp1); - - temp1 = step[ 4]*C2; - temp2 = step[ 7]*C14; - temp1 += temp2; - output[7] = (temp1); - - temp1 = step[ 5]*C10; - temp2 = step[ 6]*C6; - temp1 -= temp2; - output[5] = (temp1); - - temp1 = step[ 5]*C6; - temp2 = step[ 6]*C10; - temp1 += temp2; - output[6] = (temp1); - - output[8] = step[ 8] + step[11]; - output[9] = step[ 9] + step[10]; - output[10] = step[ 9] - step[10]; - output[11] = step[ 8] - step[11]; - output[12] = step[12] + step[15]; - output[13] = step[13] + step[14]; - output[14] = step[13] - step[14]; - output[15] = step[12] - step[15]; - - // output 4 - step[ 0] = output[0] + output[7]; - step[ 1] = output[1] + output[6]; - step[ 2] = output[2] + output[5]; - step[ 3] = output[3] + output[4]; - step[ 4] = output[3] - output[4]; - step[ 5] = output[2] - output[5]; - step[ 6] = output[1] - output[6]; - step[ 7] = output[0] - output[7]; - - temp1 = output[8]*C7; - temp2 = output[15]*C9; - temp1 -= temp2; - step[ 8] = (temp1); - - temp1 = output[9]*C11; - temp2 = output[14]*C5; - temp1 += temp2; - step[ 9] = (temp1); - - temp1 = output[10]*C3; - temp2 = output[13]*C13; - temp1 -= temp2; - step[10] = (temp1); - - temp1 = output[11]*C15; - temp2 = output[12]*C1; - temp1 += temp2; - step[11] = (temp1); - - temp1 = output[11]*C1; - temp2 = output[12]*C15; - temp2 -= temp1; - step[12] = (temp2); - - temp1 = output[10]*C13; - temp2 = output[13]*C3; - temp1 += temp2; - step[13] = (temp1); - - temp1 = output[9]*C5; - temp2 = output[14]*C11; - temp2 -= temp1; - step[14] = (temp2); - - temp1 = output[8]*C9; - temp2 = output[15]*C7; - temp1 += temp2; - step[15] = (temp1); - - // step 5 - output[0] = (step[0] + step[15]); - output[1] = (step[1] + step[14]); - output[2] = (step[2] + step[13]); - output[3] = (step[3] + step[12]); - output[4] = (step[4] + step[11]); - output[5] = (step[5] + step[10]); - output[6] = (step[6] + step[ 9]); - output[7] = (step[7] + step[ 8]); - - output[15] = (step[0] - step[15]); - output[14] = (step[1] - step[14]); - output[13] = (step[2] - step[13]); - output[12] = (step[3] - step[12]); - output[11] = (step[4] - step[11]); - output[10] = (step[5] - step[10]); - output[9] = (step[6] - step[ 9]); - output[8] = (step[7] - step[ 8]); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -static void vp9_short_idct16x16_c_f(int16_t *input, int16_t *output, int pitch, - int scale) { - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - double out[16*16], out2[16*16]; - const int short_pitch = pitch >> 1; - int i, j; - // First transform rows - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = input[j + i*short_pitch]; - butterfly_16x16_idct_1d_f(temp_in, temp_out); - for (j = 0; j < 16; ++j) - out[j + i*16] = temp_out[j]; - } - // Then transform columns - for (i = 0; i < 16; ++i) { - double temp_in[16], temp_out[16]; - for (j = 0; j < 16; ++j) - temp_in[j] = out[j*16 + i]; - butterfly_16x16_idct_1d_f(temp_in, temp_out); - for (j = 0; j < 16; ++j) - out2[j*16 + i] = temp_out[j]; - } - for (i = 0; i < 16*16; ++i) - output[i] = round(out2[i] / (128 >> scale)); - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -static void idct8_1d_f(double *x) { - int i, j; - double t[8]; - static const double idctmat[64] = { - 0.35355339059327, 0.49039264020162, 0.46193976625564, 0.41573480615127, - 0.35355339059327, 0.2777851165098, 0.19134171618254, 0.097545161008064, - 0.35355339059327, 0.41573480615127, 0.19134171618254, -0.097545161008064, - -0.35355339059327, -0.49039264020161, -0.46193976625564, -0.2777851165098, - 0.35355339059327, 0.2777851165098, -0.19134171618254, -0.49039264020162, - -0.35355339059327, 0.097545161008064, 0.46193976625564, 0.41573480615127, - 0.35355339059327, 0.097545161008063, -0.46193976625564, -0.2777851165098, - 0.35355339059327, 0.41573480615127, -0.19134171618254, -0.49039264020162, - 0.35355339059327, -0.097545161008063, -0.46193976625564, 0.2777851165098, - 0.35355339059327, -0.41573480615127, -0.19134171618255, 0.49039264020162, - 0.35355339059327, -0.2777851165098, -0.19134171618254, 0.49039264020161, - -0.35355339059327, -0.097545161008064, 0.46193976625564, -0.41573480615127, - 0.35355339059327, -0.41573480615127, 0.19134171618254, 0.097545161008065, - -0.35355339059327, 0.49039264020162, -0.46193976625564, 0.2777851165098, - 0.35355339059327, -0.49039264020162, 0.46193976625564, -0.41573480615127, - 0.35355339059327, -0.2777851165098, 0.19134171618255, -0.097545161008064 - }; - for (i = 0; i < 8; ++i) { - t[i] = 0; - for (j = 0; j < 8; ++j) - t[i] += idctmat[i * 8 + j] * x[j]; - } - for (i = 0; i < 8; ++i) { - x[i] = t[i]; - } -} - -static void vp9_short_idct8x8_c_f(int16_t *coefs, int16_t *block, int pitch, - int scale) { - double X[8 * 8], Y[8]; - int i, j; - int shortpitch = pitch >> 1; - - vp9_clear_system_state(); // Make it simd safe : __asm emms; - { - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - X[i * 8 + j] = (double)coefs[i * shortpitch + j]; - } - } - for (i = 0; i < 8; i++) - idct8_1d_f(X + 8 * i); - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; ++j) - Y[j] = X[i + 8 * j]; - idct8_1d_f(Y); - for (j = 0; j < 8; ++j) - X[i + 8 * j] = Y[j]; - } - for (i = 0; i < 8; i++) { - for (j = 0; j < 8; j++) { - block[i * 8 + j] = (int16_t)round(X[i * 8 + j] / (8 >> scale)); - } - } - } - vp9_clear_system_state(); // Make it simd safe : __asm emms; -} - -#define multiply_bits(d, n) ((n) < 0 ? (d) >> (n) : (d) << (n)) - -#if DWTDCT_TYPE == DWTDCT16X16_LEAN - -void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { - // assume output is a 32x32 buffer - // Temporary buffer to hold a 16x16 block for 16x16 inverse dct - int16_t buffer[16 * 16]; - // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt - int16_t buffer2[32 * 32]; - // Note: pitch is in bytes, short_pitch is in short units - const int short_pitch = pitch >> 1; - int i, j; - - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the idct16x16 function - vp9_short_idct16x16_c_f(input, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16); - } - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - buffer2[i * 32 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); - } - } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - buffer2[i * 32 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); - } - } -#if DWT_TYPE == 26 - dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 97 - dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 53 - dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32); -#endif -} - -#elif DWTDCT_TYPE == DWTDCT16X16 - -void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { - // assume output is a 32x32 buffer - // Temporary buffer to hold a 16x16 block for 16x16 inverse dct - int16_t buffer[16 * 16]; - // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt - int16_t buffer2[32 * 32]; - // Note: pitch is in bytes, short_pitch is in short units - const int short_pitch = pitch >> 1; - int i, j; - - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the idct16x16 function - vp9_short_idct16x16_c_f(input, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32, buffer + i * 16, sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32 + 16, buffer + i * 16, sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32 + 16 * 32, buffer + i * 16, - sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 32 + 16 * 33, buffer + i * 16, - sizeof(*buffer2) * 16); - } -#if DWT_TYPE == 26 - dyadic_synthesize_26(1, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 97 - dyadic_synthesize_97(1, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 53 - dyadic_synthesize_53(1, 32, 32, buffer2, 32, output, 32); -#endif -} - -#elif DWTDCT_TYPE == DWTDCT8X8 - -void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) { - // assume output is a 32x32 buffer - // Temporary buffer to hold a 16x16 block for 16x16 inverse dct - int16_t buffer[8 * 8]; - // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt - int16_t buffer2[32 * 32]; - // Note: pitch is in bytes, short_pitch is in short units - const int short_pitch = pitch >> 1; - int i, j; - - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the idct16x16 function - vp9_short_idct8x8_c_f(input, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) { - vpx_memcpy(buffer2 + i * 32, buffer + i * 8, sizeof(*buffer2) * 8); - } - vp9_short_idct8x8_c_f(input + 8, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) { - vpx_memcpy(buffer2 + i * 32 + 8, buffer + i * 8, sizeof(*buffer2) * 8); - } - vp9_short_idct8x8_c_f(input + 8 * short_pitch, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) { - vpx_memcpy(buffer2 + i * 32 + 8 * 32, buffer + i * 8, - sizeof(*buffer2) * 8); - } - vp9_short_idct8x8_c_f(input + 8 * short_pitch + 8, buffer, pitch, - 1 + DWT_PRECISION_BITS); - for (i = 0; i < 8; ++i) { - vpx_memcpy(buffer2 + i * 32 + 8 * 33, buffer + i * 8, - sizeof(*buffer2) * 8); - } - for (i = 0; i < 16; ++i) { - for (j = 16; j < 32; ++j) { - buffer2[i * 32 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); - } - } - for (i = 16; i < 32; ++i) { - for (j = 0; j < 32; ++j) { - buffer2[i * 32 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 2); - } - } -#if DWT_TYPE == 26 - dyadic_synthesize_26(2, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 97 - dyadic_synthesize_97(2, 32, 32, buffer2, 32, output, 32); -#elif DWT_TYPE == 53 - dyadic_synthesize_53(2, 32, 32, buffer2, 32, output, 32); -#endif -} - -#endif - -#if CONFIG_TX64X64 -void vp9_short_idct64x64_c(int16_t *input, int16_t *output, int pitch) { - // assume output is a 64x64 buffer - // Temporary buffer to hold a 16x16 block for 16x16 inverse dct - int16_t buffer[16 * 16]; - // Temporary buffer to hold a 32x32 block for inverse 32x32 dwt - int16_t buffer2[64 * 64]; - // Note: pitch is in bytes, short_pitch is in short units - const int short_pitch = pitch >> 1; - int i, j; - - // TODO(debargha): Implement more efficiently by adding output pitch - // argument to the idct16x16 function - vp9_short_idct16x16_c_f(input, buffer, pitch, - 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 64, buffer + i * 16, sizeof(*buffer2) * 16); - } -#if DWTDCT_TYPE == DWTDCT16X16_LEAN - for (i = 0; i < 16; ++i) { - for (j = 16; j < 64; ++j) { - buffer2[i * 64 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 16; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - buffer2[i * 64 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); - } - } -#elif DWTDCT_TYPE == DWTDCT16X16 - vp9_short_idct16x16_c_f(input + 16, buffer, pitch, - 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 64 + 16, buffer + i * 16, sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16 * short_pitch, buffer, pitch, - 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 64 + 16 * 64, buffer + i * 16, - sizeof(*buffer2) * 16); - } - vp9_short_idct16x16_c_f(input + 16 * short_pitch + 16, buffer, pitch, - 2 + DWT_PRECISION_BITS); - for (i = 0; i < 16; ++i) { - vpx_memcpy(buffer2 + i * 64 + 16 * 65, buffer + i * 16, - sizeof(*buffer2) * 16); - } - - // Copying and scaling highest bands into buffer2 - for (i = 0; i < 32; ++i) { - for (j = 32; j < 64; ++j) { - buffer2[i * 64 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); - } - } - for (i = 32; i < 64; ++i) { - for (j = 0; j < 64; ++j) { - buffer2[i * 64 + j] = - multiply_bits(input[i * short_pitch + j], DWT_PRECISION_BITS - 1); - } - } -#endif // DWTDCT_TYPE - -#if DWT_TYPE == 26 - dyadic_synthesize_26(2, 64, 64, buffer2, 64, output, 64); -#elif DWT_TYPE == 97 - dyadic_synthesize_97(2, 64, 64, buffer2, 64, output, 64); -#elif DWT_TYPE == 53 - dyadic_synthesize_53(2, 64, 64, buffer2, 64, output, 64); -#endif -} -#endif // CONFIG_TX64X64 -#endif // !CONFIG_DWTDCTHYBRID diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c index 241a5bcb7..233ffd8a7 100644 --- a/vp9/common/vp9_invtrans.c +++ b/vp9/common/vp9_invtrans.c @@ -51,8 +51,13 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) { for (i = 0; i < 16; i++) { TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]); if (tx_type != DCT_DCT) { +#if CONFIG_INTHT4X4 + vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, + 32, tx_type); +#else vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 4, xd->block[i].eob); +#endif } else { vp9_inverse_transform_b_4x4(xd, i, 32); } @@ -93,7 +98,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { if (tx_type != DCT_DCT) { #if CONFIG_INTHT vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, - tx_type, 32); + 32, tx_type); #else vp9_ihtllm(xd->block[i].dqcoeff, xd->block[i].diff, 32, tx_type, 8, xd->block[i].eob); @@ -108,7 +113,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { if (tx_type != DCT_DCT) { #if CONFIG_INTHT vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff, - tx_type, 32); + 32, tx_type); #else vp9_ihtllm(xd->block[i + 2].dqcoeff, xd->block[i].diff, 32, tx_type, 8, xd->block[i + 2].eob); diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 5e57228b4..a333a4b02 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -279,7 +279,7 @@ typedef struct VP9Common { int error_resilient_mode; int frame_parallel_decoding_mode; - int tile_columns; + int tile_columns, log2_tile_columns; int cur_tile_mb_col_start, cur_tile_mb_col_end, cur_tile_idx; } VP9_COMMON; diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index d4435d872..b75525e2c 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -154,7 +154,7 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, int_mv mv; ptr_base = *(d->base_pre); - mv.as_int = d->bmi.as_mv.first.as_int; + mv.as_int = d->bmi.as_mv[0].as_int; ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); @@ -179,7 +179,7 @@ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, int_mv mv; ptr_base = *(d->base_second_pre); - mv.as_int = d->bmi.as_mv.second.as_int; + mv.as_int = d->bmi.as_mv[1].as_int; ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); @@ -197,7 +197,7 @@ void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { int_mv mv; ptr_base = *(d->base_pre); - mv.as_int = d->bmi.as_mv.first.as_int; + mv.as_int = d->bmi.as_mv[0].as_int; ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); @@ -222,7 +222,7 @@ void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd, int_mv mv; ptr_base = *(d->base_second_pre); - mv.as_int = d->bmi.as_mv.second.as_int; + mv.as_int = d->bmi.as_mv[1].as_int; ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); @@ -240,7 +240,7 @@ static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { int_mv mv; ptr_base = *(d->base_pre); - mv.as_int = d->bmi.as_mv.first.as_int; + mv.as_int = d->bmi.as_mv[0].as_int; ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); @@ -264,38 +264,38 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) { int voffset = 20 + i * 2 + j; int temp; - temp = blockd[yoffset ].bmi.as_mv.first.as_mv.row - + blockd[yoffset + 1].bmi.as_mv.first.as_mv.row - + blockd[yoffset + 4].bmi.as_mv.first.as_mv.row - + blockd[yoffset + 5].bmi.as_mv.first.as_mv.row; + temp = blockd[yoffset ].bmi.as_mv[0].as_mv.row + + blockd[yoffset + 1].bmi.as_mv[0].as_mv.row + + blockd[yoffset + 4].bmi.as_mv[0].as_mv.row + + blockd[yoffset + 5].bmi.as_mv[0].as_mv.row; if (temp < 0) temp -= 4; else temp += 4; - xd->block[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) & + xd->block[uoffset].bmi.as_mv[0].as_mv.row = (temp / 8) & xd->fullpixel_mask; - temp = blockd[yoffset ].bmi.as_mv.first.as_mv.col - + blockd[yoffset + 1].bmi.as_mv.first.as_mv.col - + blockd[yoffset + 4].bmi.as_mv.first.as_mv.col - + blockd[yoffset + 5].bmi.as_mv.first.as_mv.col; + temp = blockd[yoffset ].bmi.as_mv[0].as_mv.col + + blockd[yoffset + 1].bmi.as_mv[0].as_mv.col + + blockd[yoffset + 4].bmi.as_mv[0].as_mv.col + + blockd[yoffset + 5].bmi.as_mv[0].as_mv.col; if (temp < 0) temp -= 4; else temp += 4; - blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) & + blockd[uoffset].bmi.as_mv[0].as_mv.col = (temp / 8) & xd->fullpixel_mask; - blockd[voffset].bmi.as_mv.first.as_mv.row = - blockd[uoffset].bmi.as_mv.first.as_mv.row; - blockd[voffset].bmi.as_mv.first.as_mv.col = - blockd[uoffset].bmi.as_mv.first.as_mv.col; + blockd[voffset].bmi.as_mv[0].as_mv.row = + blockd[uoffset].bmi.as_mv[0].as_mv.row; + blockd[voffset].bmi.as_mv[0].as_mv.col = + blockd[uoffset].bmi.as_mv[0].as_mv.col; if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - temp = blockd[yoffset ].bmi.as_mv.second.as_mv.row - + blockd[yoffset + 1].bmi.as_mv.second.as_mv.row - + blockd[yoffset + 4].bmi.as_mv.second.as_mv.row - + blockd[yoffset + 5].bmi.as_mv.second.as_mv.row; + temp = blockd[yoffset ].bmi.as_mv[1].as_mv.row + + blockd[yoffset + 1].bmi.as_mv[1].as_mv.row + + blockd[yoffset + 4].bmi.as_mv[1].as_mv.row + + blockd[yoffset + 5].bmi.as_mv[1].as_mv.row; if (temp < 0) { temp -= 4; @@ -303,13 +303,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) { temp += 4; } - blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) & + blockd[uoffset].bmi.as_mv[1].as_mv.row = (temp / 8) & xd->fullpixel_mask; - temp = blockd[yoffset ].bmi.as_mv.second.as_mv.col - + blockd[yoffset + 1].bmi.as_mv.second.as_mv.col - + blockd[yoffset + 4].bmi.as_mv.second.as_mv.col - + blockd[yoffset + 5].bmi.as_mv.second.as_mv.col; + temp = blockd[yoffset ].bmi.as_mv[1].as_mv.col + + blockd[yoffset + 1].bmi.as_mv[1].as_mv.col + + blockd[yoffset + 4].bmi.as_mv[1].as_mv.col + + blockd[yoffset + 5].bmi.as_mv[1].as_mv.col; if (temp < 0) { temp -= 4; @@ -317,13 +317,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) { temp += 4; } - blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) & + blockd[uoffset].bmi.as_mv[1].as_mv.col = (temp / 8) & xd->fullpixel_mask; - blockd[voffset].bmi.as_mv.second.as_mv.row = - blockd[uoffset].bmi.as_mv.second.as_mv.row; - blockd[voffset].bmi.as_mv.second.as_mv.col = - blockd[uoffset].bmi.as_mv.second.as_mv.col; + blockd[voffset].bmi.as_mv[1].as_mv.row = + blockd[uoffset].bmi.as_mv[1].as_mv.row; + blockd[voffset].bmi.as_mv[1].as_mv.col = + blockd[uoffset].bmi.as_mv[1].as_mv.col; } } } @@ -332,7 +332,7 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) { BLOCKD *d0 = &blockd[i]; BLOCKD *d1 = &blockd[i + 1]; - if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) + if (d0->bmi.as_mv[0].as_int == d1->bmi.as_mv[0].as_int) build_inter_predictors2b(xd, d0, 8); else { vp9_build_inter_predictors_b(d0, 8, &xd->subpix); @@ -717,15 +717,15 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { blockd[10].bmi = xd->mode_info_context->bmi[10]; if (mbmi->need_to_clamp_mvs) { - clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.first.as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv[0].as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv[0].as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv[0].as_mv, xd); + clamp_mv_to_umv_border(&blockd[10].bmi.as_mv[0].as_mv, xd); if (mbmi->second_ref_frame > 0) { - clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[10].bmi.as_mv.second.as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 0].bmi.as_mv[1].as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 2].bmi.as_mv[1].as_mv, xd); + clamp_mv_to_umv_border(&blockd[ 8].bmi.as_mv[1].as_mv, xd); + clamp_mv_to_umv_border(&blockd[10].bmi.as_mv[1].as_mv, xd); } } @@ -750,15 +750,15 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { blockd[i + 1].bmi = xd->mode_info_context->bmi[i + 1]; if (mbmi->need_to_clamp_mvs) { - clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.first.as_mv, xd); - clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.first.as_mv, xd); + clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[0].as_mv, xd); + clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv[0].as_mv, xd); if (mbmi->second_ref_frame > 0) { - clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv.second.as_mv, xd); - clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv.second.as_mv, xd); + clamp_mv_to_umv_border(&blockd[i + 0].bmi.as_mv[1].as_mv, xd); + clamp_mv_to_umv_border(&blockd[i + 1].bmi.as_mv[1].as_mv, xd); } } - if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) + if (d0->bmi.as_mv[0].as_int == d1->bmi.as_mv[0].as_int) build_inter_predictors2b(xd, d0, 16); else { vp9_build_inter_predictors_b(d0, 16, &xd->subpix); @@ -776,7 +776,7 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { BLOCKD *d0 = &blockd[i]; BLOCKD *d1 = &blockd[i + 1]; - if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) + if (d0->bmi.as_mv[0].as_int == d1->bmi.as_mv[0].as_int) build_inter_predictors2b(xd, d0, 8); else { vp9_build_inter_predictors_b(d0, 8, &xd->subpix); @@ -803,44 +803,44 @@ void build_4x4uvmvs(MACROBLOCKD *xd) { int temp; - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.row - + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.row - + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.row - + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.row; + temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[0].as_mv.row + + xd->mode_info_context->bmi[yoffset + 1].as_mv[0].as_mv.row + + xd->mode_info_context->bmi[yoffset + 4].as_mv[0].as_mv.row + + xd->mode_info_context->bmi[yoffset + 5].as_mv[0].as_mv.row; if (temp < 0) temp -= 4; else temp += 4; - blockd[uoffset].bmi.as_mv.first.as_mv.row = (temp / 8) & + blockd[uoffset].bmi.as_mv[0].as_mv.row = (temp / 8) & xd->fullpixel_mask; - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.first.as_mv.col - + xd->mode_info_context->bmi[yoffset + 1].as_mv.first.as_mv.col - + xd->mode_info_context->bmi[yoffset + 4].as_mv.first.as_mv.col - + xd->mode_info_context->bmi[yoffset + 5].as_mv.first.as_mv.col; + temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[0].as_mv.col + + xd->mode_info_context->bmi[yoffset + 1].as_mv[0].as_mv.col + + xd->mode_info_context->bmi[yoffset + 4].as_mv[0].as_mv.col + + xd->mode_info_context->bmi[yoffset + 5].as_mv[0].as_mv.col; if (temp < 0) temp -= 4; else temp += 4; - blockd[uoffset].bmi.as_mv.first.as_mv.col = (temp / 8) & + blockd[uoffset].bmi.as_mv[0].as_mv.col = (temp / 8) & xd->fullpixel_mask; // if (x->mode_info_context->mbmi.need_to_clamp_mvs) - clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd); + clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv[0].as_mv, xd); // if (x->mode_info_context->mbmi.need_to_clamp_mvs) - clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv.first.as_mv, xd); + clamp_uvmv_to_umv_border(&blockd[uoffset].bmi.as_mv[0].as_mv, xd); - blockd[voffset].bmi.as_mv.first.as_mv.row = - blockd[uoffset].bmi.as_mv.first.as_mv.row; - blockd[voffset].bmi.as_mv.first.as_mv.col = - blockd[uoffset].bmi.as_mv.first.as_mv.col; + blockd[voffset].bmi.as_mv[0].as_mv.row = + blockd[uoffset].bmi.as_mv[0].as_mv.row; + blockd[voffset].bmi.as_mv[0].as_mv.col = + blockd[uoffset].bmi.as_mv[0].as_mv.col; if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.row - + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.row - + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.row - + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.row; + temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[1].as_mv.row + + xd->mode_info_context->bmi[yoffset + 1].as_mv[1].as_mv.row + + xd->mode_info_context->bmi[yoffset + 4].as_mv[1].as_mv.row + + xd->mode_info_context->bmi[yoffset + 5].as_mv[1].as_mv.row; if (temp < 0) { temp -= 4; @@ -848,13 +848,13 @@ void build_4x4uvmvs(MACROBLOCKD *xd) { temp += 4; } - blockd[uoffset].bmi.as_mv.second.as_mv.row = (temp / 8) & + blockd[uoffset].bmi.as_mv[1].as_mv.row = (temp / 8) & xd->fullpixel_mask; - temp = xd->mode_info_context->bmi[yoffset + 0].as_mv.second.as_mv.col - + xd->mode_info_context->bmi[yoffset + 1].as_mv.second.as_mv.col - + xd->mode_info_context->bmi[yoffset + 4].as_mv.second.as_mv.col - + xd->mode_info_context->bmi[yoffset + 5].as_mv.second.as_mv.col; + temp = xd->mode_info_context->bmi[yoffset + 0].as_mv[1].as_mv.col + + xd->mode_info_context->bmi[yoffset + 1].as_mv[1].as_mv.col + + xd->mode_info_context->bmi[yoffset + 4].as_mv[1].as_mv.col + + xd->mode_info_context->bmi[yoffset + 5].as_mv[1].as_mv.col; if (temp < 0) { temp -= 4; @@ -862,21 +862,21 @@ void build_4x4uvmvs(MACROBLOCKD *xd) { temp += 4; } - blockd[uoffset].bmi.as_mv.second.as_mv.col = (temp / 8) & + blockd[uoffset].bmi.as_mv[1].as_mv.col = (temp / 8) & xd->fullpixel_mask; // if (mbmi->need_to_clamp_mvs) clamp_uvmv_to_umv_border( - &blockd[uoffset].bmi.as_mv.second.as_mv, xd); + &blockd[uoffset].bmi.as_mv[1].as_mv, xd); // if (mbmi->need_to_clamp_mvs) clamp_uvmv_to_umv_border( - &blockd[uoffset].bmi.as_mv.second.as_mv, xd); + &blockd[uoffset].bmi.as_mv[1].as_mv, xd); - blockd[voffset].bmi.as_mv.second.as_mv.row = - blockd[uoffset].bmi.as_mv.second.as_mv.row; - blockd[voffset].bmi.as_mv.second.as_mv.col = - blockd[uoffset].bmi.as_mv.second.as_mv.col; + blockd[voffset].bmi.as_mv[1].as_mv.row = + blockd[uoffset].bmi.as_mv[1].as_mv.row; + blockd[voffset].bmi.as_mv[1].as_mv.col = + blockd[uoffset].bmi.as_mv[1].as_mv.col; } } } diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 02f8b6614..3bd1f250f 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -300,10 +300,15 @@ prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 #if CONFIG_INTHT -prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int tx_type, int pitch" +prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type" specialize vp9_short_iht8x8 #endif +#if CONFIG_INTHT4X4 +prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type" +specialize vp9_short_iht4x4 +#endif + prototype void vp9_ihtllm "const int16_t *input, int16_t *output, int pitch, int tx_type, int tx_dim, int16_t eobs" specialize vp9_ihtllm diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c new file mode 100644 index 000000000..02e0d1461 --- /dev/null +++ b/vp9/common/vp9_tile_common.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_tile_common.h" + +void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off, + int *max_tile_off) { + const int log2_n_tiles = cm->log2_tile_columns; + const int tile_idx = cm->cur_tile_idx; + const int mb_cols = cm->mb_cols; + const int sb_cols = (mb_cols + 3) >> 2; + const int sb_off1 = (tile_idx * sb_cols) >> log2_n_tiles; + const int sb_off2 = ((tile_idx + 1) * sb_cols) >> log2_n_tiles; + + *min_tile_off = (sb_off1 << 2) > mb_cols ? mb_cols : (sb_off1 << 2); + *max_tile_off = (sb_off2 << 2) > mb_cols ? mb_cols : (sb_off2 << 2); +} + +#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6) +#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6) + +void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr, + int *delta_log2_n_tiles) { + const int sb_cols = (cm->mb_cols + 3) >> 2; + int min_log2_n_tiles, max_log2_n_tiles; + + for (max_log2_n_tiles = 0; + (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS; + max_log2_n_tiles++) {} + for (min_log2_n_tiles = 0; + (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols; + min_log2_n_tiles++) {} + + *min_log2_n_tiles_ptr = min_log2_n_tiles; + *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles; +} diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h new file mode 100644 index 000000000..653b6b4f6 --- /dev/null +++ b/vp9/common/vp9_tile_common.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_TILE_COMMON_H_ +#define VP9_COMMON_VP9_TILE_COMMON_H_ + +#include "vp9/common/vp9_onyxc_int.h" + +#define MIN_TILE_WIDTH 256 +#define MAX_TILE_WIDTH 4096 + +extern void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off, + int *max_tile_off); + +extern void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles, + int *delta_log2_n_tiles); + +#endif // VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index 3e2346f29..fbc95b6ce 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -65,6 +65,20 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); +void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, @@ -87,6 +101,14 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, dst += 8; w -= 8; } + while (w >= 4) { + vp9_filter_block1d4_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } } if (w) { vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, @@ -117,6 +139,14 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, dst += 8; w -= 8; } + while (w >= 4) { + vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } } if (w) { vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, @@ -156,6 +186,15 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, h, filter_y); return; } + if (w == 4) { + vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d4_v8_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } } vp9_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index c6d65e904..5f039454a 100644 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -30,6 +30,124 @@ ; unsigned int output_height, ; short *filter ;) +global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.vp9_filter_block1d4_v8_ssse3_loop: + movd xmm0, [rsi] ;A + movd xmm1, [rsi + rdx] ;B + movd xmm2, [rsi + rdx * 2] ;C + movd xmm3, [rax + rdx * 2] ;D + movd xmm4, [rsi + rdx * 4] ;E + movd xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movd xmm6, [rsi + rbx] ;G + movd xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm2 + paddsw xmm0, krd + paddsw xmm4, xmm6 + paddsw xmm0, xmm4 + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + add rax, rdx + + movd [rdi], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .vp9_filter_block1d4_v8_ssse3_loop + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE sym(vp9_filter_block1d8_v8_ssse3): push rbp @@ -289,6 +407,110 @@ sym(vp9_filter_block1d16_v8_ssse3): pop rbp ret +;void vp9_filter_block1d4_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + +.filter_block1d4_h8_rowloop_ssse3: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd + psraw xmm0, 7 + packuswb xmm0, xmm0 + + lea rsi, [rsi + rax] + movd [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .filter_block1d4_h8_rowloop_ssse3 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + ;void vp9_filter_block1d8_h8_ssse3 ;( ; unsigned char *src_ptr, @@ -340,7 +562,7 @@ sym(vp9_filter_block1d8_h8_ssse3): pshufd xmm5, xmm5, 0 movdqa k4k5, xmm2 movdqa k6k7, xmm3 -; movdqa krd, xmm5 + movdqa krd, xmm5 movsxd rax, dword ptr arg(1) ;src_pixels_per_line movsxd rdx, dword ptr arg(3) ;output_pitch @@ -349,10 +571,7 @@ sym(vp9_filter_block1d8_h8_ssse3): .filter_block1d8_h8_rowloop_ssse3: movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 punpcklqdq xmm0, xmm3 movdqa xmm1, xmm0 @@ -371,9 +590,9 @@ sym(vp9_filter_block1d8_h8_ssse3): pmaddubsw xmm4, k6k7 paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 @@ -456,10 +675,7 @@ sym(vp9_filter_block1d16_h8_ssse3): .filter_block1d16_h8_rowloop_ssse3: movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 punpcklqdq xmm0, xmm3 movdqa xmm1, xmm0 @@ -486,10 +702,7 @@ sym(vp9_filter_block1d16_h8_ssse3): movq xmm3, [rsi + 5] -; movq xmm7, [rsi + 12] movq xmm7, [rsi + 13] -;note: same as above -; punpcklbw xmm3, xmm7 punpcklqdq xmm3, xmm7 movdqa xmm1, xmm3 @@ -508,9 +721,9 @@ sym(vp9_filter_block1d16_h8_ssse3): pmaddubsw xmm4, k6k7 paddsw xmm3, xmm1 + paddsw xmm3, xmm4 paddsw xmm3, xmm2 paddsw xmm3, krd - paddsw xmm3, xmm4 psraw xmm3, 7 packuswb xmm3, xmm3 punpcklqdq xmm0, xmm3 |