From c8874f74a7a29729a77ccba5790bf2c71f583f15 Mon Sep 17 00:00:00 2001 From: Johann Date: Sat, 17 Sep 2022 08:47:28 +0900 Subject: quantize: increase iscan by 1 All of the assembly adds 1 to iscan to convert from a 0 based array to the EOB value. Add 1 to all iscan values and remove the extra instructions from the assembly. Change-Id: I219dd7f2bd10533ab24b206289565703176dc5e9 --- vp9/common/vp9_scan.c | 293 +++++++++++++------------- vp9/encoder/arm/neon/vp9_quantize_neon.c | 4 +- vp9/encoder/x86/vp9_quantize_avx2.c | 14 +- vp9/encoder/x86/vp9_quantize_sse2.c | 6 - vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 4 - vpx_dsp/arm/highbd_quantize_neon.c | 18 +- vpx_dsp/arm/quantize_neon.c | 18 +- vpx_dsp/loongarch/quantize_lsx.c | 13 +- vpx_dsp/ppc/quantize_vsx.c | 24 +-- vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 5 +- vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 8 +- vpx_dsp/x86/quantize_avx.c | 12 +- vpx_dsp/x86/quantize_avx2.c | 11 +- vpx_dsp/x86/quantize_sse2.c | 5 +- vpx_dsp/x86/quantize_sse2.h | 8 +- vpx_dsp/x86/quantize_ssse3.c | 11 +- 16 files changed, 197 insertions(+), 257 deletions(-) diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c index 0fef26351..8bea61dea 100644 --- a/vp9/common/vp9_scan.c +++ b/vp9/common/vp9_scan.c @@ -511,180 +511,181 @@ DECLARE_ALIGNED(16, static const int16_t, 959, 990, 991, 1022, 0, 0, }; +// Add 1 to iscan values. This represents the EOB position instead of the index. DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = { - 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15, + 1, 3, 6, 9, 2, 4, 10, 13, 5, 8, 12, 15, 7, 11, 14, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = { - 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15, + 1, 4, 8, 12, 2, 6, 10, 13, 3, 7, 11, 15, 5, 9, 14, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = { - 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15, + 1, 2, 4, 6, 3, 5, 7, 10, 8, 9, 12, 14, 11, 13, 15, 16, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = { - 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51, - 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56, - 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60, - 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63, + 1, 4, 9, 16, 23, 33, 41, 48, 2, 6, 12, 19, 27, 35, 45, 52, + 3, 8, 14, 21, 29, 39, 47, 55, 5, 11, 17, 25, 32, 42, 51, 57, + 7, 13, 22, 28, 36, 44, 53, 59, 10, 18, 26, 34, 40, 49, 56, 61, + 15, 24, 31, 38, 46, 54, 60, 63, 20, 30, 37, 43, 50, 58, 62, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = { - 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39, - 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52, - 18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59, - 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63, + 1, 2, 3, 6, 9, 13, 20, 25, 4, 5, 8, 11, 16, 21, 31, 40, + 7, 10, 14, 17, 22, 28, 38, 47, 12, 15, 18, 24, 29, 35, 45, 53, + 19, 23, 26, 32, 36, 42, 51, 58, 27, 30, 34, 39, 44, 50, 56, 60, + 33, 37, 43, 48, 52, 55, 61, 62, 41, 46, 49, 54, 57, 59, 63, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = { - 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44, - 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53, - 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60, - 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63, + 1, 3, 6, 10, 15, 23, 32, 38, 2, 5, 9, 14, 20, 27, 39, 45, + 4, 7, 
11, 18, 25, 31, 43, 50, 8, 12, 16, 22, 30, 37, 48, 54, + 13, 17, 21, 28, 35, 44, 53, 58, 19, 24, 29, 36, 42, 49, 57, 61, + 26, 33, 40, 46, 51, 56, 60, 63, 34, 41, 47, 52, 55, 59, 62, 64, }; DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = { - 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198, - 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212, - 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216, - 3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218, - 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223, - 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228, - 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230, - 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235, - 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237, - 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240, - 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244, - 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247, - 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251, - 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253, - 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254, - 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255, + 1, 5, 12, 21, 32, 44, 60, 76, 86, 110, 131, 151, 166, 182, 196, 199, + 2, 7, 15, 24, 35, 48, 65, 82, 96, 115, 136, 154, 172, 189, 202, 213, + 3, 9, 17, 26, 39, 53, 68, 84, 102, 117, 137, 158, 173, 191, 206, 217, + 4, 11, 19, 30, 42, 56, 72, 90, 104, 120, 142, 160, 177, 195, 209, 219, + 6, 13, 22, 33, 46, 59, 75, 94, 105, 124, 145, 165, 180, 197, 211, 224, + 8, 16, 27, 38, 50, 64, 79, 97, 113, 130, 147, 167, 183, 201, 216, 229, + 10, 20, 29, 40, 55, 70, 87, 103, 118, 133, 152, 171, 188, 207, 221, 231, + 14, 25, 36, 47, 61, 74, 92, 109, 123, 138, 155, 175, 190, 208, 225, 236, + 18, 31, 41, 54, 67, 83, 99, 116, 127, 143, 162, 181, 198, 214, 228, 238, + 23, 37, 49, 63, 77, 93, 106, 121, 134, 148, 168, 187, 204, 220, 233, 241, + 28, 45, 57, 71, 85, 100, 114, 128, 141, 157, 176, 194, 210, 227, 237, 245, + 34, 52, 69, 80, 95, 111, 126, 139, 150, 163, 185, 203, 218, 230, 242, 248, + 43, 62, 78, 91, 107, 122, 135, 149, 161, 174, 192, 212, 226, 239, 246, 252, + 51, 73, 88, 101, 119, 129, 146, 159, 169, 184, 205, 223, 234, 243, 250, 254, + 58, 81, 98, 112, 132, 144, 156, 170, 179, 193, 215, 232, 240, 247, 251, 255, + 66, 89, 108, 125, 140, 153, 164, 178, 186, 200, 222, 235, 244, 249, 253, 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = { - 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76, - 86, 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99, - 115, 130, 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103, - 119, 142, 167, 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100, - 116, 135, 161, 185, 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94, - 112, 133, 154, 179, 205, 28, 34, 39, 45, 50, 58, 67, 77, 87, 96, - 106, 121, 146, 169, 196, 212, 41, 46, 49, 56, 63, 70, 79, 90, 98, - 107, 122, 138, 159, 182, 207, 222, 52, 57, 62, 69, 75, 83, 93, 102, - 110, 120, 134, 150, 176, 195, 215, 226, 66, 71, 78, 82, 91, 97, 108, - 113, 127, 136, 148, 168, 188, 202, 221, 232, 80, 89, 92, 101, 105, 114, - 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95, 104, 109, 117, 123, - 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 
129, - 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137, - 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149, - 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152, - 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253, - 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254, - 255, + 1, 2, 3, 5, 7, 10, 13, 18, 23, 30, 37, 44, 55, 65, 77, + 87, 4, 6, 8, 12, 16, 20, 26, 33, 39, 49, 60, 69, 85, 100, + 116, 131, 9, 11, 14, 19, 24, 28, 34, 43, 52, 61, 73, 89, 104, + 120, 143, 168, 15, 17, 21, 27, 32, 38, 45, 54, 62, 74, 86, 101, + 117, 136, 162, 186, 22, 25, 31, 36, 41, 48, 56, 66, 75, 82, 95, + 113, 134, 155, 180, 206, 29, 35, 40, 46, 51, 59, 68, 78, 88, 97, + 107, 122, 147, 170, 197, 213, 42, 47, 50, 57, 64, 71, 80, 91, 99, + 108, 123, 139, 160, 183, 208, 223, 53, 58, 63, 70, 76, 84, 94, 103, + 111, 121, 135, 151, 177, 196, 216, 227, 67, 72, 79, 83, 92, 98, 109, + 114, 128, 137, 149, 169, 189, 203, 222, 233, 81, 90, 93, 102, 106, 115, + 126, 132, 140, 152, 163, 178, 193, 209, 224, 235, 96, 105, 110, 118, 124, + 129, 144, 145, 156, 166, 176, 191, 207, 220, 234, 240, 112, 119, 125, 130, + 141, 148, 158, 165, 171, 182, 192, 204, 225, 231, 241, 244, 127, 133, 138, + 146, 154, 161, 175, 179, 185, 198, 205, 217, 232, 238, 245, 247, 142, 150, + 157, 167, 173, 181, 190, 200, 201, 211, 221, 229, 239, 243, 250, 252, 153, + 164, 172, 184, 187, 194, 202, 212, 215, 219, 228, 237, 246, 248, 253, 254, + 159, 174, 188, 195, 199, 210, 214, 218, 226, 230, 236, 242, 249, 251, 255, + 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = { - 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166, - 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154, - 178, 196, 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148, - 164, 186, 201, 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127, - 153, 169, 193, 208, 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114, - 133, 161, 176, 198, 214, 15, 21, 26, 34, 43, 52, 65, 77, 91, 106, - 120, 140, 165, 185, 205, 221, 22, 27, 32, 41, 48, 60, 73, 85, 99, - 116, 130, 151, 175, 190, 211, 225, 29, 35, 42, 49, 59, 69, 81, 95, - 108, 125, 139, 155, 182, 197, 217, 229, 38, 45, 51, 61, 68, 80, 93, - 105, 118, 134, 150, 168, 191, 207, 223, 234, 50, 56, 63, 74, 83, 94, - 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62, 70, 76, 87, 97, - 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75, 82, 90, 102, - 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89, 100, 111, - 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115, - 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121, - 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254, - 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253, - 255, + 1, 3, 6, 10, 18, 25, 37, 45, 56, 73, 89, 105, 129, 144, 167, + 180, 2, 5, 9, 14, 21, 31, 41, 55, 67, 80, 97, 114, 142, 155, + 179, 197, 4, 8, 12, 19, 26, 34, 47, 58, 72, 87, 102, 120, 149, + 165, 187, 202, 7, 13, 17, 24, 32, 40, 54, 65, 79, 93, 111, 128, + 154, 170, 194, 209, 11, 15, 20, 29, 38, 48, 59, 68, 85, 99, 115, + 134, 162, 177, 199, 215, 16, 22, 27, 35, 44, 53, 66, 78, 92, 107, + 121, 141, 166, 186, 206, 222, 23, 28, 33, 42, 49, 61, 74, 86, 100, + 117, 131, 152, 176, 191, 212, 226, 30, 36, 43, 50, 60, 70, 82, 96, + 109, 126, 140, 156, 183, 198, 218, 230, 39, 46, 52, 62, 69, 81, 94, + 106, 119, 135, 151, 169, 192, 208, 224, 235, 51, 57, 64, 75, 
84, 95, + 110, 118, 130, 148, 164, 178, 200, 214, 229, 239, 63, 71, 77, 88, 98, + 108, 123, 132, 146, 160, 173, 189, 211, 223, 236, 243, 76, 83, 91, 103, + 113, 125, 139, 147, 158, 174, 188, 203, 220, 231, 241, 246, 90, 101, 112, + 124, 133, 143, 157, 168, 181, 190, 204, 217, 232, 238, 247, 251, 104, 116, + 127, 137, 150, 163, 172, 184, 195, 205, 216, 225, 237, 242, 249, 253, 122, + 136, 145, 159, 171, 182, 193, 201, 210, 219, 228, 234, 244, 245, 252, 255, + 138, 153, 161, 175, 185, 196, 207, 213, 221, 227, 233, 240, 248, 250, 254, + 256, }; DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { - 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, - 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356, - 377, 405, 455, 471, 495, 527, 1, 4, 8, 15, 22, 30, 45, - 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, 228, 234, 237, - 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, 3, - 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189, - 208, 217, 224, 231, 235, 238, 273, 297, 316, 329, 375, 403, 425, - 440, 493, 525, 550, 567, 6, 11, 16, 23, 31, 43, 60, 73, - 90, 109, 126, 150, 173, 196, 211, 220, 226, 232, 236, 239, 296, - 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, 9, 14, - 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214, - 223, 244, 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523, - 582, 596, 617, 645, 13, 20, 26, 35, 44, 54, 72, 85, 105, - 123, 140, 163, 182, 205, 216, 225, 254, 271, 294, 314, 353, 373, - 400, 423, 468, 491, 522, 548, 595, 616, 644, 666, 21, 27, 33, - 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227, - 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615, - 643, 665, 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139, - 159, 178, 197, 212, 221, 230, 292, 312, 326, 334, 398, 421, 437, - 446, 520, 546, 564, 574, 642, 664, 679, 687, 34, 40, 46, 56, - 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, 340, - 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705, - 723, 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177, - 194, 252, 268, 290, 311, 351, 370, 396, 420, 466, 488, 518, 545, - 593, 613, 640, 663, 704, 722, 746, 765, 51, 59, 66, 76, 89, - 99, 119, 131, 149, 168, 181, 200, 267, 289, 310, 325, 369, 395, - 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, 745, 764, - 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207, - 288, 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638, - 661, 677, 686, 744, 763, 776, 783, 70, 79, 86, 97, 108, 122, - 137, 155, 242, 251, 266, 287, 339, 350, 368, 393, 452, 465, 486, - 515, 580, 592, 611, 637, 692, 703, 720, 743, 788, 798, 813, 833, - 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, 308, 349, - 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719, - 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169, - 185, 264, 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561, - 609, 635, 659, 676, 718, 741, 761, 775, 811, 831, 847, 858, 117, - 128, 136, 148, 160, 175, 188, 198, 284, 306, 322, 332, 390, 415, - 433, 444, 512, 540, 560, 572, 634, 658, 675, 685, 740, 760, 774, - 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, 249, 263, 283, - 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, 691, - 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166, - 174, 183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510, - 539, 589, 607, 632, 657, 700, 716, 738, 759, 795, 809, 828, 845, - 874, 886, 902, 915, 176, 187, 195, 202, 261, 281, 304, 321, 363, - 387, 413, 432, 481, 
509, 538, 559, 606, 631, 656, 674, 715, 737, - 758, 773, 808, 827, 844, 856, 885, 901, 914, 923, 192, 199, 206, - 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, 558, 571, - 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900, - 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461, - 480, 507, 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807, - 825, 866, 873, 884, 899, 930, 936, 945, 957, 246, 259, 278, 302, - 345, 361, 384, 411, 460, 479, 506, 536, 587, 604, 628, 654, 698, - 713, 734, 756, 793, 806, 824, 842, 872, 883, 898, 912, 935, 944, - 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, 478, 505, 535, - 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, 854, - 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382, - 409, 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754, - 770, 780, 822, 840, 853, 861, 896, 910, 920, 926, 954, 964, 971, - 975, 336, 344, 359, 381, 449, 459, 477, 503, 577, 586, 602, 625, - 689, 697, 711, 731, 785, 792, 804, 821, 865, 871, 881, 895, 929, - 934, 942, 953, 977, 981, 987, 995, 343, 358, 380, 408, 458, 476, - 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, 803, 820, - 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001, - 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709, - 729, 752, 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951, - 962, 970, 985, 993, 1000, 1005, 378, 406, 427, 441, 500, 531, 554, - 569, 622, 649, 669, 682, 728, 751, 768, 779, 818, 837, 851, 860, - 892, 907, 918, 925, 950, 961, 969, 974, 992, 999, 1004, 1007, 448, - 457, 474, 499, 576, 584, 599, 621, 688, 695, 708, 727, 784, 790, - 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, 984, - 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648, - 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931, - 938, 948, 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497, - 529, 553, 597, 619, 647, 668, 706, 725, 749, 767, 799, 815, 835, - 850, 876, 889, 905, 917, 937, 947, 959, 968, 982, 989, 997, 1003, - 1011, 1015, 1019, 1022, 496, 528, 552, 568, 618, 646, 667, 681, 724, - 748, 766, 778, 814, 834, 849, 859, 888, 904, 916, 924, 946, 958, - 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023, + 1, 3, 6, 11, 18, 26, 39, 48, 63, 84, 102, 122, 146, + 171, 194, 205, 211, 220, 230, 234, 246, 258, 276, 300, 343, 357, + 378, 406, 456, 472, 496, 528, 2, 5, 9, 16, 23, 31, 46, + 59, 75, 93, 113, 134, 159, 185, 204, 216, 223, 229, 235, 238, + 257, 275, 299, 318, 356, 377, 405, 427, 471, 495, 527, 552, 4, + 8, 13, 19, 29, 37, 53, 65, 83, 103, 119, 143, 165, 190, + 209, 218, 225, 232, 236, 239, 274, 298, 317, 330, 376, 404, 426, + 441, 494, 526, 551, 568, 7, 12, 17, 24, 32, 44, 61, 74, + 91, 110, 127, 151, 174, 197, 212, 221, 227, 233, 237, 240, 297, + 316, 329, 336, 403, 425, 440, 448, 525, 550, 567, 576, 10, 15, + 20, 30, 38, 51, 66, 79, 96, 117, 135, 158, 180, 202, 215, + 224, 245, 256, 273, 296, 342, 355, 375, 402, 455, 470, 493, 524, + 583, 597, 618, 646, 14, 21, 27, 36, 45, 55, 73, 86, 106, + 124, 141, 164, 183, 206, 217, 226, 255, 272, 295, 315, 354, 374, + 401, 424, 469, 492, 523, 549, 596, 617, 645, 667, 22, 28, 34, + 43, 54, 64, 81, 95, 114, 133, 152, 173, 191, 210, 219, 228, + 271, 294, 314, 328, 373, 400, 423, 439, 491, 522, 548, 566, 616, + 644, 666, 681, 25, 33, 40, 49, 58, 72, 89, 105, 121, 140, + 160, 179, 198, 213, 222, 231, 293, 313, 327, 335, 399, 422, 438, + 447, 521, 547, 565, 575, 643, 665, 680, 688, 35, 41, 47, 57, + 69, 82, 97, 112, 131, 148, 
168, 187, 244, 254, 270, 292, 341, + 353, 372, 398, 454, 468, 490, 520, 582, 595, 615, 642, 694, 706, + 724, 748, 42, 50, 56, 68, 78, 92, 108, 125, 139, 162, 178, + 195, 253, 269, 291, 312, 352, 371, 397, 421, 467, 489, 519, 546, + 594, 614, 641, 664, 705, 723, 747, 766, 52, 60, 67, 77, 90, + 100, 120, 132, 150, 169, 182, 201, 268, 290, 311, 326, 370, 396, + 420, 437, 488, 518, 545, 564, 613, 640, 663, 679, 722, 746, 765, + 778, 62, 70, 76, 88, 101, 115, 130, 145, 163, 181, 192, 208, + 289, 310, 325, 334, 395, 419, 436, 446, 517, 544, 563, 574, 639, + 662, 678, 687, 745, 764, 777, 784, 71, 80, 87, 98, 109, 123, + 138, 156, 243, 252, 267, 288, 340, 351, 369, 394, 453, 466, 487, + 516, 581, 593, 612, 638, 693, 704, 721, 744, 789, 799, 814, 834, + 85, 94, 104, 111, 126, 142, 155, 172, 251, 266, 287, 309, 350, + 368, 393, 418, 465, 486, 515, 543, 592, 611, 637, 661, 703, 720, + 743, 763, 798, 813, 833, 849, 99, 107, 116, 128, 144, 157, 170, + 186, 265, 286, 308, 324, 367, 392, 417, 435, 485, 514, 542, 562, + 610, 636, 660, 677, 719, 742, 762, 776, 812, 832, 848, 859, 118, + 129, 137, 149, 161, 176, 189, 199, 285, 307, 323, 333, 391, 416, + 434, 445, 513, 541, 561, 573, 635, 659, 676, 686, 741, 761, 775, + 783, 831, 847, 858, 864, 136, 147, 153, 166, 242, 250, 264, 284, + 339, 349, 366, 390, 452, 464, 484, 512, 580, 591, 609, 634, 692, + 702, 718, 740, 788, 797, 811, 830, 868, 876, 888, 904, 154, 167, + 175, 184, 249, 263, 283, 306, 348, 365, 389, 415, 463, 483, 511, + 540, 590, 608, 633, 658, 701, 717, 739, 760, 796, 810, 829, 846, + 875, 887, 903, 916, 177, 188, 196, 203, 262, 282, 305, 322, 364, + 388, 414, 433, 482, 510, 539, 560, 607, 632, 657, 675, 716, 738, + 759, 774, 809, 828, 845, 857, 886, 902, 915, 924, 193, 200, 207, + 214, 281, 304, 321, 332, 387, 413, 432, 444, 509, 538, 559, 572, + 631, 656, 674, 685, 737, 758, 773, 782, 827, 844, 856, 863, 901, + 914, 923, 928, 241, 248, 261, 280, 338, 347, 363, 386, 451, 462, + 481, 508, 579, 589, 606, 630, 691, 700, 715, 736, 787, 795, 808, + 826, 867, 874, 885, 900, 931, 937, 946, 958, 247, 260, 279, 303, + 346, 362, 385, 412, 461, 480, 507, 537, 588, 605, 629, 655, 699, + 714, 735, 757, 794, 807, 825, 843, 873, 884, 899, 913, 936, 945, + 957, 967, 259, 278, 302, 320, 361, 384, 411, 431, 479, 506, 536, + 558, 604, 628, 654, 673, 713, 734, 756, 772, 806, 824, 842, 855, + 883, 898, 912, 922, 944, 956, 966, 973, 277, 301, 319, 331, 383, + 410, 430, 443, 505, 535, 557, 571, 627, 653, 672, 684, 733, 755, + 771, 781, 823, 841, 854, 862, 897, 911, 921, 927, 955, 965, 972, + 976, 337, 345, 360, 382, 450, 460, 478, 504, 578, 587, 603, 626, + 690, 698, 712, 732, 786, 793, 805, 822, 866, 872, 882, 896, 930, + 935, 943, 954, 978, 982, 988, 996, 344, 359, 381, 409, 459, 477, + 503, 534, 586, 602, 625, 652, 697, 711, 731, 754, 792, 804, 821, + 840, 871, 881, 895, 910, 934, 942, 953, 964, 981, 987, 995, 1002, + 358, 380, 408, 429, 476, 502, 533, 556, 601, 624, 651, 671, 710, + 730, 753, 770, 803, 820, 839, 853, 880, 894, 909, 920, 941, 952, + 963, 971, 986, 994, 1001, 1006, 379, 407, 428, 442, 501, 532, 555, + 570, 623, 650, 670, 683, 729, 752, 769, 780, 819, 838, 852, 861, + 893, 908, 919, 926, 951, 962, 970, 975, 993, 1000, 1005, 1008, 449, + 458, 475, 500, 577, 585, 600, 622, 689, 696, 709, 728, 785, 791, + 802, 818, 865, 870, 879, 892, 929, 933, 940, 950, 977, 980, 985, + 992, 1009, 1011, 1014, 1018, 457, 474, 499, 531, 584, 599, 621, 649, + 695, 708, 727, 751, 790, 801, 817, 837, 869, 878, 891, 907, 932, + 939, 949, 961, 979, 984, 991, 999, 1010, 
1013, 1017, 1021, 473, 498, + 530, 554, 598, 620, 648, 669, 707, 726, 750, 768, 800, 816, 836, + 851, 877, 890, 906, 918, 938, 948, 960, 969, 983, 990, 998, 1004, + 1012, 1016, 1020, 1023, 497, 529, 553, 569, 619, 647, 668, 682, 725, + 749, 767, 779, 815, 835, 850, 860, 889, 905, 917, 925, 947, 959, + 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024, }; const scan_order vp9_default_scan_orders[TX_SIZES] = { diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 945fd522e..b9bd1eba3 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -45,9 +45,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, int16x8_t v_eobmax, uint16x8_t v_nz_mask) { const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1)); - const int16x8_t v_nz_iscan = - vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan_plus1); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan); return vmaxq_s16(v_eobmax, v_nz_iscan); } diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 15ce71c5c..da285be8e 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -43,16 +43,13 @@ static VPX_FORCE_INLINE void load_fp_values_avx2( static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, __m256i v_mask) { - const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #if CONFIG_VP9_HIGHBITDEPTH - // typedef int32_t tran_low_t; - const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); #else - // typedef int16_t tran_low_t; - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #endif - const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); return _mm256_max_epi16(v_eobmax, v_nz_iscan); } @@ -303,8 +300,7 @@ static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); - const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); - const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); return _mm256_max_epi16(eobmax, nz_iscan); } diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index 4bcadaa6a..0fd0dccc4 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -99,9 +99,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob = _mm_max_epi16(eob, eob1); @@ -174,9 +171,6 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t 
n_coeffs, nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); eob0 = _mm_and_si128(iscan0, nzero_coeff0); eob1 = _mm_and_si128(iscan1, nzero_coeff1); eob0 = _mm_max_epi16(eob0, eob1); diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 680acfec6..ae43a90f8 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -91,8 +91,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 pandn m8, m6 ; m8 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m13 @@ -141,8 +139,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \ pcmpeqw m13, m5 ; m13 = c[i] == 0 mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i] mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m7 ; m11 = scan[i] + 1 pandn m14, m6 ; m14 = max(eob) pandn m13, m11 ; m13 = max(eob) pmaxsw m8, m14 diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index 502a9c972..b9f72a94c 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -101,7 +101,6 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; @@ -119,9 +118,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, @@ -148,9 +145,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); do { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, @@ -234,7 +229,6 @@ void vpx_highbd_quantize_b_32x32_neon( const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -253,9 +247,7 @@ void vpx_highbd_quantize_b_32x32_neon( // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. 
- const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, @@ -279,9 +271,7 @@ void vpx_highbd_quantize_b_32x32_neon( dequant = vdupq_lane_s32(vget_low_s32(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index dcdf588cb..9c227d560 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -75,7 +75,6 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; @@ -88,9 +87,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant, @@ -116,9 +113,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); do { - // Add one because the eob is not its index. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, @@ -226,7 +221,6 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; @@ -240,9 +234,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Process first 8 values which include a dc component. { - // Add one because the eob does not index from 0. - const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, @@ -266,9 +258,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vdupq_lane_s16(vget_low_s16(dequant), 1); for (i = 1; i < 32 * 32 / 8; ++i) { - // Add one because the eob is not its index. 
- const uint16x8_t v_iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); + const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan)); const int16x8_t qcoeff = quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index 2fc33b06b..77be0bb4f 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -59,7 +59,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff, } static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, - __m128i zbin_mask0, __m128i zbin_mask1, const int16_t *scan, int index, __m128i zero) { const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero); @@ -68,8 +67,6 @@ static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1, __m128i scan1 = __lsx_vld(scan + index + 8, 0); __m128i eob0, eob1; - scan0 = __lsx_vsub_h(scan0, zbin_mask0); - scan1 = __lsx_vsub_h(scan1, zbin_mask1); eob0 = __lsx_vandn_v(zero_coeff0, scan0); eob1 = __lsx_vandn_v(zero_coeff1, scan1); return __lsx_vmax_h(eob0, eob1); @@ -138,7 +135,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -161,8 +158,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); index += 16; @@ -221,7 +217,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr); dequant = __lsx_vilvh_d(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero); // AC only loop. 
for (index = 16; index < 32 * 32; index += 16) { coeff0 = __lsx_vld(coeff_ptr + index, 0); @@ -243,8 +239,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero); eob = __lsx_vmax_h(eob, eob0); } diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c index 7cdcbeb40..ab71f6e23 100644 --- a/vpx_dsp/ppc/quantize_vsx.c +++ b/vpx_dsp/ppc/quantize_vsx.c @@ -78,11 +78,10 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); } -static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask, +static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, const int16_t *iscan_ptr, int index) { int16x8_t scan = vec_vsx_ld(index, iscan_ptr); bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); - scan = vec_sub(scan, mask); return vec_andc(scan, zero_coeff); } @@ -139,8 +138,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); if (n_coeffs > 16) { int index = 16; @@ -177,10 +176,9 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); - eob = - vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); index += 24; @@ -252,8 +250,8 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = vec_splat(dequant, 1); // remove DC from dequant vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); - eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), - nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, iscan_ptr, 16)); do { int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; @@ -286,9 +284,9 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); - eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); - eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), - nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, iscan_ptr, off2)); eob = vec_max(eob, eob2); // 24 int16_t is 48 bytes diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 
cbc715c04..8edddd637 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -80,8 +80,7 @@ static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr, _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr)); - const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm); - const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm); + const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm); return _mm256_max_epi16(eobmax, nz_iscan); } @@ -256,4 +255,4 @@ void vpx_highbd_quantize_b_32x32_avx2( } *eob_ptr = get_max_eob(eob); -} \ No newline at end of file +} diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 1264fbed2..ae1981a83 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -25,7 +25,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { - int i, j, non_zero_regs = (int)count / 4, eob_i = -1; + int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; @@ -89,7 +89,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } } } - *eob_ptr = eob_i + 1; + *eob_ptr = eob_i; } void vpx_highbd_quantize_b_32x32_sse2( @@ -102,7 +102,7 @@ void vpx_highbd_quantize_b_32x32_sse2( __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; - int i, eob = -1; + int i, eob = 0; const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; @@ -148,6 +148,6 @@ void vpx_highbd_quantize_b_32x32_sse2( dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; } - *eob_ptr = eob + 1; + *eob_ptr = eob; } #endif diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 706e4e641..7d8352721 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -93,8 +93,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -134,8 +133,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } @@ -229,8 +227,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. 
@@ -272,8 +269,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 6fd517487..28f7c9c7d 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -127,16 +127,13 @@ quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax, __m256i v_mask) { - const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #if CONFIG_VP9_HIGHBITDEPTH - // typedef int32_t tran_low_t; - const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8); - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask); + const __m256i v_iscan = _mm256_permute4x64_epi64( + _mm256_loadu_si256((const __m256i *)iscan), 0xD8); #else - // typedef int16_t tran_low_t; - const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan, v_mask); + const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan); #endif - const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask); + const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask); return _mm256_max_epi16(v_eobmax, v_nz_iscan); } diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 459d95f28..9533e7916 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -76,7 +76,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -106,8 +106,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index afe2f924b..580dd883f 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -62,11 +62,8 @@ static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, #endif // CONFIG_VP9_HIGHBITDEPTH } -// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to -// zbin to add 1 to the index in 'scan'. +// Scan 16 values for eob reference in scan. 
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, - const __m128i zbin_mask0, - const __m128i zbin_mask1, const int16_t *scan, const int index, const __m128i zero) { const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); @@ -74,9 +71,6 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); __m128i eob0, eob1; - // Add one to convert from indices to counts - scan0 = _mm_sub_epi16(scan0, zbin_mask0); - scan1 = _mm_sub_epi16(scan1, zbin_mask1); eob0 = _mm_andnot_si128(zero_coeff0, scan0); eob1 = _mm_andnot_si128(zero_coeff1, scan1); return _mm_max_epi16(eob0, eob1); diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 9d2a88b7b..476230286 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -70,7 +70,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); // AC only loop. while (index < n_coeffs) { @@ -98,8 +98,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); index += 16; @@ -202,8 +201,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dequant = _mm_unpackhi_epi64(dequant, dequant); calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - eob = - scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero); } // AC only loop. @@ -249,8 +247,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, - zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero); eob = _mm_max_epi16(eob, eob0); } -- cgit v1.2.3
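Note (not part of the patch): a minimal scalar sketch of the EOB bookkeeping this change simplifies. Before, the iscan tables held 0-based scan positions, so every SIMD kernel added 1 to each loaded iscan value before taking the running maximum over nonzero coefficients; with the tables pre-incremented, the maximum surviving iscan value is already the EOB. The helper names below are hypothetical, and tran_low_t is modeled as int16_t as in non-high-bitdepth builds.

    #include <stddef.h>
    #include <stdint.h>

    typedef int16_t tran_low_t; /* assumption: low-bitdepth build */

    /* Old tables: iscan[i] is the 0-based scan position of coefficient i. */
    static uint16_t eob_with_zero_based_iscan(const tran_low_t *qcoeff,
                                              const int16_t *iscan, size_t n) {
      int eob = 0;
      for (size_t i = 0; i < n; ++i) {
        /* +1 converts a scan index into an end-of-block count. */
        if (qcoeff[i] != 0 && iscan[i] + 1 > eob) eob = iscan[i] + 1;
      }
      return (uint16_t)eob;
    }

    /* New tables: iscan[i] already stores position + 1, so no adjustment
     * is needed before taking the maximum over nonzero coefficients. */
    static uint16_t eob_with_one_based_iscan(const tran_low_t *qcoeff,
                                             const int16_t *iscan, size_t n) {
      int eob = 0;
      for (size_t i = 0; i < n; ++i) {
        if (qcoeff[i] != 0 && iscan[i] > eob) eob = iscan[i];
      }
      return (uint16_t)eob;
    }

Vectorized, the second form is the mask-and-max pattern that remains in the kernels above once the psubw / vaddq_s16 / _mm256_sub_epi16 adjustments are removed.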