diff options
Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/vp9_blockd.h | 5 | ||||
-rw-r--r-- | vp9/common/vp9_scan.c | 56 | ||||
-rw-r--r-- | vp9/common/vp9_scan.h | 10 | ||||
-rw-r--r-- | vp9/common/x86/vp9_idct_intrin_sse2.c | 83 |
4 files changed, 111 insertions, 43 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 50cb98c92..121947b7e 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -381,11 +381,6 @@ static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize, const int stride = 4 << b_width_log2(plane_bsize); return base + raster_block_offset(plane_bsize, raster_block, stride); } -static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize, - int raster_block, uint8_t *base, - int stride) { - return base + raster_block_offset(plane_bsize, raster_block, stride); -} static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int block, diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c index f17da9110..f62150fd4 100644 --- a/vp9/common/vp9_scan.c +++ b/vp9/common/vp9_scan.c @@ -266,6 +266,62 @@ DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]); DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]); DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]); +const scan_order inter_scan_orders[TX_SIZES] = { + {vp9_default_scan_4x4, vp9_default_scan_4x4_neighbors}, // NEWMV + {vp9_default_scan_8x8, vp9_default_scan_8x8_neighbors}, // NEWMV + {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors}, // NEWMV + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // NEWMV +}; + +const scan_order intra_scan_orders[TX_SIZES][INTRA_MODES] = { + { // 4X4 + {vp9_default_scan_4x4, vp9_default_scan_4x4_neighbors}, // DC + {vp9_row_scan_4x4, vp9_row_scan_4x4_neighbors}, // V + {vp9_col_scan_4x4, vp9_col_scan_4x4_neighbors}, // H + {vp9_default_scan_4x4, vp9_default_scan_4x4_neighbors}, // D45 + {vp9_default_scan_4x4, vp9_default_scan_4x4_neighbors}, // D135 + {vp9_row_scan_4x4, vp9_row_scan_4x4_neighbors}, // D117 + {vp9_col_scan_4x4, vp9_col_scan_4x4_neighbors}, // D153 + {vp9_col_scan_4x4, vp9_col_scan_4x4_neighbors}, // D207 + {vp9_row_scan_4x4, vp9_row_scan_4x4_neighbors}, // D63 + {vp9_default_scan_4x4, vp9_default_scan_4x4_neighbors}, // TM + }, { // 8x8 + {vp9_default_scan_8x8, vp9_default_scan_8x8_neighbors}, // DC + {vp9_row_scan_8x8, vp9_row_scan_8x8_neighbors}, // V + {vp9_col_scan_8x8, vp9_col_scan_8x8_neighbors}, // H + {vp9_default_scan_8x8, vp9_default_scan_8x8_neighbors}, // D45 + {vp9_default_scan_8x8, vp9_default_scan_8x8_neighbors}, // D135 + {vp9_row_scan_8x8, vp9_row_scan_8x8_neighbors}, // D117 + {vp9_col_scan_8x8, vp9_col_scan_8x8_neighbors}, // D153 + {vp9_col_scan_8x8, vp9_col_scan_8x8_neighbors}, // D207 + {vp9_row_scan_8x8, vp9_row_scan_8x8_neighbors}, // D63 + {vp9_default_scan_8x8, vp9_default_scan_8x8_neighbors}, // TM + }, { // 16x16 + {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors}, // DC + {vp9_row_scan_16x16, vp9_row_scan_16x16_neighbors}, // V + {vp9_col_scan_16x16, vp9_col_scan_16x16_neighbors}, // H + {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors}, // D45 + {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors}, // D135 + {vp9_row_scan_16x16, vp9_row_scan_16x16_neighbors}, // D117 + {vp9_col_scan_16x16, vp9_col_scan_16x16_neighbors}, // D153 + {vp9_col_scan_16x16, vp9_col_scan_16x16_neighbors}, // D207 + {vp9_row_scan_16x16, vp9_row_scan_16x16_neighbors}, // D63 + {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors}, // TM + }, { // 32x32 + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // DC + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // V + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // H + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // D45 + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // D135 + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // D117 + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // D153 + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // D207 + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // D63 + {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors}, // TM + } +}; + + static int find_in_scan(const int16_t *scan, int l, int idx) { int n, l2 = l * l; for (n = 0; n < l2; n++) { diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index 14a1a7eb0..98fc607ca 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -15,6 +15,7 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_enums.h" +#include "vp9/common/vp9_blockd.h" #define MAX_NEIGHBORS 2 @@ -67,9 +68,16 @@ extern DECLARE_ALIGNED(16, int16_t, extern DECLARE_ALIGNED(16, int16_t, vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); - void vp9_init_neighbors(); +typedef struct { + const int16_t *scan; + const int16_t *neighbors; +} scan_order; + +extern const scan_order intra_scan_orders[TX_SIZES][INTRA_MODES]; +extern const scan_order inter_scan_orders[TX_SIZES]; + static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) { switch (tx_type) { case ADST_DCT: diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 2a3384488..c65184f9c 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -174,12 +174,10 @@ void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { static INLINE void transpose_4x4(__m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]); - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); } static void idct4_1d_sse2(__m128i *in) { @@ -192,8 +190,8 @@ static void idct4_1d_sse2(__m128i *in) { transpose_4x4(in); // stage 1 - u[0] = _mm_unpacklo_epi16(in[0], in[2]); - u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); @@ -209,16 +207,13 @@ static void idct4_1d_sse2(__m128i *in) { v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[0] = _mm_packs_epi32(v[0], v[2]); - u[1] = _mm_packs_epi32(v[1], v[3]); - u[2] = _mm_unpackhi_epi64(u[0], u[0]); - u[3] = _mm_unpackhi_epi64(u[1], u[1]); + u[0] = _mm_packs_epi32(v[0], v[1]); + u[1] = _mm_packs_epi32(v[3], v[2]); // stage 2 - in[0] = _mm_add_epi16(u[0], u[3]); - in[1] = _mm_add_epi16(u[1], u[2]); - in[2] = _mm_sub_epi16(u[1], u[2]); - in[3] = _mm_sub_epi16(u[0], u[3]); + in[0] = _mm_add_epi16(u[0], u[1]); + in[1] = _mm_sub_epi16(u[0], u[1]); + in[1] = _mm_shuffle_epi32(in[1], 0x4E); } static void iadst4_1d_sse2(__m128i *in) { @@ -232,13 +227,14 @@ static void iadst4_1d_sse2(__m128i *in) { __m128i u[8], v[8], in7; transpose_4x4(in); - in7 = _mm_add_epi16(in[0], in[3]); - in7 = _mm_sub_epi16(in7, in[2]); + in7 = _mm_srli_si128(in[1], 8); + in7 = _mm_add_epi16(in7, in[0]); + in7 = _mm_sub_epi16(in7, in[1]); - u[0] = _mm_unpacklo_epi16(in[0], in[2]); - u[1] = _mm_unpacklo_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[0], in[1]); + u[1] = _mm_unpackhi_epi16(in[0], in[1]); u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[1], kZero); + u[3] = _mm_unpackhi_epi16(in[0], kZero); v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 @@ -265,22 +261,18 @@ static void iadst4_1d_sse2(__m128i *in) { u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - in[2] = _mm_unpackhi_epi64(in[0], in[0]); - in[3] = _mm_unpackhi_epi64(in[1], in[1]); + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); } void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { - __m128i in[4]; + __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); - in[0] = _mm_loadl_epi64((const __m128i *)input); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 4)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 8)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 12)); + in[0]= _mm_loadu_si128((const __m128i *)(input)); + in[1]= _mm_loadu_si128((const __m128i *)(input + 8)); switch (tx_type) { case 0: // DCT_DCT @@ -307,18 +299,35 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, // Final round and shift in[0] = _mm_add_epi16(in[0], eight); in[1] = _mm_add_epi16(in[1], eight); - in[2] = _mm_add_epi16(in[2], eight); - in[3] = _mm_add_epi16(in[3], eight); in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); - in[2] = _mm_srai_epi16(in[2], 4); - in[3] = _mm_srai_epi16(in[3], 4); - RECON_AND_STORE4X4(dest, in[0]); - RECON_AND_STORE4X4(dest, in[1]); - RECON_AND_STORE4X4(dest, in[2]); - RECON_AND_STORE4X4(dest, in[3]); + // Reconstruction and Store + { + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); + __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); + d0 = _mm_unpacklo_epi32(d0, + _mm_cvtsi32_si128(*(const int *) (dest + stride))); + d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128( + *(const int *) (dest + stride * 3))); + d0 = _mm_unpacklo_epi8(d0, zero); + d2 = _mm_unpacklo_epi8(d2, zero); + d0 = _mm_add_epi16(d0, in[0]); + d2 = _mm_add_epi16(d2, in[1]); + d0 = _mm_packus_epi16(d0, d2); + // store result[0] + *(int *)dest = _mm_cvtsi128_si32(d0); + // store result[1] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); + // store result[2] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); + // store result[3] + d0 = _mm_srli_si128(d0, 4); + *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); + } } #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ |