summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--vp8/encoder/x86/quantize_ssse3.c2
-rw-r--r--vp9/common/x86/vp9_loopfilter_intrin_sse2.c105
-rw-r--r--vp9/decoder/vp9_decodeframe.c133
3 files changed, 160 insertions, 80 deletions
diff --git a/vp8/encoder/x86/quantize_ssse3.c b/vp8/encoder/x86/quantize_ssse3.c
index 448217ff4..14282db80 100644
--- a/vp8/encoder/x86/quantize_ssse3.c
+++ b/vp8/encoder/x86/quantize_ssse3.c
@@ -17,7 +17,7 @@
#include <intrin.h>
#pragma intrinsic(_BitScanReverse)
static int bsr(int mask) {
- int eob;
+ unsigned long eob;
_BitScanReverse(&eob, mask);
eob++;
if (mask == 0)
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index e321dbebe..fe8af5463 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -1333,43 +1333,47 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i x8, x9, x10, x11, x12, x13, x14, x15;
- // Read in 16 lines
- x0 = _mm_loadl_epi64((__m128i *)in0);
- x8 = _mm_loadl_epi64((__m128i *)in1);
- x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
- x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
- x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
- x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
- x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
- x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
- x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
- x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
- x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
- x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
- x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
- x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
- x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
- x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));
-
- x0 = _mm_unpacklo_epi8(x0, x1);
- x1 = _mm_unpacklo_epi8(x2, x3);
- x2 = _mm_unpacklo_epi8(x4, x5);
- x3 = _mm_unpacklo_epi8(x6, x7);
-
- x8 = _mm_unpacklo_epi8(x8, x9);
- x9 = _mm_unpacklo_epi8(x10, x11);
- x10 = _mm_unpacklo_epi8(x12, x13);
- x11 = _mm_unpacklo_epi8(x14, x15);
-
- x4 = _mm_unpacklo_epi16(x0, x1);
- x5 = _mm_unpacklo_epi16(x2, x3);
- x12 = _mm_unpacklo_epi16(x8, x9);
- x13 = _mm_unpacklo_epi16(x10, x11);
-
- x6 = _mm_unpacklo_epi32(x4, x5);
- x7 = _mm_unpackhi_epi32(x4, x5);
- x14 = _mm_unpacklo_epi32(x12, x13);
- x15 = _mm_unpackhi_epi32(x12, x13);
+ // 2-way interleave w/hoisting of unpacks
+ x0 = _mm_loadl_epi64((__m128i *)in0); // 1
+ x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); // 3
+ x0 = _mm_unpacklo_epi8(x0, x1); // 1
+
+ x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); // 5
+ x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p)); // 7
+ x1 = _mm_unpacklo_epi8(x2, x3); // 2
+
+ x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p)); // 9
+ x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p)); // 11
+ x2 = _mm_unpacklo_epi8(x4, x5); // 3
+
+ x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p)); // 13
+ x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p)); // 15
+ x3 = _mm_unpacklo_epi8(x6, x7); // 4
+ x4 = _mm_unpacklo_epi16(x0, x1); // 9
+
+ x8 = _mm_loadl_epi64((__m128i *)in1); // 2
+ x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); // 4
+ x8 = _mm_unpacklo_epi8(x8, x9); // 5
+ x5 = _mm_unpacklo_epi16(x2, x3); // 10
+
+ x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); // 6
+ x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p)); // 8
+ x9 = _mm_unpacklo_epi8(x10, x11); // 6
+
+ x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p)); // 10
+ x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p)); // 12
+ x10 = _mm_unpacklo_epi8(x12, x13); // 7
+ x12 = _mm_unpacklo_epi16(x8, x9); // 11
+
+ x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p)); // 14
+ x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p)); // 16
+ x11 = _mm_unpacklo_epi8(x14, x15); // 8
+ x13 = _mm_unpacklo_epi16(x10, x11); // 12
+
+ x6 = _mm_unpacklo_epi32(x4, x5); // 13
+ x7 = _mm_unpackhi_epi32(x4, x5); // 14
+ x14 = _mm_unpacklo_epi32(x12, x13); // 15
+ x15 = _mm_unpackhi_epi32(x12, x13); // 16
// Store first 4-line result
_mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
@@ -1405,33 +1409,36 @@ static INLINE void transpose(unsigned char *src[], int in_p,
x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p)); // 00 01 02 03 04 05 06 07
x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p)); // 10 11 12 13 14 15 16 17
- x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
- x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
- x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
- x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
- x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
- x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
// 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p)); // 20 21 22 23 24 25 26 27
+ x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p)); // 30 31 32 33 34 35 36 37
// 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p)); // 40 41 42 43 44 45 46 47
+ x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p)); // 50 51 52 53 54 55 56 57
// 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p)); // 60 61 62 63 64 65 66 67
+ x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p)); // 70 71 72 73 74 75 76 77
// 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
x3 = _mm_unpacklo_epi8(x6, x7);
+
// 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
x4 = _mm_unpacklo_epi16(x0, x1);
// 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
x5 = _mm_unpacklo_epi16(x2, x3);
// 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
x6 = _mm_unpacklo_epi32(x4, x5);
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- x7 = _mm_unpackhi_epi32(x4, x5);
-
_mm_storel_pd((double *)(out + 0*out_p),
_mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70
_mm_storeh_pd((double *)(out + 1*out_p),
_mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi32(x4, x5);
_mm_storel_pd((double *)(out + 2*out_p),
_mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72
_mm_storeh_pd((double *)(out + 3*out_p),
@@ -1443,13 +1450,13 @@ static INLINE void transpose(unsigned char *src[], int in_p,
x5 = _mm_unpackhi_epi16(x2, x3);
// 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
x6 = _mm_unpacklo_epi32(x4, x5);
- // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- x7 = _mm_unpackhi_epi32(x4, x5);
-
_mm_storel_pd((double *)(out + 4*out_p),
_mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74
_mm_storeh_pd((double *)(out + 5*out_p),
_mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
_mm_storel_pd((double *)(out + 6*out_p),
_mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76
_mm_storeh_pd((double *)(out + 7*out_p),
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 4cdfe8762..142e865d3 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -182,35 +182,119 @@ static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) {
}
}
-static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
- TX_SIZE tx_size, uint8_t *dst, int stride,
- int eob) {
+static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,  // Runs the vp9_*_add inverse-transform routine selected by tx_size on this plane's dqcoeff, then zeroes the dqcoeff entries.
+ const TX_SIZE tx_size,  // selects which idct routine is called (ignored when xd->lossless is set)
+ uint8_t *dst, int stride,  // passed through unchanged to the transform routine
+ int eob) {  // eob <= 0 makes the whole function a no-op
+ struct macroblockd_plane *const pd = &xd->plane[plane];  // only pd->dqcoeff is read from this
+ if (eob > 0) {  // everything below is skipped when eob is 0 or negative
+ tran_low_t *const dqcoeff = pd->dqcoeff;  // buffer used by both the transform call and the clearing code at the end
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {  // flag set on xd->cur_buf: use the vp9_highbd_* variants, which additionally take xd->bd
+ if (xd->lossless) {  // lossless always uses the iwht4x4 routine, regardless of tx_size
+ vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+ } else {
+ switch (tx_size) {  // no tx_type here: every case calls a plain idct routine
+ case TX_4X4:
+ vp9_highbd_idct4x4_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_8X8:
+ vp9_highbd_idct8x8_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_16X16:
+ vp9_highbd_idct16x16_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ case TX_32X32:
+ vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
+ break;
+ default:
+ assert(0 && "Invalid transform size");  // NOTE(review): no return here, unlike the two other default cases in this function; with asserts compiled out, control continues to the dqcoeff clearing code
+ }
+ }
+ } else {  // flag not set: same dispatch using the non-highbd routines
+ if (xd->lossless) {
+ vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+ } else {
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_idct4x4_add(dqcoeff, dst, stride, eob);
+ break;
+ case TX_8X8:
+ vp9_idct8x8_add(dqcoeff, dst, stride, eob);
+ break;
+ case TX_16X16:
+ vp9_idct16x16_add(dqcoeff, dst, stride, eob);
+ break;
+ case TX_32X32:
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ return;  // leaves the function without running the dqcoeff clearing code below
+ }
+ }
+ }
+#else
+ if (xd->lossless) {  // build without CONFIG_VP9_HIGHBITDEPTH: only the non-highbd dispatch exists
+ vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
+ } else {
+ switch (tx_size) {
+ case TX_4X4:
+ vp9_idct4x4_add(dqcoeff, dst, stride, eob);
+ break;
+ case TX_8X8:
+ vp9_idct8x8_add(dqcoeff, dst, stride, eob);
+ break;
+ case TX_16X16:
+ vp9_idct16x16_add(dqcoeff, dst, stride, eob);
+ break;
+ case TX_32X32:
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
+ break;
+ default:
+ assert(0 && "Invalid transform size");
+ return;  // leaves the function without running the dqcoeff clearing code below
+ }
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ if (eob == 1) {  // eob == 1: only dqcoeff[0] is reset
+ dqcoeff[0] = 0;
+ } else {  // eob > 1: the number of entries zeroed depends on tx_size and eob
+ if (tx_size <= TX_16X16 && eob <= 10)
+ memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));  // zeroes the first 4 * (4 << tx_size) entries
+ else if (tx_size == TX_32X32 && eob <= 34)
+ memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));  // zeroes the first 256 entries
+ else
+ memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));  // zeroes 16 << (2 * tx_size) entries
+ }
+ }
+}
+
+static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
+ const TX_TYPE tx_type,
+ const TX_SIZE tx_size,
+ uint8_t *dst, int stride,
+ int eob) {
struct macroblockd_plane *const pd = &xd->plane[plane];
if (eob > 0) {
- TX_TYPE tx_type = DCT_DCT;
tran_low_t *const dqcoeff = pd->dqcoeff;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
if (xd->lossless) {
- tx_type = DCT_DCT;
vp9_highbd_iwht4x4_add(dqcoeff, dst, stride, eob, xd->bd);
} else {
- const PLANE_TYPE plane_type = pd->plane_type;
switch (tx_size) {
case TX_4X4:
- tx_type = get_tx_type_4x4(plane_type, xd, block);
vp9_highbd_iht4x4_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
break;
case TX_8X8:
- tx_type = get_tx_type(plane_type, xd);
vp9_highbd_iht8x8_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
break;
case TX_16X16:
- tx_type = get_tx_type(plane_type, xd);
vp9_highbd_iht16x16_add(tx_type, dqcoeff, dst, stride, eob, xd->bd);
break;
case TX_32X32:
- tx_type = DCT_DCT;
vp9_highbd_idct32x32_add(dqcoeff, dst, stride, eob, xd->bd);
break;
default:
@@ -219,25 +303,19 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
}
} else {
if (xd->lossless) {
- tx_type = DCT_DCT;
vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
} else {
- const PLANE_TYPE plane_type = pd->plane_type;
switch (tx_size) {
case TX_4X4:
- tx_type = get_tx_type_4x4(plane_type, xd, block);
vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_8X8:
- tx_type = get_tx_type(plane_type, xd);
vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_16X16:
- tx_type = get_tx_type(plane_type, xd);
vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_32X32:
- tx_type = DCT_DCT;
vp9_idct32x32_add(dqcoeff, dst, stride, eob);
break;
default:
@@ -248,25 +326,19 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
}
#else
if (xd->lossless) {
- tx_type = DCT_DCT;
vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
} else {
- const PLANE_TYPE plane_type = pd->plane_type;
switch (tx_size) {
case TX_4X4:
- tx_type = get_tx_type_4x4(plane_type, xd, block);
vp9_iht4x4_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_8X8:
- tx_type = get_tx_type(plane_type, xd);
vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_16X16:
- tx_type = get_tx_type(plane_type, xd);
vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_32X32:
- tx_type = DCT_DCT;
vp9_idct32x32_add(dqcoeff, dst, stride, eob);
break;
default:
@@ -277,7 +349,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
#endif // CONFIG_VP9_HIGHBITDEPTH
if (eob == 1) {
- memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
+ dqcoeff[0] = 0;
} else {
if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
@@ -315,14 +387,15 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
x, y, plane);
if (!mi->mbmi.skip) {
+ const TX_TYPE tx_type = (plane || xd->lossless) ?
+ DCT_DCT : intra_mode_to_tx_type_lookup[mode];
const scan_order *sc = (plane || xd->lossless) ?
- &vp9_default_scan_orders[tx_size] :
- &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
+ &vp9_default_scan_orders[tx_size] : &vp9_scan_orders[tx_size][tx_type];
const int eob = vp9_decode_block_tokens(xd, plane, sc,
plane_bsize, x, y, tx_size,
args->r, args->seg_id);
- inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
- eob);
+ inverse_transform_block_intra(xd, plane, tx_type, tx_size,
+ dst, pd->dst.stride, eob);
}
}
@@ -344,9 +417,9 @@ static void reconstruct_inter_block(int plane, int block,
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
eob = vp9_decode_block_tokens(xd, plane, sc, plane_bsize,
x, y, tx_size, args->r, args->seg_id);
- inverse_transform_block(xd, plane, block, tx_size,
- &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
- pd->dst.stride, eob);
+ inverse_transform_block_inter(xd, plane, tx_size,
+ &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
+ pd->dst.stride, eob);
*args->eobtotal += eob;
}