diff options
-rw-r--r-- | test/external_frame_buffer_test.cc | 2 | ||||
-rw-r--r-- | vp9/common/vp9_alloccommon.c | 3 | ||||
-rw-r--r-- | vp9/common/vp9_onyxc_int.h | 1 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodeframe.c | 2 | ||||
-rw-r--r-- | vp9/decoder/vp9_decoder.c | 6 | ||||
-rw-r--r-- | vp9/decoder/vp9_decoder.h | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 141 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_speed_features.c | 3 | ||||
-rw-r--r-- | vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 60 |
10 files changed, 121 insertions, 103 deletions
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index 045d579e0..dbf297119 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc @@ -499,7 +499,7 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) { release_vp9_frame_buffer)); } -TEST_F(ExternalFrameBufferNonRefTest, DISABLED_ReleaseNonRefFrameBuffer) { +TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; ASSERT_EQ(VPX_CODEC_OK, SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index c9aca89d1..7345e259b 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -65,10 +65,11 @@ void vp9_free_ref_frame_buffers(BufferPool *pool) { int i; for (i = 0; i < FRAME_BUFFERS; ++i) { - if (pool->frame_bufs[i].ref_count > 0 && + if (!pool->frame_bufs[i].released && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); pool->frame_bufs[i].ref_count = 0; + pool->frame_bufs[i].released = 1; } vpx_free(pool->frame_bufs[i].mvs); pool->frame_bufs[i].mvs = NULL; diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h index 11eae4ba6..1d96d92c2 100644 --- a/vp9/common/vp9_onyxc_int.h +++ b/vp9/common/vp9_onyxc_int.h @@ -69,6 +69,7 @@ typedef struct { MV_REF *mvs; int mi_rows; int mi_cols; + uint8_t released; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; } RefCntBuffer; diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 0e4605abb..9452ec4dc 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1184,6 +1184,7 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { "Failed to allocate frame buffer"); } + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1267,6 +1268,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, "Failed to allocate frame buffer"); } + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 6adbee78f..a913fa560 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -287,9 +287,13 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, pbi->ready_for_new_data = 0; // Check if the previous frame was a frame without any references to it. - if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0) + if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0 && + !frame_bufs[cm->new_fb_idx].released) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + frame_bufs[cm->new_fb_idx].released = 1; + } + // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) { diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 2040cee81..4b26c314d 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -117,9 +117,10 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, // But the private buffer is not set up until finish decoding header. // So any error happens during decoding header, the frame_bufs will not // have valid priv buffer. - if (frame_bufs[idx].ref_count == 0 && + if (!frame_bufs[idx].released && frame_bufs[idx].ref_count == 0 && frame_bufs[idx].raw_frame_buffer.priv) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); + frame_bufs[idx].released = 1; } } } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f92705a81..e3ec823d2 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -929,9 +929,8 @@ static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, return 0; } -static void copy_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, - BLOCK_SIZE bsize, int mi_rowref, - int mi_colref, int mi_row, int mi_col, +static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, int mi_row_high, int mi_col_high) { VP9_COMMON *const cm = &cpi->common; SVC *const svc = &cpi->svc; @@ -939,62 +938,61 @@ static void copy_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, // Variables with _high are for higher resolution. int bsize_high = 0; int subsize_high = 0; - int bsl_high = 0; - int bs_high = 0; - int shift_row = 0; - int shift_col = 0; + const int bsl_high = b_width_log2_lookup[bsize]; + const int bs_high = (1 << bsl_high) >> 2; + const int has_rows = (mi_row_high + bs_high) < cm->mi_rows; + const int has_cols = (mi_col_high + bs_high) < cm->mi_cols; + + const int row_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0 + }; + const int col_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0 + }; + int start_pos; + BLOCK_SIZE bsize_low; + PARTITION_TYPE partition_high; + + if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0; + if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return 0; // Find corresponding (mi_col/mi_row) block down-scaled by 2x2. - int start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; - - const int bsl = b_width_log2_lookup[bsize]; - const int bs = (1 << bsl) >> 2; - BLOCK_SIZE subsize; - PARTITION_TYPE partition; - - const int bw = b_width_log2_lookup[bsize]; - const int bh = b_height_log2_lookup[bsize]; - // For block size >= 32x32 shift_row/col can stay 0. - if (bw == 1) - shift_col = 3; - else if (bw == 2) - shift_col = 2; - if (bh == 1) - shift_row = 3; - else if (bh == 2) - shift_row = 2; - - if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return; - if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return; - - if ((mi_row - mi_rowref > shift_row) || (mi_col - mi_colref > shift_col)) - return; - - partition = partition_lookup[bsl][prev_part[start_pos]]; - subsize = get_subsize(bsize, partition); + start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; + bsize_low = prev_part[start_pos]; + // The block size is too big for boundaries. Do variance based partitioning. + if ((!has_rows || !has_cols) && bsize_low > BLOCK_16X16) return 1; // Scale up block size by 2x2. Force 64x64 for size larger than 32x32. - if (bsize < BLOCK_32X32) { - bsize_high = bsize + 3; - subsize_high = subsize + 3; - } else if (bsize >= BLOCK_32X32) { + if (bsize_low < BLOCK_32X32) { + bsize_high = bsize_low + 3; + } else if (bsize_low >= BLOCK_32X32) { bsize_high = BLOCK_64X64; - subsize_high = BLOCK_64X64; } - bsl_high = b_width_log2_lookup[bsize_high]; - bs_high = (1 << bsl_high) / 4; + // Scale up blocks on boundary. + if (!has_cols && has_rows) { + bsize_high = bsize_low + row_boundary_block_scale_factor[bsize_low]; + } else if (has_cols && !has_rows) { + bsize_high = bsize_low + col_boundary_block_scale_factor[bsize_low]; + } else if (!has_cols && !has_rows) { + bsize_high = bsize_low; + } - if (subsize < BLOCK_8X8) { + partition_high = partition_lookup[bsl_high][bsize_high]; + subsize_high = get_subsize(bsize, partition_high); + + if (subsize_high < BLOCK_8X8) { set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); } else { - switch (partition) { + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + switch (partition_high) { case PARTITION_NONE: set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); break; case PARTITION_HORZ: set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); if (subsize_high < BLOCK_64X64) - set_block_size(cpi, x, xd, mi_row_high + bs_high, mi_col, + set_block_size(cpi, x, xd, mi_row_high + bs_high, mi_col_high, subsize_high); break; case PARTITION_VERT: @@ -1004,20 +1002,26 @@ static void copy_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, subsize_high); break; case PARTITION_SPLIT: - copy_partitioning_svc(cpi, x, xd, subsize, mi_rowref, mi_colref, mi_row, - mi_col, mi_row_high, mi_col_high); - copy_partitioning_svc(cpi, x, xd, subsize, mi_rowref, mi_colref, - mi_row + bs, mi_col, mi_row_high + bs_high, - mi_col_high); - copy_partitioning_svc(cpi, x, xd, subsize, mi_rowref, mi_colref, mi_row, - mi_col + bs, mi_row_high, mi_col_high + bs_high); - copy_partitioning_svc(cpi, x, xd, subsize, mi_rowref, mi_colref, - mi_row + bs, mi_col + bs, mi_row_high + bs_high, - mi_col_high + bs_high); + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, + mi_row_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col, mi_row_high + bs_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, + mi_col + (bs >> 1), mi_row_high, + mi_col_high + bs_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col + (bs >> 1), mi_row_high + bs_high, + mi_col_high + bs_high)) + return 1; break; default: assert(0); } } + + return 0; } static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, @@ -1255,22 +1259,21 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->content_state_sb_fd != NULL) x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; - // For SVC on top spatial layer and non_reference frame: copy partition - // from lower spatial resolution if svc_use_lowres_part is enabled. - // TODO(jianj): Fix to allow it to work on boundary. - if (cpi->sf.svc_use_lowres_part && cpi->svc.spatial_layer_id == 2 && - cpi->svc.non_reference_frame && cpi->svc.prev_partition_svc != NULL && - mi_row < cm->mi_rows - 8 && mi_col < cm->mi_cols - 8 && - content_state != kVeryHighSad) { - copy_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1, mi_col >> 1, - mi_row >> 1, mi_col >> 1, mi_row, mi_col); - return 0; + // For SVC on top spatial layer: use/scale the partition from + // the lower spatial resolution if svc_use_lowres_part is enabled. + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) { + if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1, + mi_col >> 1, mi_row, mi_col)) + return 0; } // If source_sad is low copy the partition without computing the y_sad. if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { x->sb_use_mv_part = 1; - if (cpi->sf.svc_use_lowres_part && cpi->svc.spatial_layer_id == 1) + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); return 0; } @@ -1396,7 +1399,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64); x->variance_low[0] = 1; chroma_check(cpi, x, bsize, y_sad, is_key_frame); - if (cpi->sf.svc_use_lowres_part && cpi->svc.spatial_layer_id == 1) + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); return 0; } @@ -1409,7 +1413,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy && copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { chroma_check(cpi, x, bsize, y_sad, is_key_frame); - if (cpi->sf.svc_use_lowres_part && cpi->svc.spatial_layer_id == 1) + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); return 0; } @@ -1634,7 +1639,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (cm->frame_type != KEY_FRAME && cpi->sf.svc_use_lowres_part && - cpi->svc.spatial_layer_id == 1) + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); if (cpi->sf.short_circuit_low_temp_var) { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 54f040453..782e1b3b3 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -3553,7 +3553,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); - if (cpi->sf.svc_use_lowres_part && cpi->svc.spatial_layer_id == 1) { + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) { if (cpi->svc.prev_partition_svc == NULL) { CHECK_MEM_ERROR( cm, cpi->svc.prev_partition_svc, diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 14df8daba..40b99b4e7 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -607,7 +607,10 @@ static void set_rt_speed_feature_framesize_independent( } // For SVC: enable use of lower resolution partition for higher resolution, // only for 3 spatial layers and when config/top resolution is above VGA. + // Enable only for top temporal enhancement layer (which are non-reference + // frames for the fixed SVC patterns). if (cpi->use_svc && cpi->svc.number_spatial_layers == 3 && + cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1 && cpi->oxcf.width * cpi->oxcf.height > 640 * 480) sf->svc_use_lowres_part = 1; } diff --git a/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm index b433874f2..32824a03a 100644 --- a/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -31,8 +31,8 @@ SECTION .text INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m12, [pw_11585x2] + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -92,10 +92,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; sin(pi / 8), cos(pi / 8) punpcklwd m2, m10, m9 punpckhwd m10, m9 - pmaddwd m5, m2, [pw_15137_6270] - pmaddwd m2, [pw_6270_m15137] - pmaddwd m9, m10, [pw_15137_6270] - pmaddwd m10, [pw_6270_m15137] + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] paddd m5, m8 paddd m2, m8 paddd m9, m8 @@ -120,10 +120,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; sin(pi / 16), cos(pi / 16) punpcklwd m1, m10, m9 punpckhwd m10, m9 - pmaddwd m7, m1, [pw_16069_3196] - pmaddwd m1, [pw_3196_m16069] - pmaddwd m9, m10, [pw_16069_3196] - pmaddwd m10, [pw_3196_m16069] + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] paddd m7, m8 paddd m1, m8 paddd m9, m8 @@ -138,10 +138,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; sin(3 * pi / 16), cos(3 * pi / 16) punpcklwd m11, m0, m3 punpckhwd m0, m3 - pmaddwd m9, m11, [pw_9102_13623] - pmaddwd m11, [pw_13623_m9102] - pmaddwd m3, m0, [pw_9102_13623] - pmaddwd m0, [pw_13623_m9102] + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] paddd m9, m8 paddd m11, m8 paddd m3, m8 @@ -211,10 +211,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; stage 3 punpcklwd m6, m1, m3 punpckhwd m1, m3 - pmaddwd m2, m6, [pw_11585_11585] - pmaddwd m6, [pw_11585_m11585] - pmaddwd m3, m1, [pw_11585_11585] - pmaddwd m1, [pw_11585_m11585] + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] paddd m2, m8 paddd m6, m8 paddd m3, m8 @@ -231,10 +231,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride punpcklwd m3, m5, m4 punpckhwd m5, m4 - pmaddwd m1, m3, [pw_15137_6270] - pmaddwd m3, [pw_6270_m15137] - pmaddwd m4, m5, [pw_15137_6270] - pmaddwd m5, [pw_6270_m15137] + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] paddd m1, m8 paddd m3, m8 paddd m4, m8 @@ -255,10 +255,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; stage 4 punpcklwd m9, m5, m4 punpckhwd m5, m4 - pmaddwd m7, m9, [pw_16069_3196] - pmaddwd m9, [pw_3196_m16069] - pmaddwd m4, m5, [pw_16069_3196] - pmaddwd m5, [pw_3196_m16069] + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] paddd m7, m8 paddd m9, m8 paddd m4, m8 @@ -272,10 +272,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride punpcklwd m4, m10, m0 punpckhwd m10, m0 - pmaddwd m5, m4, [pw_9102_13623] - pmaddwd m4, [pw_13623_m9102] - pmaddwd m0, m10, [pw_9102_13623] - pmaddwd m10, [pw_13623_m9102] + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] paddd m5, m8 paddd m4, m8 paddd m0, m8 |