diff options
author | John Koleszar <jkoleszar@google.com> | 2013-03-01 20:50:33 -0800 |
---|---|---|
committer | Gerrit Code Review <gerrit@gerrit.golo.chromium.org> | 2013-03-01 20:50:33 -0800 |
commit | 2d3e879fccef330d89f5e5ae6c718cb37d888d2a (patch) | |
tree | 748d939cd5d861745ea7c7f2e08e38794415027e /vp8/encoder | |
parent | e189edfeb11d3433c8ca3c35fe8d5610ef2444f3 (diff) | |
parent | 1cfc86ebe041b3894d33b50d54db94dcb7ea3684 (diff) | |
download | libvpx-2d3e879fccef330d89f5e5ae6c718cb37d888d2a.tar libvpx-2d3e879fccef330d89f5e5ae6c718cb37d888d2a.tar.gz libvpx-2d3e879fccef330d89f5e5ae6c718cb37d888d2a.tar.bz2 libvpx-2d3e879fccef330d89f5e5ae6c718cb37d888d2a.zip |
Merge changes If5896507,I06b5ba5c,I2712f99e into experimental
* changes:
Add unit test for x4 multi-SAD functions
Add VP9 1 block SAD functions to unit test
Merge master branch into experimental
Diffstat (limited to 'vp8/encoder')
-rw-r--r-- | vp8/encoder/firstpass.c | 20 | ||||
-rw-r--r-- | vp8/encoder/mcomp.c | 3 | ||||
-rw-r--r-- | vp8/encoder/onyx_if.c | 1 | ||||
-rw-r--r-- | vp8/encoder/onyx_int.h | 2 | ||||
-rw-r--r-- | vp8/encoder/pickinter.c | 4 | ||||
-rw-r--r-- | vp8/encoder/ratectrl.c | 8 | ||||
-rw-r--r-- | vp8/encoder/rdopt.c | 4 | ||||
-rw-r--r-- | vp8/encoder/x86/dct_sse2.asm | 4 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_sse2.asm | 147 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_sse2.c | 103 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_sse4.asm | 6 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_ssse3.asm | 6 | ||||
-rw-r--r-- | vp8/encoder/x86/temporal_filter_apply_sse2.asm | 2 |
13 files changed, 140 insertions, 170 deletions
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 68095ca68..433726df6 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -858,7 +858,9 @@ skip_motion_search: */ if ((cm->current_video_frame > 0) && (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) && - ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0)) + ((cpi->twopass.this_frame_stats.intra_error / + DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) > + 2.0)) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); } @@ -2116,23 +2118,25 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) (cpi->twopass.kf_group_error_left > 0)) { cpi->twopass.gf_group_bits = - (int)((double)cpi->twopass.kf_group_bits * - (gf_group_err / (double)cpi->twopass.kf_group_error_left)); + (int64_t)(cpi->twopass.kf_group_bits * + (gf_group_err / cpi->twopass.kf_group_error_left)); } else cpi->twopass.gf_group_bits = 0; - cpi->twopass.gf_group_bits = (int)( + cpi->twopass.gf_group_bits = (cpi->twopass.gf_group_bits < 0) ? 0 : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) - ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits); + ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits; /* Clip cpi->twopass.gf_group_bits based on user supplied data rate * variability limit (cpi->oxcf.two_pass_vbrmax_section) */ - if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval) - cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval; + if (cpi->twopass.gf_group_bits > + (int64_t)max_bits * cpi->baseline_gf_interval) + cpi->twopass.gf_group_bits = + (int64_t)max_bits * cpi->baseline_gf_interval; /* Reset the file position */ reset_fpf_position(cpi, start_pos); @@ -2446,7 +2450,7 @@ void vp8_second_pass(VP8_COMP *cpi) */ if (cpi->oxcf.error_resilient_mode) { - cpi->twopass.gf_group_bits = (int)cpi->twopass.kf_group_bits; + cpi->twopass.gf_group_bits = cpi->twopass.kf_group_bits; cpi->twopass.gf_group_error_left = (int)cpi->twopass.kf_group_error_left; cpi->baseline_gf_interval = cpi->twopass.frames_to_key; diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index b08c7a589..a34af6428 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -235,13 +235,12 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MACROBLOCKD *xd = &x->e_mbd; unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col; unsigned char *y; - int buf_r1, buf_r2, buf_c1, buf_c2; + int buf_r1, buf_r2, buf_c1; /* Clamping to avoid out-of-range data access */ buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3; buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3; buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3; - buf_c2 = ((bestmv->as_mv.col + 3) > x->mv_col_max)?(x->mv_col_max - bestmv->as_mv.col):3; y_stride = 32; /* Copy to intermediate buffer before searching. */ diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 7eb7193bf..92f981857 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -5362,6 +5362,7 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppfla #endif #if CONFIG_POSTPROC + cpi->common.show_frame_mi = cpi->common.mi; ret = vp8_post_proc_frame(&cpi->common, dest, flags); #else diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index fb8ad357c..378731d0a 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -587,7 +587,7 @@ typedef struct VP8_COMP /* Error score of frames still to be coded in kf group */ int64_t kf_group_error_left; /* Projected Bits available for a group including 1 GF or ARF */ - int gf_group_bits; + int64_t gf_group_bits; /* Bits for the golden frame or ARF */ int gf_bits; int alt_extra_bits; diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 673de2b33..4c2527d68 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -389,7 +389,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb) } -static void update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv) +static void update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv) { MACROBLOCKD *xd = &x->e_mbd; /* Split MV modes currently not supported when RD is nopt enabled, @@ -1241,7 +1241,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame]) best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int; - update_mvcount(cpi, x, &best_ref_mv); + update_mvcount(x, &best_ref_mv); } diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index a399a3877..65fd0c5be 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -1360,10 +1360,10 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi) * whichever is smaller. */ int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1; - av_key_frame_frequency = (int)cpi->output_frame_rate * 2; + av_key_frame_frequency = 1 + (int)cpi->output_frame_rate * 2; if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq) - av_key_frame_frequency = cpi->oxcf.key_freq; + av_key_frame_frequency = key_freq; cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1] = av_key_frame_frequency; @@ -1393,6 +1393,10 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi) av_key_frame_frequency /= total_weight; } + // TODO (marpan): Given the checks above, |av_key_frame_frequency| + // should always be above 0. But for now we keep the sanity check in. + if (av_key_frame_frequency == 0) + av_key_frame_frequency = 1; return av_key_frame_frequency; } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index f0ec7b6e2..3d60bebda 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1733,7 +1733,7 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse } } -static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv) +static void rd_update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv) { if (x->e_mbd.mode_info_context->mbmi.mode == SPLITMV) { @@ -2608,7 +2608,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame]) best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int; - rd_update_mvcount(cpi, x, &best_ref_mv); + rd_update_mvcount(x, &best_ref_mv); } void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm index d880ce0c4..d06bca592 100644 --- a/vp8/encoder/x86/dct_sse2.asm +++ b/vp8/encoder/x86/dct_sse2.asm @@ -29,7 +29,7 @@ movsxd rax, dword ptr arg(2) lea rcx, [rsi + rax*2] %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 %define input rcx %define output rdx %define pitch r8 @@ -53,7 +53,7 @@ RESTORE_GOT pop rbp %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 RESTORE_XMM %endif %endif diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index fe9464b3d..b41768ce0 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -27,7 +27,7 @@ sym(vp8_regular_quantize_b_sse2): push rdi push rsi %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 push rdi push rsi %endif @@ -46,7 +46,7 @@ sym(vp8_regular_quantize_b_sse2): mov rdi, arg(0) ; BLOCK *b mov rsi, arg(1) ; BLOCKD *d %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 mov rdi, rcx ; BLOCK *b mov rsi, rdx ; BLOCKD *d %else @@ -226,7 +226,7 @@ ZIGZAG_LOOP 15 pop rsi pop rdi %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 pop rsi pop rdi %endif @@ -236,147 +236,6 @@ ZIGZAG_LOOP 15 pop rbp ret -; void vp8_fast_quantize_b_sse2 | arg -; (BLOCK *b, | 0 -; BLOCKD *d) | 1 - -global sym(vp8_fast_quantize_b_sse2) PRIVATE -sym(vp8_fast_quantize_b_sse2): - push rbp - mov rbp, rsp - GET_GOT rbx - -%if ABI_IS_32BIT - push rdi - push rsi -%else - %ifidn __OUTPUT_FORMAT__,x64 - push rdi - push rsi - %else - ; these registers are used for passing arguments - %endif -%endif - - ; end prolog - -%if ABI_IS_32BIT - mov rdi, arg(0) ; BLOCK *b - mov rsi, arg(1) ; BLOCKD *d -%else - %ifidn __OUTPUT_FORMAT__,x64 - mov rdi, rcx ; BLOCK *b - mov rsi, rdx ; BLOCKD *d - %else - ;mov rdi, rdi ; BLOCK *b - ;mov rsi, rsi ; BLOCKD *d - %endif -%endif - - mov rax, [rdi + vp8_block_coeff] - mov rcx, [rdi + vp8_block_round] - mov rdx, [rdi + vp8_block_quant_fast] - - ; z = coeff - movdqa xmm0, [rax] - movdqa xmm4, [rax + 16] - - ; dup z so we can save sz - movdqa xmm1, xmm0 - movdqa xmm5, xmm4 - - ; sz = z >> 15 - psraw xmm0, 15 - psraw xmm4, 15 - - ; x = abs(z) = (z ^ sz) - sz - pxor xmm1, xmm0 - pxor xmm5, xmm4 - psubw xmm1, xmm0 - psubw xmm5, xmm4 - - ; x += round - paddw xmm1, [rcx] - paddw xmm5, [rcx + 16] - - mov rax, [rsi + vp8_blockd_qcoeff] - mov rcx, [rsi + vp8_blockd_dequant] - mov rdi, [rsi + vp8_blockd_dqcoeff] - - ; y = x * quant >> 16 - pmulhw xmm1, [rdx] - pmulhw xmm5, [rdx + 16] - - ; x = (y ^ sz) - sz - pxor xmm1, xmm0 - pxor xmm5, xmm4 - psubw xmm1, xmm0 - psubw xmm5, xmm4 - - ; qcoeff = x - movdqa [rax], xmm1 - movdqa [rax + 16], xmm5 - - ; x * dequant - movdqa xmm2, xmm1 - movdqa xmm3, xmm5 - pmullw xmm2, [rcx] - pmullw xmm3, [rcx + 16] - - ; dqcoeff = x * dequant - movdqa [rdi], xmm2 - movdqa [rdi + 16], xmm3 - - pxor xmm4, xmm4 ;clear all bits - pcmpeqw xmm1, xmm4 - pcmpeqw xmm5, xmm4 - - pcmpeqw xmm4, xmm4 ;set all bits - pxor xmm1, xmm4 - pxor xmm5, xmm4 - - pand xmm1, [GLOBAL(inv_zig_zag)] - pand xmm5, [GLOBAL(inv_zig_zag + 16)] - - pmaxsw xmm1, xmm5 - - mov rcx, [rsi + vp8_blockd_eob] - - ; now down to 8 - pshufd xmm5, xmm1, 00001110b - - pmaxsw xmm1, xmm5 - - ; only 4 left - pshuflw xmm5, xmm1, 00001110b - - pmaxsw xmm1, xmm5 - - ; okay, just 2! - pshuflw xmm5, xmm1, 00000001b - - pmaxsw xmm1, xmm5 - - movd eax, xmm1 - and eax, 0xff - - mov BYTE PTR [rcx], al ; store eob - - ; begin epilog -%if ABI_IS_32BIT - pop rsi - pop rdi -%else - %ifidn __OUTPUT_FORMAT__,x64 - pop rsi - pop rdi - %endif -%endif - - RESTORE_GOT - pop rbp - ret - SECTION_RODATA align 16 inv_zig_zag: diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c new file mode 100644 index 000000000..55d57ad62 --- /dev/null +++ b/vp8/encoder/x86/quantize_sse2.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp8/common/blockd.h" +#include "vp8/common/entropy.h" +#include "vp8/encoder/block.h" + +#include <mmintrin.h> //MMX +#include <xmmintrin.h> //SSE +#include <emmintrin.h> //SSE2 + +void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) +{ + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); + __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); + __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); + + __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; + + /* sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z): (z ^ sz) - sz */ + x0 = _mm_xor_si128(z0, sz0); + x1 = _mm_xor_si128(z1, sz1); + x0 = _mm_sub_epi16(x0, sz0); + x1 = _mm_sub_epi16(x1, sz1); + + /* x += round */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + /* y = (x * quant) >> 16 */ + y0 = _mm_mulhi_epi16(x0, quant_fast0); + y1 = _mm_mulhi_epi16(x1, quant_fast1); + + /* x = abs(y) = (y ^ sz) - sz */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + xdq0 = _mm_mullo_epi16(x0, dequant0); + xdq1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); + + /* build a mask for the zig zag */ + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpeq_epi16(x0, zeros); + x1 = _mm_cmpeq_epi16(x1, zeros); + + ones = _mm_cmpeq_epi16(zeros, zeros); + + x0 = _mm_xor_si128(x0, ones); + x1 = _mm_xor_si128(x1, ones); + + x0 = _mm_and_si128(x0, inv_zig_zag0); + x1 = _mm_and_si128(x1, inv_zig_zag1); + + x0 = _mm_max_epi16(x0, x1); + + /* now down to 8 */ + x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* only 4 left */ + x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 + + x0 = _mm_max_epi16(x0, x1); + + /* okay, just 2! */ + x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 + + x0 = _mm_max_epi16(x0, x1); + + *d->eob = 0xFF & _mm_cvtsi128_si32(x0); +} diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm index f21146457..dbd171bfc 100644 --- a/vp8/encoder/x86/quantize_sse4.asm +++ b/vp8/encoder/x86/quantize_sse4.asm @@ -31,7 +31,7 @@ sym(vp8_regular_quantize_b_sse4): %define stack_size 32 sub rsp, stack_size %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 SAVE_XMM 8, u push rdi push rsi @@ -43,7 +43,7 @@ sym(vp8_regular_quantize_b_sse4): mov rdi, arg(0) ; BLOCK *b mov rsi, arg(1) ; BLOCKD *d %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 mov rdi, rcx ; BLOCK *b mov rsi, rdx ; BLOCKD *d %else @@ -240,7 +240,7 @@ ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8 pop rbp %else %undef xmm5 - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 pop rsi pop rdi RESTORE_XMM diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm index 35368894d..7b1dc119f 100644 --- a/vp8/encoder/x86/quantize_ssse3.asm +++ b/vp8/encoder/x86/quantize_ssse3.asm @@ -27,7 +27,7 @@ sym(vp8_fast_quantize_b_ssse3): push rdi push rsi %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 push rdi push rsi %endif @@ -38,7 +38,7 @@ sym(vp8_fast_quantize_b_ssse3): mov rdi, arg(0) ; BLOCK *b mov rsi, arg(1) ; BLOCKD *d %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 mov rdi, rcx ; BLOCK *b mov rsi, rdx ; BLOCKD *d %else @@ -122,7 +122,7 @@ sym(vp8_fast_quantize_b_ssse3): pop rsi pop rdi %else - %ifidn __OUTPUT_FORMAT__,x64 + %if LIBVPX_YASM_WIN64 pop rsi pop rdi %endif diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm index ce9d9836b..bd92b398a 100644 --- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -50,7 +50,7 @@ sym(vp8_temporal_filter_apply_sse2): ; 0x8000 >> (16 - strength) mov rdx, 16 sub rdx, arg(4) ; 16 - strength - movd xmm4, rdx ; can't use rdx w/ shift + movq xmm4, rdx ; can't use rdx w/ shift movdqa xmm5, [GLOBAL(_const_top_bit)] psrlw xmm5, xmm4 movdqa [rsp + rounding_bit], xmm5 |