13 files changed, 140 insertions, 170 deletions
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 68095ca68..433726df6 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -858,7 +858,9 @@ skip_motion_search:
      */
     if ((cm->current_video_frame > 0) &&
         (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) &&
-        ((cpi->twopass.this_frame_stats.intra_error / cpi->twopass.this_frame_stats.coded_error) > 2.0))
+        ((cpi->twopass.this_frame_stats.intra_error /
+          DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) >
+         2.0))
     {
         vp8_yv12_copy_frame(lst_yv12, gld_yv12);
     }
@@ -2116,23 +2118,25 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
         (cpi->twopass.kf_group_error_left > 0))
     {
         cpi->twopass.gf_group_bits =
-            (int)((double)cpi->twopass.kf_group_bits *
-                  (gf_group_err / (double)cpi->twopass.kf_group_error_left));
+            (int64_t)(cpi->twopass.kf_group_bits *
+                      (gf_group_err / cpi->twopass.kf_group_error_left));
     }
     else
         cpi->twopass.gf_group_bits = 0;
 
-    cpi->twopass.gf_group_bits = (int)(
+    cpi->twopass.gf_group_bits =
         (cpi->twopass.gf_group_bits < 0)
             ? 0
             : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
-                ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits);
+                ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
 
     /* Clip cpi->twopass.gf_group_bits based on user supplied data rate
      * variability limit (cpi->oxcf.two_pass_vbrmax_section)
      */
-    if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
-        cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
+    if (cpi->twopass.gf_group_bits >
+        (int64_t)max_bits * cpi->baseline_gf_interval)
+        cpi->twopass.gf_group_bits =
+            (int64_t)max_bits * cpi->baseline_gf_interval;
 
     /* Reset the file position */
     reset_fpf_position(cpi, start_pos);
@@ -2446,7 +2450,7 @@ void vp8_second_pass(VP8_COMP *cpi)
          */
         if (cpi->oxcf.error_resilient_mode)
         {
-            cpi->twopass.gf_group_bits = (int)cpi->twopass.kf_group_bits;
+            cpi->twopass.gf_group_bits = cpi->twopass.kf_group_bits;
             cpi->twopass.gf_group_error_left =
                                   (int)cpi->twopass.kf_group_error_left;
             cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index b08c7a589..a34af6428 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -235,13 +235,12 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
     MACROBLOCKD *xd = &x->e_mbd;
     unsigned char *y0 = base_pre + d->offset + (bestmv->as_mv.row) * pre_stride + bestmv->as_mv.col;
     unsigned char *y;
-    int buf_r1, buf_r2, buf_c1, buf_c2;
+    int buf_r1, buf_r2, buf_c1;
 
     /* Clamping to avoid out-of-range data access */
     buf_r1 = ((bestmv->as_mv.row - 3) < x->mv_row_min)?(bestmv->as_mv.row - x->mv_row_min):3;
     buf_r2 = ((bestmv->as_mv.row + 3) > x->mv_row_max)?(x->mv_row_max - bestmv->as_mv.row):3;
     buf_c1 = ((bestmv->as_mv.col - 3) < x->mv_col_min)?(bestmv->as_mv.col - x->mv_col_min):3;
-    buf_c2 = ((bestmv->as_mv.col + 3) > x->mv_col_max)?(x->mv_col_max - bestmv->as_mv.col):3;
     y_stride = 32;
 
     /* Copy to intermediate buffer before searching. */
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 7eb7193bf..92f981857 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -5362,6 +5362,7 @@ int vp8_get_preview_raw_frame(VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppfla
 #endif
 
 #if CONFIG_POSTPROC
+        cpi->common.show_frame_mi = cpi->common.mi;
         ret = vp8_post_proc_frame(&cpi->common, dest, flags);
 #else
 
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index fb8ad357c..378731d0a 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -587,7 +587,7 @@ typedef struct VP8_COMP
         /* Error score of frames still to be coded in kf group */
         int64_t kf_group_error_left;
         /* Projected Bits available for a group including 1 GF or ARF */
-        int gf_group_bits;
+        int64_t gf_group_bits;
         /* Bits for the golden frame or ARF */
         int gf_bits;
         int alt_extra_bits;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 673de2b33..4c2527d68 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -389,7 +389,7 @@ static void pick_intra_mbuv_mode(MACROBLOCK *mb)
 
 }
 
-static void update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
+static void update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv)
 {
     MACROBLOCKD *xd = &x->e_mbd;
     /* Split MV modes currently not supported when RD is nopt enabled,
@@ -1241,7 +1241,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
       != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
         best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
 
-    update_mvcount(cpi, x, &best_ref_mv);
+    update_mvcount(x, &best_ref_mv);
 }
 
 
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index a399a3877..65fd0c5be 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1360,10 +1360,10 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi)
          * whichever is smaller.
          */
         int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1;
-        av_key_frame_frequency = (int)cpi->output_frame_rate * 2;
+        av_key_frame_frequency = 1 + (int)cpi->output_frame_rate * 2;
 
         if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
-            av_key_frame_frequency = cpi->oxcf.key_freq;
+            av_key_frame_frequency = key_freq;
 
         cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
             = av_key_frame_frequency;
@@ -1393,6 +1393,10 @@ static int estimate_keyframe_frequency(VP8_COMP *cpi)
         av_key_frame_frequency  /= total_weight;
 
     }
+    // TODO (marpan): Given the checks above, |av_key_frame_frequency|
+    // should always be above 0. But for now we keep the sanity check in.
+    if (av_key_frame_frequency == 0)
+        av_key_frame_frequency = 1;
     return av_key_frame_frequency;
 }
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index f0ec7b6e2..3d60bebda 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1733,7 +1733,7 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
     }
 }
 
-static void rd_update_mvcount(VP8_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv)
+static void rd_update_mvcount(MACROBLOCK *x, int_mv *best_ref_mv)
 {
     if (x->e_mbd.mode_info_context->mbmi.mode == SPLITMV)
     {
@@ -2608,7 +2608,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         != cpi->common.ref_frame_sign_bias[xd->mode_info_context->mbmi.ref_frame])
         best_ref_mv.as_int = best_ref_mv_sb[!sign_bias].as_int;
 
-    rd_update_mvcount(cpi, x, &best_ref_mv);
+    rd_update_mvcount(x, &best_ref_mv);
 }
 
 void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_)
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
index d880ce0c4..d06bca592 100644
--- a/vp8/encoder/x86/dct_sse2.asm
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -29,7 +29,7 @@
     movsxd      rax, dword ptr arg(2)
     lea         rcx, [rsi + rax*2]
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     %define     input       rcx
     %define     output      rdx
     %define     pitch       r8
@@ -53,7 +53,7 @@
     RESTORE_GOT
     pop         rbp
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     RESTORE_XMM
   %endif
 %endif
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index fe9464b3d..b41768ce0 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -27,7 +27,7 @@ sym(vp8_regular_quantize_b_sse2):
     push        rdi
     push        rsi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     push        rdi
     push        rsi
   %endif
@@ -46,7 +46,7 @@ sym(vp8_regular_quantize_b_sse2):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -226,7 +226,7 @@ ZIGZAG_LOOP 15
     pop         rsi
     pop         rdi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
   %endif
@@ -236,147 +236,6 @@ ZIGZAG_LOOP 15
     pop         rbp
     ret
 
-; void vp8_fast_quantize_b_sse2 | arg
-;  (BLOCK  *b,                  |  0
-;   BLOCKD *d)                  |  1
-
-global sym(vp8_fast_quantize_b_sse2) PRIVATE
-sym(vp8_fast_quantize_b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-
-%if ABI_IS_32BIT
-    push        rdi
-    push        rsi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    push        rdi
-    push        rsi
-  %else
-    ; these registers are used for passing arguments
-  %endif
-%endif
-
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov         rdi, rdi                    ; BLOCK *b
-    ;mov         rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp8_block_coeff]
-    mov         rcx, [rdi + vp8_block_round]
-    mov         rdx, [rdi + vp8_block_quant_fast]
-
-    ; z = coeff
-    movdqa      xmm0, [rax]
-    movdqa      xmm4, [rax + 16]
-
-    ; dup z so we can save sz
-    movdqa      xmm1, xmm0
-    movdqa      xmm5, xmm4
-
-    ; sz = z >> 15
-    psraw       xmm0, 15
-    psraw       xmm4, 15
-
-    ; x = abs(z) = (z ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; x += round
-    paddw       xmm1, [rcx]
-    paddw       xmm5, [rcx + 16]
-
-    mov         rax, [rsi + vp8_blockd_qcoeff]
-    mov         rcx, [rsi + vp8_blockd_dequant]
-    mov         rdi, [rsi + vp8_blockd_dqcoeff]
-
-    ; y = x * quant >> 16
-    pmulhw      xmm1, [rdx]
-    pmulhw      xmm5, [rdx + 16]
-
-    ; x = (y ^ sz) - sz
-    pxor        xmm1, xmm0
-    pxor        xmm5, xmm4
-    psubw       xmm1, xmm0
-    psubw       xmm5, xmm4
-
-    ; qcoeff = x
-    movdqa      [rax], xmm1
-    movdqa      [rax + 16], xmm5
-
-    ; x * dequant
-    movdqa      xmm2, xmm1
-    movdqa      xmm3, xmm5
-    pmullw      xmm2, [rcx]
-    pmullw      xmm3, [rcx + 16]
-
-    ; dqcoeff = x * dequant
-    movdqa      [rdi], xmm2
-    movdqa      [rdi + 16], xmm3
-
-    pxor        xmm4, xmm4                  ;clear all bits
-    pcmpeqw     xmm1, xmm4
-    pcmpeqw     xmm5, xmm4
-
-    pcmpeqw     xmm4, xmm4                  ;set all bits
-    pxor        xmm1, xmm4
-    pxor        xmm5, xmm4
-
-    pand        xmm1, [GLOBAL(inv_zig_zag)]
-    pand        xmm5, [GLOBAL(inv_zig_zag + 16)]
-
-    pmaxsw      xmm1, xmm5
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; now down to 8
-    pshufd      xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; only 4 left
-    pshuflw     xmm5, xmm1, 00001110b
-
-    pmaxsw      xmm1, xmm5
-
-    ; okay, just 2!
-    pshuflw     xmm5, xmm1, 00000001b
-
-    pmaxsw      xmm1, xmm5
-
-    movd        eax, xmm1
-    and         eax, 0xff
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    pop         rsi
-    pop         rdi
-%else
-  %ifidn __OUTPUT_FORMAT__,x64
-    pop         rsi
-    pop         rdi
-  %endif
-%endif
-
-    RESTORE_GOT
-    pop         rbp
-    ret
-
 SECTION_RODATA
 align 16
 inv_zig_zag:
diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c
new file mode 100644
index 000000000..55d57ad62
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -0,0 +1,103 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/common/blockd.h"
+#include "vp8/common/entropy.h"
+#include "vp8/encoder/block.h"
+
+#include <mmintrin.h> //MMX
+#include <xmmintrin.h> //SSE
+#include <emmintrin.h> //SSE2
+
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+  __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
+  __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
+
+  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
+
+  /* sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z): (z ^ sz) - sz */
+  x0 = _mm_xor_si128(z0, sz0);
+  x1 = _mm_xor_si128(z1, sz1);
+  x0 = _mm_sub_epi16(x0, sz0);
+  x1 = _mm_sub_epi16(x1, sz1);
+
+  /* x += round */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  /* y = (x * quant) >> 16 */
+  y0 = _mm_mulhi_epi16(x0, quant_fast0);
+  y1 = _mm_mulhi_epi16(x1, quant_fast1);
+
+  /* x = abs(y) = (y ^ sz) - sz */
+  y0 = _mm_xor_si128(y0, sz0);
+  y1 = _mm_xor_si128(y1, sz1);
+  x0 = _mm_sub_epi16(y0, sz0);
+  x1 = _mm_sub_epi16(y1, sz1);
+
+  /* qcoeff = x */
+  _mm_store_si128((__m128i *)(d->qcoeff), x0);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+  /* x * dequant */
+  xdq0 = _mm_mullo_epi16(x0, dequant0);
+  xdq1 = _mm_mullo_epi16(x1, dequant1);
+
+  /* dqcoeff = x * dequant */
+  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
+
+  /* build a mask for the zig zag */
+  zeros = _mm_setzero_si128();
+
+  x0 = _mm_cmpeq_epi16(x0, zeros);
+  x1 = _mm_cmpeq_epi16(x1, zeros);
+
+  ones = _mm_cmpeq_epi16(zeros, zeros);
+
+  x0 = _mm_xor_si128(x0, ones);
+  x1 = _mm_xor_si128(x1, ones);
+
+  x0 = _mm_and_si128(x0, inv_zig_zag0);
+  x1 = _mm_and_si128(x1, inv_zig_zag1);
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* now down to 8 */
+  x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* only 4 left */
+  x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* okay, just 2! */
+  x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
+}
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
index f21146457..dbd171bfc 100644
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -31,7 +31,7 @@ sym(vp8_regular_quantize_b_sse4):
     %define stack_size 32
     sub         rsp, stack_size
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     SAVE_XMM 8, u
     push        rdi
     push        rsi
@@ -43,7 +43,7 @@ sym(vp8_regular_quantize_b_sse4):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -240,7 +240,7 @@ ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
     pop         rbp
 %else
   %undef xmm5
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
     RESTORE_XMM
diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm
index 35368894d..7b1dc119f 100644
--- a/vp8/encoder/x86/quantize_ssse3.asm
+++ b/vp8/encoder/x86/quantize_ssse3.asm
@@ -27,7 +27,7 @@ sym(vp8_fast_quantize_b_ssse3):
     push        rdi
     push        rsi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     push        rdi
     push        rsi
   %endif
@@ -38,7 +38,7 @@ sym(vp8_fast_quantize_b_ssse3):
     mov         rdi, arg(0)                 ; BLOCK *b
     mov         rsi, arg(1)                 ; BLOCKD *d
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     mov         rdi, rcx                    ; BLOCK *b
     mov         rsi, rdx                    ; BLOCKD *d
   %else
@@ -122,7 +122,7 @@ sym(vp8_fast_quantize_b_ssse3):
     pop         rsi
     pop         rdi
 %else
-  %ifidn __OUTPUT_FORMAT__,x64
+  %if LIBVPX_YASM_WIN64
     pop         rsi
     pop         rdi
   %endif
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index ce9d9836b..bd92b398a 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -50,7 +50,7 @@ sym(vp8_temporal_filter_apply_sse2):
         ; 0x8000 >> (16 - strength)
         mov         rdx,            16
         sub         rdx,            arg(4) ; 16 - strength
-        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movq        xmm4,           rdx    ; can't use rdx w/ shift
         movdqa      xmm5,           [GLOBAL(_const_top_bit)]
         psrlw       xmm5,           xmm4
         movdqa      [rsp + rounding_bit], xmm5