summaryrefslogtreecommitdiff
path: root/vp8/encoder
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/encoder')
-rw-r--r-- vp8/encoder/encodeframe.c 14
-rw-r--r-- vp8/encoder/onyx_if.c 3
-rw-r--r-- vp8/encoder/onyx_int.h 1
-rw-r--r-- vp8/encoder/rdopt.c 79
-rw-r--r-- vp8/encoder/temporal_filter.c 52
-rw-r--r-- vp8/encoder/temporal_filter.h 6
-rw-r--r-- vp8/encoder/x86/temporal_filter_apply_sse2.asm 207
-rw-r--r-- vp8/encoder/x86/temporal_filter_x86.h 27
-rw-r--r-- vp8/encoder/x86/x86_csystemdependent.c 2
9 files changed, 259 insertions, 132 deletions
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index cb7cc65d7..e27e2e64e 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -734,20 +734,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
x->activity_sum = 0;
-#if 0
- // Experimental rd code
- // 2 Pass - Possibly set Rdmult based on last frame distortion + this frame target bits or other metrics
- // such as cpi->rate_correction_factor that indicate relative complexity.
- /*if ( cpi->pass == 2 && (cpi->last_frame_distortion > 0) && (cpi->target_bits_per_mb > 0) )
- {
- //x->rdmult = ((cpi->last_frame_distortion * 256)/cpi->common.MBs)/ cpi->target_bits_per_mb;
- x->rdmult = (int)(cpi->RDMULT * cpi->rate_correction_factor);
- }
- else
- x->rdmult = cpi->RDMULT; */
- //x->rdmult = (int)(cpi->RDMULT * pow( (cpi->rate_correction_factor * 2.0), 0.75 ));
-#endif
-
xd->mode_info_context->mbmi.mode = DC_PRED;
xd->mode_info_context->mbmi.uv_mode = DC_PRED;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 22b25e3af..ee461c610 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3093,9 +3093,6 @@ static int pick_frame_size(VP8_COMP *cpi)
}
}
- // Note target_size in bits * 256 per MB
- cpi->target_bits_per_mb = (cpi->this_frame_target * 256) / cpi->common.MBs;
-
return 1;
}
static void set_quantizer(VP8_COMP *cpi, int Q)
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index c5dc0c194..05e8c4e6a 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -365,7 +365,6 @@ typedef struct
int this_frame_target;
int projected_frame_size;
int last_q[2]; // Separate values for Intra/Inter
- int target_bits_per_mb;
double rate_correction_factor;
double key_frame_rate_correction_factor;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index e6c7c9ab3..d694d39fb 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -36,7 +36,6 @@
#include "dct.h"
#include "systemdependent.h"
-#define DIAMONDSEARCH 1
#if CONFIG_RUNTIME_CPU_DETECT
#define IF_RTCD(x) (x)
#else
@@ -46,19 +45,6 @@
void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
-
-#define RDFUNC(RM,DM,R,D,target_rd) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
-/*int RDFUNC( int RM,int DM, int R, int D, int target_r )
-{
- int rd_value;
-
- rd_value = ( ((128+(R)*(RM)) >> 8) + (DM)*(D) );
-
- return rd_value;
-}*/
-
-#define UVRDFUNC(RM,DM,R,D,target_r) RDFUNC(RM,DM,R,D,target_r)
-
#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
#define MAXF(a,b) (((a) > (b)) ? (a) : (b))
@@ -223,8 +209,6 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
{
int q;
int i;
- int *thresh;
- int threshmult;
double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0;
double rdconst = 3.00;
@@ -271,22 +255,6 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
if (q < 8)
q = 8;
- if (cpi->ref_frame_flags == VP8_ALT_FLAG)
- {
- thresh = &cpi->rd_threshes[THR_NEWA];
- threshmult = cpi->sf.thresh_mult[THR_NEWA];
- }
- else if (cpi->ref_frame_flags == VP8_GOLD_FLAG)
- {
- thresh = &cpi->rd_threshes[THR_NEWG];
- threshmult = cpi->sf.thresh_mult[THR_NEWG];
- }
- else
- {
- thresh = &cpi->rd_threshes[THR_NEWMV];
- threshmult = cpi->sf.thresh_mult[THR_NEWMV];
- }
-
if (cpi->RDMULT > 1000)
{
cpi->RDDIV = 1;
@@ -775,7 +743,7 @@ static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distort
*rate = rd_cost_mbuv(x);
*distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
- return UVRDFUNC(x->rdmult, x->rddiv, *rate, *distortion, cpi->target_bits_per_mb);
+ return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
}
int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion)
@@ -800,7 +768,7 @@ int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *ra
distortion = vp8_get_mbuvrecon_error(IF_RTCD(&cpi->rtcd.variance), x);
- this_rd = UVRDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
if (this_rd < best_rd)
{
@@ -1097,7 +1065,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
// Segmentation method overheads
rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
rate += vp8_cost_mv_ref(SPLITMV, bsi->mdcounts);
- this_segment_rd += RDFUNC(x->rdmult, x->rddiv, rate, 0, cpi->target_bits_per_mb);
+ this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
br += rate;
for (i = 0; i < label_count; i++)
@@ -1252,7 +1220,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
labelyrate = rdcost_mbsegment_y(x, labels, i, ta_s, tl_s);
rate += labelyrate;
- this_rd = RDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
if (this_rd < best_label_rd)
{
@@ -1751,7 +1719,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
//int intermodecost[MAX_MODES];
MB_PREDICTION_MODE uv_intra_mode;
- int uvintra_eob = 0;
+
int force_no_skip = 0;
MV mvp;
@@ -1770,27 +1738,6 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
ref_frame_cost[INTRA_FRAME] = vp8_cost_zero(cpi->prob_intra_coded);
- // Experimental code
- // Adjust the RD multiplier based on the best case distortion we saw in the most recently coded mb
- //if ( (cpi->last_mb_distortion) > 0 && (cpi->target_bits_per_mb > 0) )
- /*{
- int tmprdmult;
-
- //tmprdmult = (cpi->last_mb_distortion * 256) / ((cpi->av_per_frame_bandwidth*256)/cpi->common.MBs);
- tmprdmult = (cpi->last_mb_distortion * 256) / cpi->target_bits_per_mb;
- //tmprdmult = tmprdmult;
-
- //if ( tmprdmult > cpi->RDMULT * 2 )
- // tmprdmult = cpi->RDMULT * 2;
- //else if ( tmprdmult < cpi->RDMULT / 2 )
- // tmprdmult = cpi->RDMULT / 2;
-
- //tmprdmult = (tmprdmult < 25) ? 25 : tmprdmult;
-
- //x->rdmult = tmprdmult;
-
- }*/
-
// Special case treatment when GF and ARF are not sensible options for reference
if (cpi->ref_frame_flags == VP8_LAST_FLAG)
{
@@ -1820,12 +1767,6 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
vp8_rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);
uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
- {
- uvintra_eob = 0;
-
- for (i = 16; i < 24; i++)
- uvintra_eob += x->e_mbd.block[i].eob;
- }
for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
{
@@ -2339,8 +2280,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
distortion_uv = sse2;
disable_skip = 1;
- this_rd = RDFUNC(x->rdmult, x->rddiv, rate2,
- distortion2, cpi->target_bits_per_mb);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2,
+ distortion2);
break;
}
@@ -2414,7 +2355,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
}
}
// Calculate the final RD estimate for this mode
- this_rd = RDFUNC(x->rdmult, x->rddiv, rate2, distortion2, cpi->target_bits_per_mb);
+ this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
}
// Experimental debug code.
@@ -2442,8 +2383,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
other_cost += ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
/* Calculate the final y RD estimate for this mode */
- best_yrd = RDFUNC(x->rdmult, x->rddiv, (rate2-rate_uv-other_cost),
- (distortion2-distortion_uv), cpi->target_bits_per_mb);
+ best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2-rate_uv-other_cost),
+ (distortion2-distortion_uv));
*returnrate = rate2;
*returndistortion = distortion2;
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 2fffaa95f..396e3390d 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -36,36 +36,9 @@
#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering
#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
-#define USE_FILTER_LUT 0 // use lookup table to improve filter
#if VP8_TEMPORAL_ALT_REF
-#if USE_FILTER_LUT
-// for (strength = 0; strength <= 6; strength++) {
-// for (delta = 0; delta <= 18; delta++) {
-// float coeff = (3.0 * delta * delta) / pow(2, strength);
-// printf("%3d", (int)roundf(coeff > 16 ? 0 : 16-coeff));
-// }
-// printf("\n");
-// }
-static int modifier_lut[7][19] =
-{
- // Strength=0
- {16, 13, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=1
- {16, 15, 10, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=2
- {16, 15, 13, 9, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=3
- {16, 16, 15, 13, 10, 7, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=4
- {16, 16, 15, 14, 13, 11, 9, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
- // Strength=5
- {16, 16, 16, 15, 15, 14, 13, 11, 10, 8, 7, 5, 3, 0, 0, 0, 0, 0, 0},
- // Strength=6
- {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10, 9, 8, 7, 5, 4, 2, 1}
-};
-#endif
static void vp8_temporal_filter_predictors_mb_c
(
MACROBLOCKD *x,
@@ -86,14 +59,11 @@ static void vp8_temporal_filter_predictors_mb_c
if ((mv_row | mv_col) & 7)
{
-// vp8_sixtap_predict16x16_c(yptr, stride,
-// mv_col & 7, mv_row & 7, &pred[0], 16);
x->subpixel_predict16x16(yptr, stride,
mv_col & 7, mv_row & 7, &pred[0], 16);
}
else
{
- //vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);
RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);
}
@@ -127,17 +97,13 @@ void vp8_temporal_filter_apply_c
int strength,
int filter_weight,
unsigned int *accumulator,
- unsigned int *count
+ unsigned short *count
)
{
int i, j, k;
int modifier;
int byte = 0;
-#if USE_FILTER_LUT
- int *lut = modifier_lut[strength];
-#endif
-
for (i = 0,k = 0; i < block_size; i++)
{
for (j = 0; j < block_size; j++, k++)
@@ -146,11 +112,10 @@ void vp8_temporal_filter_apply_c
int src_byte = frame1[byte];
int pixel_value = *frame2++;
-#if USE_FILTER_LUT
- modifier = abs(src_byte-pixel_value);
- modifier = modifier>18 ? 0 : lut[modifier];
-#else
modifier = src_byte - pixel_value;
+ // This is an integer approximation of:
+ // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+ // modifier = (int)roundf(coeff > 16 ? 0 : 16-coeff);
modifier *= modifier;
modifier *= 3;
modifier += 1 << (strength - 1);
@@ -160,7 +125,6 @@ void vp8_temporal_filter_apply_c
modifier = 16;
modifier = 16 - modifier;
-#endif
modifier *= filter_weight;
count[k] += modifier;
@@ -331,12 +295,12 @@ static void vp8_temporal_filter_iterate_c
int MBs = cpi->common.MBs;
int mb_y_offset = 0;
int mb_uv_offset = 0;
- unsigned int accumulator[384];
- unsigned int count[384];
+ DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
+ DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
MACROBLOCKD *mbd = &cpi->mb.e_mbd;
YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
unsigned char *dst1, *dst2;
- DECLARE_ALIGNED(16, unsigned char, predictor[384]);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, predictor, 16*16 + 8*8 + 8*8);
// Save input state
unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -366,7 +330,7 @@ static void vp8_temporal_filter_iterate_c
int stride;
vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
- vpx_memset(count, 0, 384*sizeof(unsigned int));
+ vpx_memset(count, 0, 384*sizeof(unsigned short));
#if ALT_REF_MC_ENABLED
// Reduced search extent by 3 for 6-tap filter & smaller UMV border
diff --git a/vp8/encoder/temporal_filter.h b/vp8/encoder/temporal_filter.h
index 7b8c21c04..740037a85 100644
--- a/vp8/encoder/temporal_filter.h
+++ b/vp8/encoder/temporal_filter.h
@@ -22,9 +22,13 @@
int strength, \
int filter_weight, \
unsigned int *accumulator, \
- unsigned int *count \
+ unsigned short *count \
)
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/temporal_filter_x86.h"
+#endif
+
#ifndef vp8_temporal_filter_apply
#define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
#endif
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 000000000..0127b012e
--- /dev/null
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,207 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp8_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_size,       |  3
+;   int             strength,         |  4
+;   int             filter_weight,    |  5
+;   unsigned int   *accumulator,      |  6
+;   unsigned short *count)            |  7
+;
+; For every pixel of a block_size x block_size block:
+;   modifier  = src - pred
+;   modifier  = (3 * modifier^2 + rounding) >> strength
+;   modifier  = (modifier > 16) ? 0 : 16 - modifier
+;   modifier *= filter_weight
+;   count[k]       += modifier
+;   accumulator[k] += modifier * pred
+;
+; Each loop iteration handles 16 pixels: one 16-pixel row when block_size
+; is 16, or two consecutive 8-pixel rows when block_size is 8.
+;
+; Memory requirements that follow from the movdqa accesses below:
+;   frame2 (predictor), accumulator and count are read/written contiguously
+;   and must be 16-byte aligned; in the 16-wide path every row of frame1
+;   must be 16-byte aligned as well.
+;
+; The per-pixel weight is kept in 16-bit lanes; 3 * modifier^2 is formed
+; with unsigned saturation so that large differences cannot wrap around.
+; For a saturated value to still yield a weight of 0, 0xFFFF >> strength
+; has to be >= 16, i.e. strength <= 11.
+; NOTE(review): the range of strength supplied by callers is not visible
+; here - confirm it stays within that bound.
+global sym(vp8_temporal_filter_apply_sse2)
+sym(vp8_temporal_filter_apply_sse2):
+
+    push        rbp
+    mov         rbp,            rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    ; offsets of the 16-byte aligned scratch slots on the stack
+    %define block_size    0
+    %define strength      16
+    %define filter_weight 32
+    %define rounding_bit  48
+    %define rbp_backup    64
+    %define stack_size    80
+    sub         rsp,            stack_size
+    ; rbp is reused below to hold the stride, so keep the frame pointer here
+    mov         [rsp + rbp_backup], rbp
+    ; end prolog
+
+        mov         rdx,            arg(3)
+        mov         [rsp + block_size], rdx
+        movd        xmm6,           arg(4)
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(4) ; 16 - strength
+        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(6) ; accumulator
+        mov         rax,            arg(7) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(5) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0
+        punpcklwd   xmm0,           xmm0
+        movdqa      [rsp + filter_weight], xmm0
+
+        ; last use of arg(): rbp stops being the frame pointer from here on
+        mov         rbp,            arg(1) ; stride
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        ; rcx = one past the end of the predictor block (loop terminator)
+        lea         rcx,            [rdx + 16*16*1]
+        cmp         dword ptr [rsp + block_size], 8
+        jne         temporal_filter_apply_load_16
+        lea         rcx,            [rdx + 8*8*1]
+
+temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         temporal_filter_apply_load_finished
+
+temporal_filter_apply_load_16:
+        movdqa      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        ; |modifier| <= 255, so the square (<= 65025) fits an unsigned word
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3
+        ; 3 * modifier^2 exceeds 16 bits once |modifier| >= 148, so build
+        ; it from saturating adds instead of a wrapping pmullw by 3.
+        movdqa      xmm2,           xmm0
+        movdqa      xmm3,           xmm1
+        paddusw     xmm0,           xmm0   ; 2 * modifier^2 (saturated)
+        paddusw     xmm1,           xmm1
+        paddusw     xmm0,           xmm2   ; 3 * modifier^2 (saturated)
+        paddusw     xmm1,           xmm3
+
+        ; modifier += 0x8000 >> (16 - strength)
+        ; saturating so that an already clamped value cannot wrap to small
+        paddusw     xmm0,           [rsp + rounding_bit]
+        paddusw     xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count (16-bit lanes)
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator (32-bit lanes)
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48]
+        ; += modifier * pixel_value
+        ; dword adds: the accumulator entries are unsigned int, a word add
+        ; would drop the carry into the upper half of every entry
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx
+        je          temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction (xmm7 held accumulator data)
+        cmp         dword ptr [rsp + block_size], 16
+        je          temporal_filter_apply_load_16
+        jmp         temporal_filter_apply_load_8
+
+temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+; 16-byte aligned constants, each replicated across eight 16-bit lanes
+align 16
+_const_3w:
+    times 8 dw 3
+align 16
+_const_top_bit:
+    times 8 dw 1<<15
+align 16
+_const_16w:
+    times 8 dw 16
diff --git a/vp8/encoder/x86/temporal_filter_x86.h b/vp8/encoder/x86/temporal_filter_x86.h
new file mode 100644
index 000000000..2daa14018
--- /dev/null
+++ b/vp8/encoder/x86/temporal_filter_x86.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_TEMPORAL_FILTER_X86_H
+#define __INC_VP8_TEMPORAL_FILTER_X86_H
+
+#if HAVE_SSE2 /* SSE2 build: declare the SSE2 temporal filter apply routine */
+extern prototype_apply(vp8_temporal_filter_apply_sse2); /* prototype_apply is not defined in this header; presumably provided by the including temporal_filter.h - confirm */
+
+#if !CONFIG_RUNTIME_CPU_DETECT /* no runtime dispatch: bind the generic name at compile time */
+
+#undef vp8_temporal_filter_apply /* drop any earlier binding before redefining it */
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2
+
+#endif /* !CONFIG_RUNTIME_CPU_DETECT */
+
+#endif /* HAVE_SSE2 */
+
+#endif // __INC_VP8_TEMPORAL_FILTER_X86_H
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 6e317e2a2..c7dffc443 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -309,6 +309,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
/*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;
+
+ cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
}
#endif