summaryrefslogtreecommitdiff
path: root/vp8/encoder
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/encoder')
-rw-r--r--vp8/encoder/firstpass.c188
-rw-r--r--vp8/encoder/onyx_int.h1
-rw-r--r--vp8/encoder/rdopt.c2
-rw-r--r--vp8/encoder/x86/variance_impl_sse2.asm33
-rw-r--r--vp8/encoder/x86/variance_impl_ssse3.asm348
-rw-r--r--vp8/encoder/x86/variance_ssse3.c140
-rw-r--r--vp8/encoder/x86/variance_x86.h4
-rw-r--r--vp8/encoder/x86/x86_csystemdependent.c2
8 files changed, 642 insertions, 76 deletions
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index d489413f6..804b80bd5 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -312,7 +312,9 @@ void vp8_output_stats(const VP8_COMP *cpi,
FILE *fpfile;
fpfile = fopen("firstpass.stt", "a");
- fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n",
+ fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f"
+ " %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f"
+ " %12.4f\n",
stats->frame,
stats->intra_error,
stats->coded_error,
@@ -320,6 +322,7 @@ void vp8_output_stats(const VP8_COMP *cpi,
stats->pcnt_inter,
stats->pcnt_motion,
stats->pcnt_second_ref,
+ stats->pcnt_neutral,
stats->MVr,
stats->mvr_abs,
stats->MVc,
@@ -327,7 +330,8 @@ void vp8_output_stats(const VP8_COMP *cpi,
stats->MVrv,
stats->MVcv,
stats->mv_in_out_count,
- stats->count);
+ stats->count,
+ stats->duration);
fclose(fpfile);
@@ -359,6 +363,7 @@ void vp8_zero_stats(FIRSTPASS_STATS *section)
section->pcnt_inter = 0.0;
section->pcnt_motion = 0.0;
section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
section->MVr = 0.0;
section->mvr_abs = 0.0;
section->MVc = 0.0;
@@ -378,6 +383,7 @@ void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
section->pcnt_inter += frame->pcnt_inter;
section->pcnt_motion += frame->pcnt_motion;
section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
section->MVr += frame->MVr;
section->mvr_abs += frame->mvr_abs;
section->MVc += frame->MVc;
@@ -398,6 +404,7 @@ void vp8_avg_stats(FIRSTPASS_STATS *section)
section->ssim_weighted_pred_err /= section->count;
section->pcnt_inter /= section->count;
section->pcnt_second_ref /= section->count;
+ section->pcnt_neutral /= section->count;
section->pcnt_motion /= section->count;
section->MVr /= section->count;
section->mvr_abs /= section->count;
@@ -570,6 +577,7 @@ void vp8_first_pass(VP8_COMP *cpi)
int intercount = 0;
int second_ref_count = 0;
int intrapenalty = 256;
+ int neutral_count = 0;
int sum_in_vectors = 0;
@@ -726,6 +734,17 @@ void vp8_first_pass(VP8_COMP *cpi)
if (motion_error <= this_error)
{
+ // Keep a count of cases where the inter and intra were
+ // very close and very low. This helps with scene cut
+ // detection for example in cropped clips with black bars
+ // at the sides or top and bottom.
+ if( (((this_error-intrapenalty) * 9) <=
+ (motion_error*10)) &&
+ (this_error < (2*intrapenalty)) )
+ {
+ neutral_count++;
+ }
+
d->bmi.mv.as_mv.row <<= 3;
d->bmi.mv.as_mv.col <<= 3;
this_error = motion_error;
@@ -854,6 +873,7 @@ void vp8_first_pass(VP8_COMP *cpi)
fps.pcnt_inter = 1.0 * (double)intercount / cm->MBs;
fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+ fps.pcnt_neutral = 1.0 * (double)neutral_count / cm->MBs;
if (mvcount > 0)
{
@@ -1341,7 +1361,7 @@ void vp8_end_second_pass(VP8_COMP *cpi)
// This function gives an estimate of how badly we believe
// the prediction quality is decaying from frame to frame.
-double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+double get_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
{
double prediction_decay_rate;
double motion_decay;
@@ -1376,6 +1396,52 @@ double gf_prediction_decay_rate(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
return prediction_decay_rate;
}
+// Tests whether a complex transition is followed by a static section (for
+// example a slide show that fades between slides), to help kf and gf
+// positioning. Returns TRUE only if the look-ahead below completes.
+BOOL detect_transition_to_still(
+ VP8_COMP *cpi,
+ int frame_interval, // must exceed MIN_GF_INTERVAL for the test to run
+ int still_interval, // number of look-ahead frames that must all stay static
+ double loop_decay_rate, // must be >= 0.999 for the test to run
+ double decay_accumulator ) // must be < 0.9 for the test to run
+{
+ BOOL trans_to_still = FALSE;
+
+ // Break clause to detect very still sections after motion,
+ // for example a static image after a fade or other transition
+ // instead of a clean scene cut. All three conditions must hold.
+ if ( (frame_interval > MIN_GF_INTERVAL) &&
+ (loop_decay_rate >= 0.999) &&
+ (decay_accumulator < 0.9) )
+ {
+ int j;
+ FIRSTPASS_STATS * position = cpi->stats_in; // saved so it can be restored after the look-ahead
+ FIRSTPASS_STATS tmp_next_frame;
+ double decay_rate;
+
+ // Look ahead up to still_interval frames; stop early at the end of
+ // the stats or at the first frame whose decay rate is below 0.999.
+ for ( j = 0; j < still_interval; j++ )
+ {
+ if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
+ break;
+
+ decay_rate = get_prediction_decay_rate(cpi, &tmp_next_frame);
+ if ( decay_rate < 0.999 )
+ break;
+ }
+ // Reset the stats position to the value saved before the look-ahead
+ reset_fpf_position(cpi, position);
+
+ // Signal a transition to still only if the loop ran to completion
+ if ( j == still_interval ) // NOTE: also true when still_interval is 0 (loop body never runs)
+ trans_to_still = TRUE;
+ }
+
+ return trans_to_still;
+}
+
// Analyse and define a gf/arf group .
static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
@@ -1528,7 +1594,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (r > GF_RMAX)
r = GF_RMAX;
- loop_decay_rate = gf_prediction_decay_rate(cpi, &next_frame);
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
// Cumulative effect of decay
decay_accumulator = decay_accumulator * loop_decay_rate;
@@ -1537,48 +1603,13 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
boost_score += (decay_accumulator * r);
// Break clause to detect very still sections after motion
- // For example a staic image after a fade or other transition
- // instead of a clean key frame.
- if ( (i > MIN_GF_INTERVAL) &&
- (loop_decay_rate >= 0.999) &&
- (decay_accumulator < 0.9) )
+ // For example a static image after a fade or other transition.
+ if ( detect_transition_to_still( cpi, i, 5,
+ loop_decay_rate, decay_accumulator ) )
{
- int j;
- FIRSTPASS_STATS * position = cpi->stats_in;
- FIRSTPASS_STATS tmp_next_frame;
- double decay_rate;
-
- // Look ahead a few frames to see if static condition
- // persists...
- for ( j = 0; j < 4; j++ )
- {
- if (EOF == vp8_input_stats(cpi, &tmp_next_frame))
- break;
-
- decay_rate = gf_prediction_decay_rate(cpi, &tmp_next_frame);
- if ( decay_rate < 0.999 )
- break;
- }
- reset_fpf_position(cpi, position); // Reset file position
-
- // Force GF not alt ref
- if ( j == 4 )
- {
- if (0)
- {
- FILE *f = fopen("fadegf.stt", "a");
- fprintf(f, " %8d %8d %10.4f %10.4f %10.4f\n",
- cpi->common.current_video_frame+i, i,
- loop_decay_rate, decay_accumulator,
- boost_score );
- fclose(f);
- }
-
- allow_alt_ref = FALSE;
-
- boost_score = old_boost_score;
- break;
- }
+ allow_alt_ref = FALSE;
+ boost_score = old_boost_score;
+ break;
}
// Break out conditions.
@@ -2285,7 +2316,7 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
(next_frame->pcnt_second_ref < 0.10) &&
((this_frame->pcnt_inter < 0.05) ||
(
- (this_frame->pcnt_inter < .25) &&
+ ((this_frame->pcnt_inter - this_frame->pcnt_neutral) < .25) &&
((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
(fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
@@ -2332,7 +2363,9 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
// Test various breakout clauses
if ((local_next_frame.pcnt_inter < 0.05) ||
(next_iiratio < 1.5) ||
- ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) ||
+ (((local_next_frame.pcnt_inter -
+ local_next_frame.pcnt_neutral) < 0.20) &&
+ (next_iiratio < 3.0)) ||
((boost_score - old_boost_score) < 0.5) ||
(local_next_frame.intra_error < 200)
)
@@ -2363,13 +2396,13 @@ static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRST
}
void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
{
- int i;
+ int i,j;
FIRSTPASS_STATS last_frame;
FIRSTPASS_STATS first_frame;
FIRSTPASS_STATS next_frame;
FIRSTPASS_STATS *start_position;
- double decay_accumulator = 0;
+ double decay_accumulator = 1.0;
double boost_score = 0;
double old_boost_score = 0.0;
double loop_decay_rate;
@@ -2379,6 +2412,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
double kf_group_intra_err = 0.0;
double kf_group_coded_err = 0.0;
double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+ double recent_loop_decay[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean
@@ -2407,6 +2441,7 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
kf_mod_err = calculate_modified_err(cpi, this_frame);
// find the next keyframe
+ i = 0;
while (cpi->stats_in < cpi->stats_in_end)
{
// Accumulate kf group error
@@ -2425,9 +2460,34 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (cpi->oxcf.auto_key
&& lookup_next_frame_stats(cpi, &next_frame) != EOF)
{
+ // Normal scene cut check
if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
break;
+ // How fast is prediction quality decaying
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ // We want to know something about the recent past... rather than
+ // as used elsewhere where we are concerned with decay in prediction
+ // quality since the last GF or KF.
+ recent_loop_decay[i%8] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < 8; j++)
+ {
+ decay_accumulator = decay_accumulator * recent_loop_decay[j];
+ }
+
+ // Special check for a transition or high motion followed by
+ // a static scene.
+ if ( detect_transition_to_still( cpi, i,
+ (cpi->key_frame_frequency-i),
+ loop_decay_rate,
+ decay_accumulator ) )
+ {
+ break;
+ }
+
+
// Step on to the next frame
cpi->frames_to_key ++;
@@ -2437,6 +2497,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
break;
} else
cpi->frames_to_key ++;
+
+ i++;
}
// If there is a max kf interval set by the user we must obey it.
@@ -2588,32 +2650,8 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
if (r > RMAX)
r = RMAX;
- // Adjust loop decay rate
- //if ( next_frame.pcnt_inter < loop_decay_rate )
- loop_decay_rate = next_frame.pcnt_inter;
-
- // High % motion -> somewhat higher decay rate
- motion_pct = next_frame.pcnt_motion;
- motion_decay = (1.0 - (motion_pct / 20.0));
- if (motion_decay < loop_decay_rate)
- loop_decay_rate = motion_decay;
-
- // Adjustment to decay rate based on speed of motion
- {
- double this_mv_rabs;
- double this_mv_cabs;
- double distance_factor;
-
- this_mv_rabs = fabs(next_frame.mvr_abs * motion_pct);
- this_mv_cabs = fabs(next_frame.mvc_abs * motion_pct);
-
- distance_factor = sqrt((this_mv_rabs * this_mv_rabs) +
- (this_mv_cabs * this_mv_cabs)) / 250.0;
- distance_factor = ((distance_factor > 1.0)
- ? 0.0 : (1.0 - distance_factor));
- if (distance_factor < loop_decay_rate)
- loop_decay_rate = distance_factor;
- }
+ // How fast is prediction quality decaying
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
decay_accumulator = decay_accumulator * loop_decay_rate;
decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 5a4b3c185..0d353c31f 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -99,6 +99,7 @@ typedef struct
double pcnt_inter;
double pcnt_motion;
double pcnt_second_ref;
+ double pcnt_neutral;
double MVr;
double mvr_abs;
double MVc;
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 3db05155c..867ff6a9c 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2033,7 +2033,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
else
cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
}
- else if (vp8_ref_frame_order[mode_index] == SPLITMV)
+ else if (vp8_mode_order[mode_index] == SPLITMV)
cpi->zbin_mode_boost = 0;
else
cpi->zbin_mode_boost = MV_ZBIN_BOOST;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index 6cdc47bc9..5d1a17d44 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -627,6 +627,10 @@ filter_block2d_bil_var_sse2_loop:
filter_block2d_bil_var_sse2_sp_only:
movsxd rdx, dword ptr arg(6) ; yoffset
+
+ cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
+ je filter_block2d_bil_var_sse2_full_pixel
+
shl rdx, 5
lea rdx, [rdx + rcx] ; VFilter
@@ -671,6 +675,35 @@ filter_block2d_bil_sp_only_loop:
jmp filter_block2d_bil_variance
+filter_block2d_bil_var_sse2_full_pixel:
+ mov rsi, arg(0) ;ref_ptr
+ mov rdi, arg(2) ;src_ptr
+ movsxd rcx, dword ptr arg(4) ;Height
+ movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
+ movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
+ pxor xmm0, xmm0 ;
+
+filter_block2d_bil_full_pixel_loop:
+ movq xmm1, QWORD PTR [rsi] ;
+ punpcklbw xmm1, xmm0 ;
+
+ movq xmm2, QWORD PTR [rdi] ;
+ punpcklbw xmm2, xmm0 ;
+
+ psubw xmm1, xmm2 ;
+ paddw xmm6, xmm1 ;
+
+ pmaddwd xmm1, xmm1 ;
+ paddd xmm7, xmm1 ;
+
+ lea rsi, [rsi + rax] ;ref_pixels_per_line
+ lea rdi, [rdi + rbx] ;src_pixels_per_line
+
+ sub rcx, 1 ;
+ jnz filter_block2d_bil_full_pixel_loop ;
+
+ jmp filter_block2d_bil_variance
+
filter_block2d_bil_var_sse2_fp_only:
mov rsi, arg(0) ;ref_ptr
mov rdi, arg(2) ;src_ptr
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
new file mode 100644
index 000000000..b1976328d
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -0,0 +1,348 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift 7
+
+
+;void vp8_filter_block2d_bil_var_ssse3
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int xoffset,
+;    int yoffset,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+;
+; For Height rows of 16 pixels, bilinearly filters the reference block
+; (horizontally when xoffset != 0, vertically when yoffset != 0), subtracts
+; the source block and writes the sum of the differences to *sum and the sum
+; of the squared differences to *sumsquared.
+;
+; xoffset / yoffset select a 16-byte row of vp8_bilinear_filters_ssse3
+; (8 rows are defined, so the usable range is 0..7).
+;
+;Note: The filter coefficient at offset=0 is 128. Since the second register
+;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
+;
+;Note: GET_GOT is given rbx below. In the paths that reference GLOBAL()
+;inside their loops rbx is therefore never reused for a stride; strides are
+;held in r8/r9 on 64-bit builds and re-read from the argument area on 32-bit
+;builds, so GLOBAL() stays valid when it is formed relative to rbx.
+global sym(vp8_filter_block2d_bil_var_ssse3)
+sym(vp8_filter_block2d_bil_var_ssse3):
+    push            rbp
+    mov             rbp,        rsp
+    SHADOW_ARGS_TO_STACK 9
+    SAVE_XMM
+    GET_GOT         rbx
+    push            rsi
+    push            rdi
+    push            rbx
+    ; end prolog
+
+    pxor            xmm6,       xmm6                 ; xmm6 = running sum (8 x int16)
+    pxor            xmm7,       xmm7                 ; xmm7 = running sum of squares (4 x int32)
+
+    lea             rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
+    movsxd          rax,        dword ptr arg(5)     ; xoffset
+
+    cmp             rax,        0                    ; skip first_pass filter if xoffset=0
+    je              filter_block2d_bil_var_ssse3_sp_only
+
+    shl             rax,        4                    ; point to filter coeff with xoffset
+    lea             rax,        [rax + rcx]          ; HFilter
+
+    movsxd          rdx,        dword ptr arg(6)     ; yoffset
+
+    cmp             rdx,        0                    ; skip second_pass filter if yoffset=0
+    je              filter_block2d_bil_var_ssse3_fp_only
+
+    shl             rdx,        4
+    lea             rdx,        [rdx + rcx]          ; VFilter
+
+    mov             rsi,        arg(0)               ;ref_ptr
+    mov             rdi,        arg(2)               ;src_ptr
+    movsxd          rcx,        dword ptr arg(4)     ;Height
+
+    ; Horizontally filter the first reference row; it is kept in xmm0 as
+    ; the "previous row" for the vertical pass.
+    movdqu          xmm0,       XMMWORD PTR [rsi]
+    movdqu          xmm1,       XMMWORD PTR [rsi+1]
+    movdqa          xmm2,       xmm0
+
+    punpcklbw       xmm0,       xmm1
+    punpckhbw       xmm2,       xmm1
+    pmaddubsw       xmm0,       [rax]
+    pmaddubsw       xmm2,       [rax]
+
+    paddw           xmm0,       [GLOBAL(xmm_bi_rd)]
+    paddw           xmm2,       [GLOBAL(xmm_bi_rd)]
+    psraw           xmm0,       xmm_filter_shift
+    psraw           xmm2,       xmm_filter_shift
+
+    packuswb        xmm0,       xmm2
+
+%if ABI_IS_32BIT
+    add             rsi,        dword ptr arg(1)     ;ref_pixels_per_line
+%else
+    movsxd          r8,         dword ptr arg(1)     ;ref_pixels_per_line
+    movsxd          r9,         dword ptr arg(3)     ;src_pixels_per_line
+    lea             rsi,        [rsi + r8]
+%endif
+
+filter_block2d_bil_var_ssse3_loop:
+    ; horizontal pass on the current reference row
+    movdqu          xmm1,       XMMWORD PTR [rsi]
+    movdqu          xmm2,       XMMWORD PTR [rsi+1]
+    movdqa          xmm3,       xmm1
+
+    punpcklbw       xmm1,       xmm2
+    punpckhbw       xmm3,       xmm2
+    pmaddubsw       xmm1,       [rax]
+    pmaddubsw       xmm3,       [rax]
+
+    paddw           xmm1,       [GLOBAL(xmm_bi_rd)]
+    paddw           xmm3,       [GLOBAL(xmm_bi_rd)]
+    psraw           xmm1,       xmm_filter_shift
+    psraw           xmm3,       xmm_filter_shift
+    packuswb        xmm1,       xmm3
+
+    ; vertical pass between the previous (xmm0) and current (xmm1) rows
+    movdqa          xmm2,       xmm0
+    movdqa          xmm0,       xmm1
+    movdqa          xmm3,       xmm2
+
+    punpcklbw       xmm2,       xmm1
+    punpckhbw       xmm3,       xmm1
+    pmaddubsw       xmm2,       [rdx]
+    pmaddubsw       xmm3,       [rdx]
+
+    paddw           xmm2,       [GLOBAL(xmm_bi_rd)]
+    paddw           xmm3,       [GLOBAL(xmm_bi_rd)]
+    psraw           xmm2,       xmm_filter_shift
+    psraw           xmm3,       xmm_filter_shift
+
+    ; widen 16 source pixels to words
+    movq            xmm1,       QWORD PTR [rdi]
+    pxor            xmm4,       xmm4
+    punpcklbw       xmm1,       xmm4
+    movq            xmm5,       QWORD PTR [rdi+8]
+    punpcklbw       xmm5,       xmm4
+
+    ; accumulate difference and squared difference
+    psubw           xmm2,       xmm1
+    psubw           xmm3,       xmm5
+    paddw           xmm6,       xmm2
+    paddw           xmm6,       xmm3
+    pmaddwd         xmm2,       xmm2
+    pmaddwd         xmm3,       xmm3
+    paddd           xmm7,       xmm2
+    paddd           xmm7,       xmm3
+
+%if ABI_IS_32BIT
+    add             rsi,        dword ptr arg(1)     ;ref_pixels_per_line
+    add             rdi,        dword ptr arg(3)     ;src_pixels_per_line
+%else
+    lea             rsi,        [rsi + r8]
+    lea             rdi,        [rdi + r9]
+%endif
+
+    sub             rcx,        1
+    jnz             filter_block2d_bil_var_ssse3_loop
+
+    jmp             filter_block2d_bil_variance
+
+; xoffset == 0: vertical (second pass) filter only.
+filter_block2d_bil_var_ssse3_sp_only:
+    movsxd          rdx,        dword ptr arg(6)     ; yoffset
+
+    cmp             rdx,        0                    ; Both xoffset =0 and yoffset=0
+    je              filter_block2d_bil_var_ssse3_full_pixel
+
+    shl             rdx,        4
+    lea             rdx,        [rdx + rcx]          ; VFilter
+
+    mov             rsi,        arg(0)               ;ref_ptr
+    mov             rdi,        arg(2)               ;src_ptr
+    movsxd          rcx,        dword ptr arg(4)     ;Height
+    movsxd          rax,        dword ptr arg(1)     ;ref_pixels_per_line
+
+    movdqu          xmm1,       XMMWORD PTR [rsi]
+    movdqa          xmm0,       xmm1
+
+%if ABI_IS_32BIT=0
+    movsxd          r9,         dword ptr arg(3)     ;src_pixels_per_line
+%endif
+    lea             rsi,        [rsi + rax]
+
+filter_block2d_bil_sp_only_loop:
+    movdqu          xmm3,       XMMWORD PTR [rsi]
+    movdqa          xmm2,       xmm1
+    movdqa          xmm0,       xmm3                 ; keep current row for the next iteration
+
+    punpcklbw       xmm1,       xmm3
+    punpckhbw       xmm2,       xmm3
+    pmaddubsw       xmm1,       [rdx]
+    pmaddubsw       xmm2,       [rdx]
+
+    paddw           xmm1,       [GLOBAL(xmm_bi_rd)]
+    paddw           xmm2,       [GLOBAL(xmm_bi_rd)]
+    psraw           xmm1,       xmm_filter_shift
+    psraw           xmm2,       xmm_filter_shift
+
+    movq            xmm3,       QWORD PTR [rdi]
+    pxor            xmm4,       xmm4
+    punpcklbw       xmm3,       xmm4
+    movq            xmm5,       QWORD PTR [rdi+8]
+    punpcklbw       xmm5,       xmm4
+
+    psubw           xmm1,       xmm3
+    psubw           xmm2,       xmm5
+    paddw           xmm6,       xmm1
+    paddw           xmm6,       xmm2
+    pmaddwd         xmm1,       xmm1
+    pmaddwd         xmm2,       xmm2
+    paddd           xmm7,       xmm1
+    paddd           xmm7,       xmm2
+
+    movdqa          xmm1,       xmm0
+    lea             rsi,        [rsi + rax]          ;ref_pixels_per_line
+%if ABI_IS_32BIT
+    add             rdi,        dword ptr arg(3)     ;src_pixels_per_line
+%else
+    lea             rdi,        [rdi + r9]
+%endif
+
+    sub             rcx,        1
+    jnz             filter_block2d_bil_sp_only_loop
+
+    jmp             filter_block2d_bil_variance
+
+; xoffset == 0 and yoffset == 0: no filtering. GLOBAL() is not referenced
+; from here on, so rbx may be used for a stride; the epilog restores it.
+filter_block2d_bil_var_ssse3_full_pixel:
+    mov             rsi,        arg(0)               ;ref_ptr
+    mov             rdi,        arg(2)               ;src_ptr
+    movsxd          rcx,        dword ptr arg(4)     ;Height
+    movsxd          rax,        dword ptr arg(1)     ;ref_pixels_per_line
+    movsxd          rbx,        dword ptr arg(3)     ;src_pixels_per_line
+    pxor            xmm0,       xmm0
+
+filter_block2d_bil_full_pixel_loop:
+    movq            xmm1,       QWORD PTR [rsi]
+    punpcklbw       xmm1,       xmm0
+    movq            xmm2,       QWORD PTR [rsi+8]
+    punpcklbw       xmm2,       xmm0
+
+    movq            xmm3,       QWORD PTR [rdi]
+    punpcklbw       xmm3,       xmm0
+    movq            xmm4,       QWORD PTR [rdi+8]
+    punpcklbw       xmm4,       xmm0
+
+    psubw           xmm1,       xmm3
+    psubw           xmm2,       xmm4
+    paddw           xmm6,       xmm1
+    paddw           xmm6,       xmm2
+    pmaddwd         xmm1,       xmm1
+    pmaddwd         xmm2,       xmm2
+    paddd           xmm7,       xmm1
+    paddd           xmm7,       xmm2
+
+    lea             rsi,        [rsi + rax]          ;ref_pixels_per_line
+    lea             rdi,        [rdi + rbx]          ;src_pixels_per_line
+    sub             rcx,        1
+    jnz             filter_block2d_bil_full_pixel_loop
+
+    jmp             filter_block2d_bil_variance
+
+; yoffset == 0: horizontal (first pass) filter only. rax still holds HFilter.
+filter_block2d_bil_var_ssse3_fp_only:
+    mov             rsi,        arg(0)               ;ref_ptr
+    mov             rdi,        arg(2)               ;src_ptr
+    movsxd          rcx,        dword ptr arg(4)     ;Height
+    movsxd          rdx,        dword ptr arg(1)     ;ref_pixels_per_line
+
+    pxor            xmm0,       xmm0
+%if ABI_IS_32BIT=0
+    movsxd          r9,         dword ptr arg(3)     ;src_pixels_per_line
+%endif
+
+filter_block2d_bil_fp_only_loop:
+    movdqu          xmm1,       XMMWORD PTR [rsi]
+    movdqu          xmm2,       XMMWORD PTR [rsi+1]
+    movdqa          xmm3,       xmm1
+
+    punpcklbw       xmm1,       xmm2
+    punpckhbw       xmm3,       xmm2
+    pmaddubsw       xmm1,       [rax]
+    pmaddubsw       xmm3,       [rax]
+
+    paddw           xmm1,       [GLOBAL(xmm_bi_rd)]
+    paddw           xmm3,       [GLOBAL(xmm_bi_rd)]
+    psraw           xmm1,       xmm_filter_shift
+    psraw           xmm3,       xmm_filter_shift
+
+    movq            xmm2,       QWORD PTR [rdi]      ; movq loads 8 bytes
+    pxor            xmm4,       xmm4
+    punpcklbw       xmm2,       xmm4
+    movq            xmm5,       QWORD PTR [rdi+8]
+    punpcklbw       xmm5,       xmm4
+
+    psubw           xmm1,       xmm2
+    psubw           xmm3,       xmm5
+    paddw           xmm6,       xmm1
+    paddw           xmm6,       xmm3
+    pmaddwd         xmm1,       xmm1
+    pmaddwd         xmm3,       xmm3
+    paddd           xmm7,       xmm1
+    paddd           xmm7,       xmm3
+
+    lea             rsi,        [rsi + rdx]          ;ref_pixels_per_line
+%if ABI_IS_32BIT
+    add             rdi,        dword ptr arg(3)     ;src_pixels_per_line
+%else
+    lea             rdi,        [rdi + r9]
+%endif
+
+    sub             rcx,        1
+    jnz             filter_block2d_bil_fp_only_loop
+
+    jmp             filter_block2d_bil_variance
+
+; Common tail: reduce xmm6 (word sums) and xmm7 (dword sums of squares)
+; horizontally and store the two scalars.
+filter_block2d_bil_variance:
+    pxor            xmm0,       xmm0
+    pxor            xmm1,       xmm1
+    pxor            xmm5,       xmm5
+
+    ; sign-extend the eight 16-bit sums to 32 bits and add the two halves
+    punpcklwd       xmm0,       xmm6
+    punpckhwd       xmm1,       xmm6
+    psrad           xmm0,       16
+    psrad           xmm1,       16
+    paddd           xmm0,       xmm1
+    movdqa          xmm1,       xmm0
+
+    movdqa          xmm6,       xmm7
+    punpckldq       xmm6,       xmm5
+    punpckhdq       xmm7,       xmm5
+    paddd           xmm6,       xmm7
+
+    punpckldq       xmm0,       xmm5
+    punpckhdq       xmm1,       xmm5
+    paddd           xmm0,       xmm1
+
+    movdqa          xmm7,       xmm6
+    movdqa          xmm1,       xmm0
+
+    psrldq          xmm7,       8
+    psrldq          xmm1,       8
+
+    paddd           xmm6,       xmm7
+    paddd           xmm0,       xmm1
+
+    mov             rsi,        arg(7)               ;[Sum]
+    mov             rdi,        arg(8)               ;[SSE]
+
+    movd            [rsi],      xmm0
+    movd            [rdi],      xmm6
+
+    ; begin epilog
+    pop             rbx
+    pop             rdi
+    pop             rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop             rbp
+    ret
+
+
+SECTION_RODATA
+align 16
+xmm_bi_rd:
+ times 8 dw 64
+align 16
+vp8_bilinear_filters_ssse3:
+ times 8 db 128, 0
+ times 8 db 112, 16
+ times 8 db 96, 32
+ times 8 db 80, 48
+ times 8 db 64, 64
+ times 8 db 48, 80
+ times 8 db 32, 96
+ times 8 db 16, 112
diff --git a/vp8/encoder/x86/variance_ssse3.c b/vp8/encoder/x86/variance_ssse3.c
new file mode 100644
index 000000000..750ae8b86
--- /dev/null
+++ b/vp8/encoder/x86/variance_ssse3.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vp8/encoder/variance.h"
+#include "vp8/common/pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern unsigned int vp8_get16x16var_sse2
+(
+ const unsigned char *src_ptr,
+ int source_stride,
+ const unsigned char *ref_ptr,
+ int recon_stride,
+ unsigned int *SSE,
+ int *Sum
+);
+extern void vp8_half_horiz_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_horiz_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_half_vert_variance16x_h_sse2
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int *sum,
+ unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_ssse3
+(
+ const unsigned char *ref_ptr,
+ int ref_pixels_per_line,
+ const unsigned char *src_ptr,
+ int src_pixels_per_line,
+ unsigned int Height,
+ int xoffset,
+ int yoffset,
+ int *sum,
+ unsigned int *sumsquared
+);
+
+/*
+ * 16x16 sub-pixel variance, SSSE3 build.
+ *
+ * src_ptr / src_pixels_per_line : block that is interpolated at
+ *                                 (xoffset, yoffset)
+ * dst_ptr / dst_pixels_per_line : block it is compared against
+ * sse                           : receives the sum of squared differences
+ *
+ * The offset pairs (4,0), (0,4) and (4,4) are routed to the SSE2 half-pixel
+ * helpers, each called twice (column offsets 0 and 8); every other pair goes
+ * to vp8_filter_block2d_bil_var_ssse3.
+ *
+ * Returns sse - (sum * sum) / 256.
+ */
+unsigned int vp8_sub_pixel_variance16x16_ssse3
+(
+    const unsigned char *src_ptr,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    const unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    // note we could avoid these if statements if the calling function
+    // just called the appropriate functions inside.
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            &xsum1, &xxsum1);
+
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            &xsum1, &xxsum1);
+
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            &xsum1, &xxsum1);
+
+        xsum0 += xsum1;
+        xxsum0 += xxsum1;
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_ssse3(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            xoffset, yoffset,
+            &xsum0, &xxsum0);
+    }
+
+    *sse = xxsum0;
+    // |xsum0| can reach 256 * 255 = 65280, whose square does not fit in a
+    // signed int; multiply as unsigned so the square (always < 2^32) is exact.
+    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 6bea15ebc..1e2fb3490 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -286,6 +286,7 @@ extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
#if HAVE_SSSE3
extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_ssse3);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_variance_sad16x16x3
@@ -294,6 +295,9 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
#undef vp8_variance_sad16x8x3
#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
+#undef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_ssse3
+
#endif
#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 61c603229..c7639a7e4 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -334,6 +334,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;
+ cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;
+
cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
}