diff options
-rw-r--r-- | test/vp9_intrapred_test.cc | 12 | 6 insertions, 6 deletions
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 | 1 insertion, 1 deletion
-rw-r--r-- | vpx_dsp/x86/highbd_intrapred_sse2.asm | 53 | 27 insertions, 26 deletions
3 files changed, 34 insertions, 33 deletions
diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 70f51cdc6..2bebdcbd9 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -155,7 +155,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, &vpx_highbd_v_predictor_16x16_c, 16, 8), make_tuple(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 8), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, + make_tuple(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, &vpx_highbd_tm_predictor_8x8_c, 8, 8))); @@ -176,7 +176,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, &vpx_highbd_v_predictor_16x16_c, 16, 8), make_tuple(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 8), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, + make_tuple(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 8), make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, &vpx_highbd_tm_predictor_8x8_c, 8, 8))); @@ -211,7 +211,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, make_tuple(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 10), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, + make_tuple(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, &vpx_highbd_tm_predictor_8x8_c, 8, 10))); @@ -233,7 +233,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, &vpx_highbd_v_predictor_16x16_c, 16, 10), make_tuple(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 10), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, + make_tuple(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 10), make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, &vpx_highbd_tm_predictor_8x8_c, 8, 10))); @@ -268,7 +268,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, make_tuple(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 
12), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, + make_tuple(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, &vpx_highbd_tm_predictor_8x8_c, 8, 12))); @@ -290,7 +290,7 @@ INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, &vpx_highbd_v_predictor_16x16_c, 16, 12), make_tuple(&vpx_highbd_v_predictor_32x32_sse2, &vpx_highbd_v_predictor_32x32_c, 32, 12), - make_tuple(&vpx_highbd_tm_predictor_4x4_sse, + make_tuple(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 12), make_tuple(&vpx_highbd_tm_predictor_8x8_sse2, &vpx_highbd_tm_predictor_8x8_c, 8, 12))); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 23065df79..a2a067457 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -291,7 +291,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_v_predictor_4x4/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse_x86inc"; + specialize qw/vpx_highbd_tm_predictor_4x4/, "$sse2_x86inc"; add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_dc_predictor_4x4/, "$sse2_x86inc"; diff --git a/vpx_dsp/x86/highbd_intrapred_sse2.asm b/vpx_dsp/x86/highbd_intrapred_sse2.asm index c8ed613b3..233958a52 100644 --- a/vpx_dsp/x86/highbd_intrapred_sse2.asm +++ b/vpx_dsp/x86/highbd_intrapred_sse2.asm @@ -257,43 +257,44 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above jnz .loop REP_RET -INIT_MMX sse -cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one +INIT_XMM sse2 +cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps movd m1, [aboveq-2] movq m0, [aboveq] - pshufw m1, m1, 0x0 + pshuflw m1, m1, 
0x0 + movlhps m0, m0 ; t1 t2 t3 t4 t1 t2 t3 t4 + movlhps m1, m1 ; tl tl tl tl tl tl tl tl ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - movd m3, oned + pcmpeqw m3, m3 movd m4, bpsd - pshufw m3, m3, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -2 - mova m2, m3 + psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl psllw m3, m4 - add leftq, 8 - psubw m3, m2 ; max possible value - pxor m4, m4 ; min possible value - psubw m0, m1 -.loop: - movq m1, [leftq+lineq*4] - movq m2, [leftq+lineq*4+2] - pshufw m1, m1, 0x0 - pshufw m2, m2, 0x0 - paddw m1, m0 + pcmpeqw m2, m2 + pxor m4, m4 ; min possible value + pxor m3, m2 ; max possible value + mova m1, [leftq] + pshuflw m2, m1, 0x0 + pshuflw m5, m1, 0x55 + movlhps m2, m5 ; l1 l1 l1 l1 l2 l2 l2 l2 paddw m2, m0 ;Clamp to the bit-depth - pminsw m1, m3 pminsw m2, m3 - pmaxsw m1, m4 pmaxsw m2, m4 ;Store the values - movq [dstq ], m1 - movq [dstq+strideq*2], m2 + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 lea dstq, [dstq+strideq*4] - inc lineq - jnz .loop - REP_RET + pshuflw m2, m1, 0xaa + pshuflw m5, m1, 0xff + movlhps m2, m5 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m2, m3 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m2 + movhpd [dstq+strideq*2], m2 + RET INIT_XMM sse2 cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one |