diff options
author | Parag Salasakar <img.mips1@gmail.com> | 2015-06-13 10:18:47 +0530 |
---|---|---|
committer | Parag Salasakar <img.mips1@gmail.com> | 2015-06-16 12:49:34 +0530 |
commit | 89b4b315aa3b2ff90766672272af69505cd13f0f (patch) | |
tree | f369d63522032f4f025fed86dbaa5a4e9883cbb3 /vp9/common | |
parent | a4bb5f2a29fc925f0fd033490c1c8ecb54e502c3 (diff) | |
download | libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.tar libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.tar.gz libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.tar.bz2 libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.zip |
mips msa vp9 fdct 16x16 optimization
average improvement ~4x-6x
Change-Id: Id3b2243e5b3c7844c90c4231a5e75fa69911362c
Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/mips/msa/vp9_macros_msa.h | 86 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 6 |
2 files changed, 82 insertions, 10 deletions
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h index 2043e13b3..2f2390bb2 100644 --- a/vp9/common/mips/msa/vp9_macros_msa.h +++ b/vp9/common/mips/msa/vp9_macros_msa.h @@ -380,6 +380,17 @@ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ } +/* Description : Load 2 vectors of signed word elements with stride + Arguments : Inputs - psrc (source pointer to load from) + - stride + Outputs - out0, out1 + Return Type - signed word +*/ +#define LD_SW2(psrc, stride, out0, out1) { \ + out0 = LD_SW((psrc)); \ + out1 = LD_SW((psrc) + stride); \ +} + /* Description : Store vectors of 16 byte elements with stride Arguments : Inputs - in0, in1, stride Outputs - pdst (destination pointer to store to) @@ -777,6 +788,24 @@ CLIP_SH2_0_255(in2, in3); \ } +/* Description : Addition of 4 signed word elements + 4 signed word elements of input vector are added together and + the resulting integer sum is returned + Arguments : Inputs - in (signed word vector) + Outputs - sum_m (i32 sum) + Return Type - signed word +*/ +#define HADD_SW_S32(in) ({ \ + v2i64 res0_m, res1_m; \ + int32_t sum_m; \ + \ + res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + res1_m = __msa_splati_d(res0_m, 1); \ + res0_m = res0_m + res1_m; \ + sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ + sum_m; \ +}) + /* Description : Horizontal addition of unsigned byte vector elements Arguments : Inputs - in0, in1 Outputs - out0, out1 @@ -1073,8 +1102,8 @@ Outputs - in0, in1, in2, in3 (in place) Return Type - unsigned halfword Details : Each unsigned halfword element from 'in0' is saturated to the - value generated with (sat_val+1) bit range - Results are in placed to original vectors + value generated with (sat_val+1) bit range. + The results are in placed to original vectors */ #define SAT_UH2(RTYPE, in0, in1, sat_val) { \ in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ @@ -1096,7 +1125,7 @@ Return Type - unsigned halfword Details : Each unsigned halfword element from 'in0' is saturated to the value generated with (sat_val+1) bit range - Results are in placed to original vectors + The results are in placed to original vectors */ #define SAT_SH2(RTYPE, in0, in1, sat_val) { \ in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ @@ -1216,10 +1245,10 @@ Outputs - in0, in1 (in-place) Return Type - as per RTYPE Details : Each unsigned byte element from input vector 'in0' is - logically xor'ed with 128 and result is in-place stored in + logically xor'ed with 128 and the result is in-place stored in 'in0' vector Each unsigned byte element from input vector 'in1' is - logically xor'ed with 128 and result is in-place stored in + logically xor'ed with 128 and the result is in-place stored in 'in1' vector Similar for other pairs */ @@ -1271,13 +1300,28 @@ } #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__) +/* Description : Shift left all elements of vector (generic for all data types) + Arguments : Inputs - in0, in1, in2, in3, shift + Outputs - in0, in1, in2, in3 (in place) + Return Type - as per input vector RTYPE + Details : Each element of vector 'in0' is left shifted by 'shift' and + the result is in place written to 'in0' + Similar for other pairs +*/ +#define SLLI_4V(in0, in1, in2, in3, shift) { \ + in0 = in0 << shift; \ + in1 = in1 << shift; \ + in2 = in2 << shift; \ + in3 = in3 << shift; \ +} + /* Description : Arithmetic shift right all elements of vector (generic for all data types) Arguments : Inputs - in0, in1, in2, in3, shift Outputs - in0, in1, in2, in3 (in place) Return Type - as per input vector RTYPE Details : Each element of vector 'in0' is right shifted by 'shift' and - result is in place written to 'in0' + the result is in place written to 'in0' Here, 'shift' is GP variable passed in Similar for other pairs */ @@ -1427,6 +1471,34 @@ out7 = in0 - in7; \ } +/* Description : Butterfly of 16 input vectors + Arguments : Inputs - in0 ... in15 + Outputs - out0 .. out15 + Details : Butterfly operation +*/ +#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \ + in8, in9, in10, in11, in12, in13, in14, in15, \ + out0, out1, out2, out3, out4, out5, out6, out7, \ + out8, out9, out10, out11, out12, out13, out14, out15) { \ + out0 = in0 + in15; \ + out1 = in1 + in14; \ + out2 = in2 + in13; \ + out3 = in3 + in12; \ + out4 = in4 + in11; \ + out5 = in5 + in10; \ + out6 = in6 + in9; \ + out7 = in7 + in8; \ + \ + out8 = in7 - in8; \ + out9 = in6 - in9; \ + out10 = in5 - in10; \ + out11 = in4 - in11; \ + out12 = in3 - in12; \ + out13 = in2 - in13; \ + out14 = in1 - in14; \ + out15 = in0 - in15; \ +} + /* Description : Transposes input 8x8 byte block Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 (input 8x8 byte block) @@ -1640,7 +1712,7 @@ Outputs - out_m Return Type - unsigned byte Details : Signed byte even elements from 'in0' and 'in1' are packed - together in one vector and the resulted vector is xor'ed with + together in one vector and the resulting vector is xor'ed with 128 to shift the range from signed to unsigned byte */ #define PCKEV_XORI128_UB(in0, in1) ({ \ diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 2f262a6f1..ba76eef07 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1029,7 +1029,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_fht8x8 sse2/; add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht16x16 sse2/; + specialize qw/vp9_fht16x16 sse2 msa/; add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fwht4x4/, "$mmx_x86inc"; @@ -1047,10 +1047,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64"; add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct16x16_1 sse2/; + specialize qw/vp9_fdct16x16_1 sse2 msa/; add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fdct16x16 sse2/; + specialize qw/vp9_fdct16x16 sse2 msa/; add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_fdct32x32_1 sse2/; |