mips msa vp9 fdct 16x16 optimization

Average improvement ~4x-6x.

Change-Id: Id3b2243e5b3c7844c90c4231a5e75fa69911362c
author: Parag Salasakar <img.mips1@gmail.com> 2015-06-13 10:18:47 +0530
committer: Parag Salasakar <img.mips1@gmail.com> 2015-06-16 12:49:34 +0530
commit: 89b4b315aa3b2ff90766672272af69505cd13f0f (patch)
tree: f369d63522032f4f025fed86dbaa5a4e9883cbb3 /vp9/common
parent: a4bb5f2a29fc925f0fd033490c1c8ecb54e502c3 (diff)
download: libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.tar
libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.tar.gz
libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.tar.bz2
libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.zip
2 files changed, 82 insertions, 10 deletions
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h
index 2043e13b3..2f2390bb2 100644
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ b/vp9/common/mips/msa/vp9_macros_msa.h
@@ -380,6 +380,17 @@
   out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2);  \
 }
 
+/* Description : Load 2 vectors of signed word elements with stride
+   Arguments   : Inputs  - psrc    (source pointer to load from)
+                         - stride  (offset added to psrc for the 2nd load)
+                 Outputs - out0, out1
+                 Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) {  \
+  out0 = LD_SW((psrc));                     \
+  out1 = LD_SW((psrc) + (stride));          \
+}
+
 /* Description : Store vectors of 16 byte elements with stride
    Arguments   : Inputs  - in0, in1, stride
                  Outputs - pdst    (destination pointer to store to)
@@ -777,6 +788,24 @@
   CLIP_SH2_0_255(in2, in3);                   \
 }
 
+/* Description : Addition of 4 signed word elements
+                 4 signed word elements of input vector are added together and
+                 the resulting integer sum is returned
+   Arguments   : Inputs  - in       (signed word vector, expanded twice)
+                 Outputs - sum_m    (i32 sum)
+                 Return Type - signed word
+*/
+#define HADD_SW_S32(in) ({                            \
+  v2i64 res0_m, res1_m;                               \
+  int32_t sum_m;                                      \
+                                                      \
+  res0_m = __msa_hadd_s_d((v4i32)(in), (v4i32)(in));  \
+  res1_m = __msa_splati_d(res0_m, 1);                 \
+  res0_m = res0_m + res1_m;                           \
+  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);           \
+  sum_m;                                              \
+})
+
 /* Description : Horizontal addition of unsigned byte vector elements
    Arguments   : Inputs  - in0, in1
                  Outputs - out0, out1
@@ -1073,8 +1102,8 @@
                  Outputs - in0, in1, in2, in3 (in place)
                  Return Type - unsigned halfword
    Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val+1) bit range
-                 Results are in placed to original vectors
+                 value generated with (sat_val+1) bit range.
+                 The results are written in place to the original vectors
 */
 #define SAT_UH2(RTYPE, in0, in1, sat_val) {         \
   in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
@@ -1096,7 +1125,7 @@
                  Return Type - unsigned halfword
    Details     : Each unsigned halfword element from 'in0' is saturated to the
                  value generated with (sat_val+1) bit range
-                 Results are in placed to original vectors
+                 The results are written in place to the original vectors
 */
 #define SAT_SH2(RTYPE, in0, in1, sat_val) {         \
   in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
@@ -1216,10 +1245,10 @@
                  Outputs - in0, in1 (in-place)
                  Return Type - as per RTYPE
    Details     : Each unsigned byte element from input vector 'in0' is
-                 logically xor'ed with 128 and result is in-place stored in
+                 logically xor'ed with 128 and the result is in-place stored in
                  'in0' vector
                  Each unsigned byte element from input vector 'in1' is
-                 logically xor'ed with 128 and result is in-place stored in
+                 logically xor'ed with 128 and the result is in-place stored in
                  'in1' vector
                  Similar for other pairs
 */
@@ -1271,13 +1300,28 @@
 }
 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
 
+/* Description : Shift left all elements of vector (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in0, in1, in2, in3 (in place)
+                 Return Type - same as the input vector type
+   Details     : Each element of vector 'in0' is left shifted by 'shift' and
+                 the result is written in place to 'in0'
+                 Similar for the other vectors
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) {  \
+  in0 = (in0) << (shift);                     \
+  in1 = (in1) << (shift);                     \
+  in2 = (in2) << (shift);                     \
+  in3 = (in3) << (shift);                     \
+}
+
 /* Description : Arithmetic shift right all elements of vector
                  (generic for all data types)
    Arguments   : Inputs  - in0, in1, in2, in3, shift
                  Outputs - in0, in1, in2, in3 (in place)
                  Return Type - as per input vector RTYPE
    Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 result is in place written to 'in0'
+                 the result is written in place to 'in0'
                  Here, 'shift' is GP variable passed in
                  Similar for other pairs
 */
@@ -1427,6 +1471,34 @@
   out7 = in0 - in7;                                                    \
 }
 
+/* Description : Butterfly of 16 input vectors
+   Arguments   : Inputs  - in0 ...  in15
+                 Outputs - out0 .. out15
+   Details     : out[i] = in[i] + in[15-i]; out[15-i] = in[i] - in[15-i]; i=0..7
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                  \
+                     in8, in9,  in10, in11, in12, in13, in14, in15,           \
+                     out0, out1, out2, out3, out4, out5, out6, out7,          \
+                     out8, out9, out10, out11, out12, out13, out14, out15) {  \
+  out0 = (in0) + (in15);                                                      \
+  out1 = (in1) + (in14);                                                      \
+  out2 = (in2) + (in13);                                                      \
+  out3 = (in3) + (in12);                                                      \
+  out4 = (in4) + (in11);                                                      \
+  out5 = (in5) + (in10);                                                      \
+  out6 = (in6) + (in9);                                                       \
+  out7 = (in7) + (in8);                                                       \
+                                                                              \
+  out8 = (in7) - (in8);                                                       \
+  out9 = (in6) - (in9);                                                       \
+  out10 = (in5) - (in10);                                                     \
+  out11 = (in4) - (in11);                                                     \
+  out12 = (in3) - (in12);                                                     \
+  out13 = (in2) - (in13);                                                     \
+  out14 = (in1) - (in14);                                                     \
+  out15 = (in0) - (in15);                                                     \
+}
+
 /* Description : Transposes input 8x8 byte block
    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                            (input 8x8 byte block)
@@ -1640,7 +1712,7 @@
                  Outputs - out_m
                  Return Type - unsigned byte
    Details     : Signed byte even elements from 'in0' and 'in1' are packed
-                 together in one vector and the resulted vector is xor'ed with
+                 together in one vector and the resulting vector is xor'ed with
                  128 to shift the range from signed to unsigned byte
 */
 #define PCKEV_XORI128_UB(in0, in1) ({                    \
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 2f262a6f1..ba76eef07 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1029,7 +1029,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_fht8x8 sse2/;
 
   add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp9_fht16x16 sse2/;
+  specialize qw/vp9_fht16x16 sse2 msa/;
 
   add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
@@ -1047,10 +1047,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
 
   add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct16x16_1 sse2/;
+  specialize qw/vp9_fdct16x16_1 sse2 msa/;
 
   add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vp9_fdct16x16 sse2/;
+  specialize qw/vp9_fdct16x16 sse2 msa/;
 
   add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp9_fdct32x32_1 sse2/;
author	Parag Salasakar <img.mips1@gmail.com>	2015-06-13 10:18:47 +0530
committer	Parag Salasakar <img.mips1@gmail.com>	2015-06-16 12:49:34 +0530
commit	89b4b315aa3b2ff90766672272af69505cd13f0f (patch)
tree	f369d63522032f4f025fed86dbaa5a4e9883cbb3 /vp9/common
parent	a4bb5f2a29fc925f0fd033490c1c8ecb54e502c3 (diff)
download	libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.tar libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.tar.gz libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.tar.bz2 libvpx-89b4b315aa3b2ff90766672272af69505cd13f0f.zip