diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/mips/msa/vp9_macros_msa.h | 549 | ||||
-rw-r--r-- | vp9/common/vp9_blockd.h | 4 | ||||
-rw-r--r-- | vp9/common/vp9_common_data.c | 18 | ||||
-rw-r--r-- | vp9/common/vp9_common_data.h | 19 | ||||
-rw-r--r-- | vp9/common/vp9_mvref_common.c | 8 | ||||
-rw-r--r-- | vp9/common/vp9_mvref_common.h | 2 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 2 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodeframe.c | 38 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodemv.c | 12 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodemv.h | 3 | ||||
-rw-r--r-- | vp9/encoder/mips/msa/vp9_temporal_filter_msa.c | 289 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 6 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 8 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.h | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 16 | ||||
-rw-r--r-- | vp9/vp9cx.mk | 1 |
17 files changed, 581 insertions, 400 deletions
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h index 43850758c..e008eafe9 100644 --- a/vp9/common/mips/msa/vp9_macros_msa.h +++ b/vp9/common/mips/msa/vp9_macros_msa.h @@ -229,13 +229,12 @@ #endif // (__mips_isa_rev >= 6) /* Description : Load 4 words with stride - Arguments : Inputs - psrc (source pointer to load from) - - stride + Arguments : Inputs - psrc, stride Outputs - out0, out1, out2, out3 - Details : Loads word in 'out0' from (psrc) - Loads word in 'out1' from (psrc + stride) - Loads word in 'out2' from (psrc + 2 * stride) - Loads word in 'out3' from (psrc + 3 * stride) + Details : Load word in 'out0' from (psrc) + Load word in 'out1' from (psrc + stride) + Load word in 'out2' from (psrc + 2 * stride) + Load word in 'out3' from (psrc + 3 * stride) */ #define LW4(psrc, stride, out0, out1, out2, out3) { \ out0 = LW((psrc)); \ @@ -245,11 +244,10 @@ } /* Description : Load double words with stride - Arguments : Inputs - psrc (source pointer to load from) - - stride + Arguments : Inputs - psrc, stride Outputs - out0, out1 - Details : Loads double word in 'out0' from (psrc) - Loads double word in 'out1' from (psrc + stride) + Details : Load double word in 'out0' from (psrc) + Load double word in 'out1' from (psrc + stride) */ #define LD2(psrc, stride, out0, out1) { \ out0 = LD((psrc)); \ @@ -261,11 +259,11 @@ } /* Description : Store 4 words with stride - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Stores word from 'in0' to (pdst) - Stores word from 'in1' to (pdst + stride) - Stores word from 'in2' to (pdst + 2 * stride) - Stores word from 'in3' to (pdst + 3 * stride) + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store word from 'in0' to (pdst) + Store word from 'in1' to (pdst + stride) + Store word from 'in2' to (pdst + 2 * stride) + Store word from 'in3' to (pdst + 3 * stride) */ #define SW4(in0, in1, in2, in3, pdst, stride) { \ SW(in0, (pdst)) \ @@ -275,11 +273,11 @@ } /* 
Description : Store 4 double words with stride - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Details : Stores double word from 'in0' to (pdst) - Stores double word from 'in1' to (pdst + stride) - Stores double word from 'in2' to (pdst + 2 * stride) - Stores double word from 'in3' to (pdst + 3 * stride) + Arguments : Inputs - in0, in1, in2, in3, pdst, stride + Details : Store double word from 'in0' to (pdst) + Store double word from 'in1' to (pdst + stride) + Store double word from 'in2' to (pdst + 2 * stride) + Store double word from 'in3' to (pdst + 3 * stride) */ #define SD4(in0, in1, in2, in3, pdst, stride) { \ SD(in0, (pdst)) \ @@ -289,12 +287,11 @@ } /* Description : Load vectors with 16 byte elements with stride - Arguments : Inputs - psrc (source pointer to load from) - - stride + Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - as per RTYPE - Details : Loads 16 byte elements in 'out0' from (psrc) - Loads 16 byte elements in 'out1' from (psrc + stride) + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) */ #define LD_B2(RTYPE, psrc, stride, out0, out1) { \ out0 = LD_B(RTYPE, (psrc)); \ @@ -333,11 +330,10 @@ #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) /* Description : Load vectors with 8 halfword elements with stride - Arguments : Inputs - psrc (source pointer to load from) - - stride + Arguments : Inputs - psrc, stride Outputs - out0, out1 - Details : Loads 8 halfword elements in 'out0' from (psrc) - Loads 8 halfword elements in 'out1' from (psrc + stride) + Details : Load 8 halfword elements in 'out0' from (psrc) + Load 8 halfword elements in 'out1' from (psrc + stride) */ #define LD_H2(RTYPE, psrc, stride, out0, out1) { \ out0 = LD_H(RTYPE, (psrc)); \ @@ -368,9 +364,9 @@ } #define LD_SH16(...) 
LD_H16(v8i16, __VA_ARGS__) -/* Description : Load as 4x4 block of signed halfword elements from 1D source +/* Description : Load 4x4 block of signed halfword elements from 1D source data into 4 vectors (Each vector with 4 signed halfwords) - Arguments : Inputs - psrc + Arguments : Input - psrc Outputs - out0, out1, out2, out3 */ #define LD4x4_SH(psrc, out0, out1, out2, out3) { \ @@ -381,8 +377,7 @@ } /* Description : Load 2 vectors of signed word elements with stride - Arguments : Inputs - psrc (source pointer to load from) - - stride + Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - signed word */ @@ -392,10 +387,9 @@ } /* Description : Store vectors of 16 byte elements with stride - Arguments : Inputs - in0, in1, stride - Outputs - pdst (destination pointer to store to) - Details : Stores 16 byte elements from 'in0' to (pdst) - Stores 16 byte elements from 'in1' to (pdst + stride) + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) */ #define ST_B2(RTYPE, in0, in1, pdst, stride) { \ ST_B(RTYPE, in0, (pdst)); \ @@ -417,10 +411,9 @@ #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) /* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, stride - Outputs - pdst (destination pointer to store to) - Details : Stores 8 halfword elements from 'in0' to (pdst) - Stores 8 halfword elements from 'in1' to (pdst + stride) + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) */ #define ST_H2(RTYPE, in0, in1, pdst, stride) { \ ST_H(RTYPE, in0, (pdst)); \ @@ -441,8 +434,7 @@ #define ST_SH8(...) 
ST_H8(v8i16, __VA_ARGS__) /* Description : Store vectors of word elements with stride - Arguments : Inputs - in0, in1, stride - - pdst (destination pointer to store to) + Arguments : Inputs - in0, in1, pdst, stride Details : Store 4 word elements from 'in0' to (pdst) Store 4 word elements from 'in1' to (pdst + stride) */ @@ -451,17 +443,16 @@ ST_SW(in1, (pdst) + stride); \ } -/* Description : Store as 2x4 byte block to destination memory from input vector - Arguments : Inputs - in, stidx, pdst, stride - Return Type - unsigned byte - Details : Index stidx halfword element from 'in' vector is copied and - stored on first line - Index stidx+1 halfword element from 'in' vector is copied and - stored on second line - Index stidx+2 halfword element from 'in' vector is copied and - stored on third line - Index stidx+3 halfword element from 'in' vector is copied and - stored on fourth line +/* Description : Store 2x4 byte block to destination memory from input vector + Arguments : Inputs - in, stidx, pdst, stride + Details : Index 'stidx' halfword element from 'in' vector is copied to + the GP register and stored to (pdst) + Index 'stidx+1' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + stride) + Index 'stidx+2' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 2 * stride) + Index 'stidx+3' halfword element from 'in' vector is copied to + the GP register and stored to (pdst + 3 * stride) */ #define ST2x4_UB(in, stidx, pdst, stride) { \ uint16_t out0_m, out1_m, out2_m, out3_m; \ @@ -479,10 +470,10 @@ } /* Description : Store 4x2 byte block to destination memory from input vector - Arguments : Inputs - in, pdst, stride - Details : Index 0 word element from 'in' vector is copied to a GP + Arguments : Inputs - in, pdst, stride + Details : Index 0 word element from 'in' vector is copied to the GP register and stored to (pdst) - Index 1 word element from 'in' vector is copied to a GP + Index 1 word 
element from 'in' vector is copied to the GP register and stored to (pdst + stride) */ #define ST4x2_UB(in, pdst, stride) { \ @@ -496,17 +487,16 @@ SW(out1_m, pblk_4x2_m + stride); \ } -/* Description : Store as 4x4 byte block to destination memory from input vector - Arguments : Inputs - in0, in1, pdst, stride - Return Type - unsigned byte - Details : Idx0 word element from input vector 'in0' is copied and stored - on first line - Idx1 word element from input vector 'in0' is copied and stored - on second line - Idx2 word element from input vector 'in1' is copied and stored - on third line - Idx3 word element from input vector 'in1' is copied and stored - on fourth line +/* Description : Store 4x4 byte block to destination memory from input vector + Arguments : Inputs - in0, in1, pdst, stride + Details : 'Idx0' word element from input vector 'in0' is copied to the + GP register and stored to (pdst) + 'Idx1' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + stride) + 'Idx2' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 2 * stride) + 'Idx3' word element from input vector 'in0' is copied to the + GP register and stored to (pdst + 3 * stride) */ #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ uint32_t out0_m, out1_m, out2_m, out3_m; \ @@ -526,10 +516,10 @@ ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \ } -/* Description : Store as 8x1 byte block to destination memory from input vector - Arguments : Inputs - in, pdst - Details : Index 0 double word element from input vector 'in' is copied - and stored to destination memory at (pdst) +/* Description : Store 8x1 byte block to destination memory from input vector + Arguments : Inputs - in, pdst + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) */ #define ST8x1_UB(in, pdst) { \ uint64_t out0_m; \ @@ -538,12 +528,12 @@ SD(out0_m, pdst); \ } -/* 
Description : Store as 8x2 byte block to destination memory from input vector - Arguments : Inputs - in, pdst, stride - Details : Index 0 double word element from input vector 'in' is copied - and stored to destination memory at (pdst) - Index 1 double word element from input vector 'in' is copied - and stored to destination memory at (pdst + stride) +/* Description : Store 8x2 byte block to destination memory from input vector + Arguments : Inputs - in, pdst, stride + Details : Index 0 double word element from 'in' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in' vector is copied to the + GP register and stored to (pdst + stride) */ #define ST8x2_UB(in, pdst, stride) { \ uint64_t out0_m, out1_m; \ @@ -556,17 +546,17 @@ SD(out1_m, pblk_8x2_m + stride); \ } -/* Description : Store as 8x4 byte block to destination memory from input +/* Description : Store 8x4 byte block to destination memory from input vectors - Arguments : Inputs - in0, in1, pdst, stride - Details : Index 0 double word element from input vector 'in0' is copied - and stored to destination memory at (pblk_8x4_m) - Index 1 double word element from input vector 'in0' is copied - and stored to destination memory at (pblk_8x4_m + stride) - Index 0 double word element from input vector 'in1' is copied - and stored to destination memory at (pblk_8x4_m + 2 * stride) - Index 1 double word element from input vector 'in1' is copied - and stored to destination memory at (pblk_8x4_m + 3 * stride) + Arguments : Inputs - in0, in1, pdst, stride + Details : Index 0 double word element from 'in0' vector is copied to the + GP register and stored to (pdst) + Index 1 double word element from 'in0' vector is copied to the + GP register and stored to (pdst + stride) + Index 0 double word element from 'in1' vector is copied to the + GP register and stored to (pdst + 2 * stride) + Index 1 double word element from 'in1' vector is copied to the + GP register and stored to (pdst 
+ 3 * stride) */ #define ST8x4_UB(in0, in1, pdst, stride) { \ uint64_t out0_m, out1_m, out2_m, out3_m; \ @@ -583,14 +573,10 @@ /* Description : average with rounding (in0 + in1 + 1) / 2. Arguments : Inputs - in0, in1, in2, in3, Outputs - out0, out1 - Return Type - signed byte - Details : Each byte element from 'in0' vector is added with each byte - element from 'in1' vector. The addition of the elements plus 1 - (for rounding) is done unsigned with full precision, - i.e. the result has one extra bit. Unsigned division by 2 - (or logical shift right by one bit) is performed before writing - the result to vector 'out0' - Similar for the pair of 'in2' and 'in3' + Return Type - as per RTYPE + Details : Each unsigned byte element from 'in0' vector is added with + each unsigned byte element from 'in1' vector. Then average + with rounding is calculated and written to 'out0' */ #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \ @@ -605,12 +591,12 @@ } #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__) -/* Description : Immediate number of columns to slide with zero +/* Description : Immediate number of elements to slide with zero Arguments : Inputs - in0, in1, slide_val Outputs - out0, out1 Return Type - as per RTYPE Details : Byte elements from 'zero_m' vector are slide into 'in0' by - number of elements specified by 'slide_val' + value specified in the 'slide_val' */ #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \ v16i8 zero_m = { 0 }; \ @@ -626,12 +612,12 @@ } #define SLDI_B4_0_UB(...) 
SLDI_B4_0(v16u8, __VA_ARGS__) -/* Description : Immediate number of columns to slide +/* Description : Immediate number of elements to slide Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val Outputs - out0, out1 Return Type - as per RTYPE Details : Byte elements from 'in0_0' vector are slide into 'in1_0' by - number of elements specified by 'slide_val' + value specified in the 'slide_val' */ #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \ out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \ @@ -651,10 +637,8 @@ Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 Outputs - out0, out1 Return Type - as per RTYPE - Details : Selective byte elements from in0 & in1 are copied to out0 as - per control vector mask0 - Selective byte elements from in2 & in3 are copied to out1 as - per control vector mask1 + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' */ #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ @@ -673,16 +657,14 @@ #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__) /* Description : Dot product of byte vector elements - Arguments : Inputs - mult0, mult1 - cnst0, cnst1 + Arguments : Inputs - mult0, mult1, cnst0, cnst1 Outputs - out0, out1 - Return Type - unsigned halfword - Details : Unsigned byte elements from mult0 are multiplied with - unsigned byte elements from cnst0 producing a result + Return Type - as per RTYPE + Details : Unsigned byte elements from 'mult0' are multiplied with + unsigned byte elements from 'cnst0' producing a result twice the size of input i.e. unsigned halfword. 
- Then this multiplication results of adjacent odd-even elements - are added together and stored to the out vector - (2 unsigned halfword results) + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector */ #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \ @@ -699,16 +681,14 @@ #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__) /* Description : Dot product of byte vector elements - Arguments : Inputs - mult0, mult1 - cnst0, cnst1 + Arguments : Inputs - mult0, mult1, cnst0, cnst1 Outputs - out0, out1 - Return Type - signed halfword - Details : Signed byte elements from mult0 are multiplied with - signed byte elements from cnst0 producing a result + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result twice the size of input i.e. signed halfword. - Then this multiplication results of adjacent odd-even elements - are added together and stored to the out vector - (2 signed halfword results) + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector */ #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \ @@ -724,16 +704,14 @@ #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__) /* Description : Dot product of halfword vector elements - Arguments : Inputs - mult0, mult1 - cnst0, cnst1 + Arguments : Inputs - mult0, mult1, cnst0, cnst1 Outputs - out0, out1 - Return Type - signed word - Details : Signed halfword elements from mult0 are multiplied with - signed halfword elements from cnst0 producing a result + Return Type - as per RTYPE + Details : Signed halfword elements from 'mult0' are multiplied with + signed halfword elements from 'cnst0' producing a result twice the size of input i.e. signed word. 
- Then this multiplication results of adjacent odd-even elements - are added together and stored to the out vector - (2 signed word results) + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector */ #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \ @@ -750,16 +728,14 @@ #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__) /* Description : Dot product of word vector elements - Arguments : Inputs - mult0, mult1 - cnst0, cnst1 + Arguments : Inputs - mult0, mult1, cnst0, cnst1 Outputs - out0, out1 - Return Type - signed word - Details : Signed word elements from mult0 are multiplied with - signed word elements from cnst0 producing a result + Return Type - as per RTYPE + Details : Signed word elements from 'mult0' are multiplied with + signed word elements from 'cnst0' producing a result twice the size of input i.e. signed double word. - Then this multiplication results of adjacent odd-even elements - are added together and stored to the out vector - (2 signed double word results) + The multiplication result of adjacent odd-even elements + are added together and written to the 'out0' vector */ #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \ @@ -768,16 +744,14 @@ #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__) /* Description : Dot product & addition of byte vector elements - Arguments : Inputs - mult0, mult1 - cnst0, cnst1 + Arguments : Inputs - mult0, mult1, cnst0, cnst1 Outputs - out0, out1 - Return Type - signed halfword - Details : Signed byte elements from mult0 are multiplied with - signed byte elements from cnst0 producing a result + Return Type - as per RTYPE + Details : Signed byte elements from 'mult0' are multiplied with + signed byte elements from 'cnst0' producing a result twice the size of input i.e. signed halfword. 
- Then this multiplication results of adjacent odd-even elements - are added to the out vector - (2 signed halfword results) + The multiplication result of adjacent odd-even elements + are added to the 'out0' vector */ #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \ out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \ @@ -793,8 +767,7 @@ #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__) /* Description : Dot product & addition of halfword vector elements - Arguments : Inputs - mult0, mult1 - cnst0, cnst1 + Arguments : Inputs - mult0, mult1, cnst0, cnst1 Outputs - out0, out1 Return Type - as per RTYPE Details : Signed halfword elements from 'mult0' are multiplied with @@ -828,10 +801,10 @@ /* Description : Minimum values between unsigned elements of either vector are copied to the output vector Arguments : Inputs - in0, in1, min_vec - Outputs - in0, in1, (in place) - Return Type - unsigned halfword + Outputs - in place operation + Return Type - as per RTYPE Details : Minimum of unsigned halfword element values from 'in0' and - 'min_value' are written to output vector 'in0' + 'min_vec' are written to output vector 'in0' */ #define MIN_UH2(RTYPE, in0, in1, min_vec) { \ in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \ @@ -847,8 +820,8 @@ /* Description : Clips all signed halfword elements of input vector between 0 & 255 - Arguments : Inputs - in (input vector) - Outputs - out_m (output vector with clipped elements) + Arguments : Input - in + Output - out_m Return Type - signed halfword */ #define CLIP_SH_0_255(in) ({ \ @@ -868,12 +841,12 @@ CLIP_SH2_0_255(in2, in3); \ } -/* Description : Addition of 4 signed word elements - 4 signed word elements of input vector are added together and +/* Description : Horizontal addition of 4 signed word elements of input vector + Arguments : Input - in (signed word vector) + Output - sum_m (i32 sum) + Return Type - signed word (GP) + Details : 4 signed word elements of 'in' vector are 
added together and the resulting integer sum is returned - Arguments : Inputs - in (signed word vector) - Outputs - sum_m (i32 sum) - Return Type - signed word */ #define HADD_SW_S32(in) ({ \ v2i64 res0_m, res1_m; \ @@ -892,7 +865,7 @@ Return Type - as per RTYPE Details : Each unsigned odd byte element from 'in0' is added to even unsigned byte element from 'in0' (pairwise) and the - halfword result is stored in 'out0' + halfword result is written to 'out0' */ #define HADD_UB2(RTYPE, in0, in1, out0, out1) { \ out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \ @@ -934,11 +907,11 @@ } #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__) -/* Description : Insert specified word elements from input vectors to 1 - destination vector - Arguments : Inputs - in0, in1, in2, in3 (4 input vectors) - Outputs - out (output vector) +/* Description : Set element n input vector to GPR value + Arguments : Inputs - in0, in1, in2, in3 + Output - out Return Type - as per RTYPE + Details : Set element 0 in vector 'out' to value specified in 'in0' */ #define INSERT_W2(RTYPE, in0, in1, out) { \ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ @@ -955,12 +928,6 @@ #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) #define INSERT_W4_SB(...) 
INSERT_W4(v16i8, __VA_ARGS__) -/* Description : Insert specified double word elements from input vectors to 1 - destination vector - Arguments : Inputs - in0, in1 (2 input vectors) - Outputs - out (output vector) - Return Type - as per RTYPE -*/ #define INSERT_D2(RTYPE, in0, in1, out) { \ out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \ out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \ @@ -972,10 +939,8 @@ Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE - Details : Even byte elements of 'in0' and even byte - elements of 'in1' are interleaved and copied to 'out0' - Even byte elements of 'in2' and even byte - elements of 'in3' are interleaved and copied to 'out1' + Details : Even byte elements of 'in0' and 'in1' are interleaved + and written to 'out0' */ #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ @@ -988,10 +953,8 @@ Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE - Details : Even halfword elements of 'in0' and even halfword - elements of 'in1' are interleaved and copied to 'out0' - Even halfword elements of 'in2' and even halfword - elements of 'in3' are interleaved and copied to 'out1' + Details : Even halfword elements of 'in0' and 'in1' are interleaved + and written to 'out0' */ #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \ @@ -1018,10 +981,8 @@ Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE - Details : Even double word elements of 'in0' and even double word - elements of 'in1' are interleaved and copied to 'out0' - Even double word elements of 'in2' and even double word - elements of 'in3' are interleaved and copied to 'out1' + Details : Even double word elements of 'in0' and 'in1' are interleaved + and written to 'out0' */ #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = 
(RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \ @@ -1033,10 +994,8 @@ Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE - Details : Left half of byte elements of in0 and left half of byte - elements of in1 are interleaved and copied to out0. - Left half of byte elements of in2 and left half of byte - elements of in3 are interleaved and copied to out1. + Details : Left half of byte elements of 'in0' and 'in1' are interleaved + and written to 'out0'. */ #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ @@ -1059,10 +1018,8 @@ Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE - Details : Left half of halfword elements of in0 and left half of halfword - elements of in1 are interleaved and copied to out0. - Left half of halfword elements of in2 and left half of halfword - elements of in3 are interleaved and copied to out1. + Details : Left half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. */ #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ @@ -1074,10 +1031,8 @@ Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE - Details : Left half of word elements of in0 and left half of word - elements of in1 are interleaved and copied to out0. - Left half of word elements of in2 and left half of word - elements of in3 are interleaved and copied to out1. + Details : Left half of word elements of 'in0' and 'in1' are interleaved + and written to 'out0'. */ #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ @@ -1087,14 +1042,11 @@ #define ILVL_W2_SH(...) 
ILVL_W2(v8i16, __VA_ARGS__) /* Description : Interleave right half of byte elements from vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3 + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 Return Type - as per RTYPE - Details : Right half of byte elements of in0 and right half of byte - elements of in1 are interleaved and copied to out0. - Right half of byte elements of in2 and right half of byte - elements of in3 are interleaved and copied to out1. - Similar for other pairs + Details : Right half of byte elements of 'in0' and 'in1' are interleaved + and written to out0. */ #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ @@ -1126,14 +1078,11 @@ #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__) /* Description : Interleave right half of halfword elements from vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3 - Return Type - signed halfword - Details : Right half of halfword elements of in0 and right half of - halfword elements of in1 are interleaved and copied to out0. - Right half of halfword elements of in2 and right half of - halfword elements of in3 are interleaved and copied to out1. - Similar for other pairs + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of halfword elements of 'in0' and 'in1' are + interleaved and written to 'out0'. */ #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ @@ -1163,13 +1112,11 @@ #define ILVR_W4_UB(...) 
ILVR_W4(v16u8, __VA_ARGS__) /* Description : Interleave right half of double word elements from vectors - Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - Outputs - out0, out1, out2, out3 - Return Type - unsigned double word - Details : Right half of double word elements of in0 and right half of - double word elements of in1 are interleaved and copied to out0. - Right half of double word elements of in2 and right half of - double word elements of in3 are interleaved and copied to out1. + Arguments : Inputs - in0, in1, in2, in3 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of double word elements of 'in0' and 'in1' are + interleaved and written to 'out0'. */ #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \ @@ -1198,9 +1145,7 @@ Outputs - out0, out1 Return Type - as per RTYPE Details : Right half of byte elements from 'in0' and 'in1' are - interleaved and stored to 'out0' - Left half of byte elements from 'in0' and 'in1' are - interleaved and stored to 'out1' + interleaved and written to 'out0' */ #define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ @@ -1226,14 +1171,14 @@ #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) /* Description : Saturate the halfword element values to the max - unsigned value of (sat_val+1 bits) + unsigned value of (sat_val + 1) bits The element data width remains unchanged - Arguments : Inputs - in0, in1, in2, in3, sat_val - Outputs - in0, in1, in2, in3 (in place) - Return Type - unsigned halfword + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE Details : Each unsigned halfword element from 'in0' is saturated to the - value generated with (sat_val+1) bit range. - The results are stored in place + value generated with (sat_val + 1) bit range. 
+ The results are written in place */ #define SAT_UH2(RTYPE, in0, in1, sat_val) { \ in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \ @@ -1248,14 +1193,14 @@ #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) /* Description : Saturate the halfword element values to the max - unsigned value of (sat_val+1 bits) + unsigned value of (sat_val + 1) bits The element data width remains unchanged - Arguments : Inputs - in0, in1, in2, in3, sat_val - Outputs - in0, in1, in2, in3 (in place) - Return Type - unsigned halfword + Arguments : Inputs - in0, in1, sat_val + Outputs - in place operation + Return Type - as per RTYPE Details : Each unsigned halfword element from 'in0' is saturated to the - value generated with (sat_val+1) bit range - The results are stored in place + value generated with (sat_val + 1) bit range + The results are written in place */ #define SAT_SH2(RTYPE, in0, in1, sat_val) { \ in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \ @@ -1296,12 +1241,9 @@ Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE - Details : Even byte elements of in0 are copied to the left half of - out0 & even byte elements of in1 are copied to the right - half of out0. - Even byte elements of in2 are copied to the left half of - out1 & even byte elements of in3 are copied to the right - half of out1. + Details : Even byte elements of 'in0' are copied to the left half of + 'out0' & even byte elements of 'in1' are copied to the right + half of 'out0'. */ #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ @@ -1324,12 +1266,9 @@ Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 Return Type - as per RTYPE - Details : Even halfword elements of in0 are copied to the left half of - out0 & even halfword elements of in1 are copied to the right - half of out0. 
- Even halfword elements of in2 are copied to the left half of - out1 & even halfword elements of in3 are copied to the right - half of out1. + Details : Even halfword elements of 'in0' are copied to the left half of + 'out0' & even halfword elements of 'in1' are copied to the + right half of 'out0'. */ #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \ @@ -1348,13 +1287,10 @@ /* Description : Pack even double word elements of vector pairs Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 - Return Type - unsigned byte - Details : Even double elements of in0 are copied to the left half of - out0 & even double elements of in1 are copied to the right - half of out0. - Even double elements of in2 are copied to the left half of - out1 & even double elements of in3 are copied to the right - half of out1. + Return Type - as per RTYPE + Details : Even double elements of 'in0' are copied to the left half of + 'out0' & even double elements of 'in1' are copied to the right + half of 'out0'. */ #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \ @@ -1372,15 +1308,10 @@ /* Description : Each byte element is logically xor'ed with immediate 128 Arguments : Inputs - in0, in1 - Outputs - in0, in1 (in-place) + Outputs - in place operation Return Type - as per RTYPE Details : Each unsigned byte element from input vector 'in0' is - logically xor'ed with 128 and the result is in-place stored in - 'in0' vector - Each unsigned byte element from input vector 'in1' is - logically xor'ed with 128 and the result is in-place stored in - 'in1' vector - Similar for other pairs + logically xor'ed with 128 and the result is stored in-place. 
*/ #define XORI_B2_128(RTYPE, in0, in1) { \ in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \ @@ -1432,8 +1363,7 @@ Return Type - as per RTYPE Details : Signed halfword elements from 'in0' are added to signed halfword elements of 'in1'. The result is then signed saturated - between -32768 to +32767 (as per halfword data type) - Similar for other pairs + between halfword data type range */ #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) { \ out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \ @@ -1450,11 +1380,10 @@ /* Description : Shift left all elements of vector (generic for all data types) Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - in0, in1, in2, in3 (in place) + Outputs - in place operation Return Type - as per input vector RTYPE Details : Each element of vector 'in0' is left shifted by 'shift' and - the result is in place written to 'in0' - Similar for other pairs + the result is written in-place. */ #define SLLI_4V(in0, in1, in2, in3, shift) { \ in0 = in0 << shift; \ @@ -1466,12 +1395,10 @@ /* Description : Arithmetic shift right all elements of vector (generic for all data types) Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - in0, in1, in2, in3 (in place) + Outputs - in place operation Return Type - as per input vector RTYPE Details : Each element of vector 'in0' is right shifted by 'shift' and - the result is in place written to 'in0' - Here, 'shift' is GP variable passed in - Similar for other pairs + the result is written in-place. 'shift' is a GP variable. */ #define SRA_4V(in0, in1, in2, in3, shift) { \ in0 = in0 >> shift; \ @@ -1502,14 +1429,13 @@ #define SRAR_W4_SW(...) 
SRAR_W4(v4i32, __VA_ARGS__) /* Description : Shift right arithmetic rounded (immediate) - Arguments : Inputs - in0, in1, in2, in3, shift - Outputs - in0, in1, in2, in3 (in place) + Arguments : Inputs - in0, in1, shift + Outputs - in place operation Return Type - as per RTYPE - Details : Each element of vector 'in0' is shifted right arithmetic by - value in 'shift'. - The last discarded bit is added to shifted value for rounding - and the result is in place written to 'in0' - Similar for other pairs + Details : Each element of vector 'in0' is shifted right arithmetically by + the value in 'shift'. The last discarded bit is added to the + shifted value for rounding and the result is written in-place. + 'shift' is an immediate value. */ #define SRARI_H2(RTYPE, in0, in1, shift) { \ in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \ @@ -1525,16 +1451,6 @@ #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__) #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__) -/* Description : Shift right arithmetic rounded (immediate) - Arguments : Inputs - in0, in1, shift - Outputs - in0, in1 (in place) - Return Type - as per RTYPE - Details : Each element of vector 'in0' is shifted right arithmetic by - value in 'shift'. - The last discarded bit is added to shifted value for rounding - and the result is in place written to 'in0' - Similar for other pairs -*/ #define SRARI_W2(RTYPE, in0, in1, shift) { \ in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ @@ -1581,8 +1497,8 @@ /* Description : Addition of 2 pairs of vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 - Details : Each element from 2 pairs vectors is added and 2 results are - produced + Details : Each element in 'in0' is added to 'in1' and result is written + to 'out0'. 
*/ #define ADD2(in0, in1, in2, in3, out0, out1) { \ out0 = in0 + in1; \ @@ -1597,8 +1513,8 @@ /* Description : Subtraction of 2 pairs of vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1 - Details : Each element from 2 pairs vectors is subtracted and 2 results - are produced + Details : Each element in 'in1' is subtracted from 'in0' and result is + written to 'out0'. */ #define SUB2(in0, in1, in2, in3, out0, out1) { \ out0 = in0 - in1; \ @@ -1613,8 +1529,8 @@ } /* Description : Sign extend halfword elements from right half of the vector - Arguments : Inputs - in (input halfword vector) - Outputs - out (sign extended word vectors) + Arguments : Input - in (halfword vector) + Output - out (sign extended word vector) Return Type - signed word Details : Sign bit of halfword elements from input vector 'in' is extracted and interleaved with same vector 'in0' to generate @@ -1628,8 +1544,8 @@ } /* Description : Zero extend unsigned byte elements to halfword elements - Arguments : Inputs - in (1 input unsigned byte vector) - Outputs - out0, out1 (unsigned 2 halfword vectors) + Arguments : Input - in (unsigned byte vector) + Outputs - out0, out1 (unsigned halfword vectors) Return Type - signed halfword Details : Zero extended right half of vector is returned in 'out0' Zero extended left half of vector is returned in 'out1' @@ -1641,9 +1557,9 @@ } /* Description : Sign extend halfword elements from input vector and return - result in pair of vectors - Arguments : Inputs - in (1 input halfword vector) - Outputs - out0, out1 (sign extended 2 word vectors) + the result in pair of vectors + Arguments : Input - in (halfword vector) + Outputs - out0, out1 (sign extended word vectors) Return Type - signed word Details : Sign bit of halfword elements from input vector 'in' is extracted and interleaved right with same vector 'in0' to @@ -1717,13 +1633,10 @@ out15 = in0 - in15; \ } -/* Description : Transposes input 8x8 byte block +/* Description : Transpose input 
8x8 byte block Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 - (input 8x8 byte block) Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - (output 8x8 byte block) - Return Type - unsigned byte - Details : + Return Type - as per RTYPE */ #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7) { \ @@ -1741,12 +1654,11 @@ } #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__) -/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors +/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - unsigned byte - Details : */ #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \ in8, in9, in10, in11, in12, in13, in14, in15, \ @@ -1789,11 +1701,10 @@ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ } -/* Description : Transposes 4x4 block with half word elements in vectors +/* Description : Transpose 4x4 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Return Type - signed halfword - Details : */ #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ v8i16 s0_m, s1_m; \ @@ -1804,11 +1715,10 @@ out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \ } -/* Description : Transposes 4x8 block with half word elements in vectors +/* Description : Transpose 4x8 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - signed halfword - Details : */ #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7) { \ @@ -1832,11 +1742,10 @@ out7 = zero_m; \ } -/* Description : Transposes 8x4 block 
with half word elements in vectors +/* Description : Transpose 8x4 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Return Type - signed halfword - Details : */ #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) { \ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ @@ -1847,11 +1756,10 @@ ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \ } -/* Description : Transposes 8x8 block with half word elements in vectors +/* Description : Transpose 8x8 block with half word elements in vectors Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7 - Return Type - signed halfword - Details : + Return Type - as per RTYPE */ #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ out0, out1, out2, out3, out4, out5, out6, out7) { \ @@ -1876,11 +1784,10 @@ } #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__) -/* Description : Transposes 4x4 block with word elements in vectors +/* Description : Transpose 4x4 block with word elements in vectors Arguments : Inputs - in0, in1, in2, in3 Outputs - out0, out1, out2, out3 Return Type - signed word - Details : */ #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) { \ v4i32 s0_m, s1_m, s2_m, s3_m; \ @@ -1895,15 +1802,12 @@ } /* Description : Add block 4x4 - Arguments : Inputs - in0, in1, in2, in3, pdst, stride - Outputs - - Return Type - unsigned bytes + Arguments : Inputs - in0, in1, in2, in3, pdst, stride Details : Least significant 4 bytes from each input vector are added to - the destination bytes, clipped between 0-255 and then stored. + the destination bytes, clipped between 0-255 and stored. 
*/ #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ uint32_t src0_m, src1_m, src2_m, src3_m; \ - uint32_t out0_m, out1_m, out2_m, out3_m; \ v8i16 inp0_m, inp1_m, res0_m, res1_m; \ v16i8 dst0_m = { 0 }; \ v16i8 dst1_m = { 0 }; \ @@ -1917,17 +1821,12 @@ ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ CLIP_SH2_0_255(res0_m, res1_m); \ PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ - \ - out0_m = __msa_copy_u_w((v4i32)dst0_m, 0); \ - out1_m = __msa_copy_u_w((v4i32)dst0_m, 1); \ - out2_m = __msa_copy_u_w((v4i32)dst1_m, 0); \ - out3_m = __msa_copy_u_w((v4i32)dst1_m, 1); \ - SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ } /* Description : Pack even elements of input vectors & xor with 128 - Arguments : Inputs - in0, in1 - Outputs - out_m + Arguments : Inputs - in0, in1 + Output - out_m Return Type - unsigned byte Details : Signed byte even elements from 'in0' and 'in1' are packed together in one vector and the resulting vector is xor'ed with @@ -1943,8 +1842,8 @@ /* Description : Converts inputs to unsigned bytes, interleave, average & store as 8x4 unsigned byte block - Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, - pdst, stride + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, + pdst, stride */ #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \ dst0, dst1, dst2, dst3, pdst, stride) { \ @@ -1960,7 +1859,7 @@ /* Description : Pack even byte elements and store byte vector in destination memory - Arguments : Inputs - in0, in1, pdst + Arguments : Inputs - in0, in1, pdst */ #define PCKEV_ST_SB(in0, in1, pdst) { \ v16i8 tmp_m; \ @@ -1970,7 +1869,7 @@ } /* Description : Horizontal 2 tap filter kernel code - Arguments : Inputs - in0, in1, mask, coeff, shift + Arguments : Inputs - in0, in1, mask, coeff, shift */ #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({ \ v16i8 tmp0_m; \ diff --git a/vp9/common/vp9_blockd.h 
b/vp9/common/vp9_blockd.h index 64d379cab..d26048cdf 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -23,6 +23,7 @@ #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_scale.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_tile_common.h" #ifdef __cplusplus extern "C" { @@ -149,7 +150,10 @@ typedef struct RefBuffer { typedef struct macroblockd { struct macroblockd_plane plane[MAX_MB_PLANE]; + FRAME_COUNTS *counts; + TileInfo tile; + int mi_stride; MODE_INFO **mi; diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c index 2aaa009fa..0bf7cbcc0 100644 --- a/vp9/common/vp9_common_data.c +++ b/vp9/common/vp9_common_data.c @@ -11,27 +11,27 @@ #include "vp9/common/vp9_common_data.h" // Log 2 conversion lookup tables for block width and height -const int b_width_log2_lookup[BLOCK_SIZES] = +const uint8_t b_width_log2_lookup[BLOCK_SIZES] = {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4}; -const int b_height_log2_lookup[BLOCK_SIZES] = +const uint8_t b_height_log2_lookup[BLOCK_SIZES] = {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4}; -const int num_4x4_blocks_wide_lookup[BLOCK_SIZES] = +const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] = {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16}; -const int num_4x4_blocks_high_lookup[BLOCK_SIZES] = +const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] = {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16}; // Log 2 conversion lookup tables for modeinfo width and height -const int mi_width_log2_lookup[BLOCK_SIZES] = +const uint8_t mi_width_log2_lookup[BLOCK_SIZES] = {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3}; -const int num_8x8_blocks_wide_lookup[BLOCK_SIZES] = +const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8}; -const int num_8x8_blocks_high_lookup[BLOCK_SIZES] = +const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; // MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize))) -const int size_group_lookup[BLOCK_SIZES] = 
+const uint8_t size_group_lookup[BLOCK_SIZES] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3}; -const int num_pels_log2_lookup[BLOCK_SIZES] = +const uint8_t num_pels_log2_lookup[BLOCK_SIZES] = {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12}; const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = { diff --git a/vp9/common/vp9_common_data.h b/vp9/common/vp9_common_data.h index a06c9bed8..95a117961 100644 --- a/vp9/common/vp9_common_data.h +++ b/vp9/common/vp9_common_data.h @@ -12,20 +12,21 @@ #define VP9_COMMON_VP9_COMMON_DATA_H_ #include "vp9/common/vp9_enums.h" +#include "vpx/vpx_integer.h" #ifdef __cplusplus extern "C" { #endif -extern const int b_width_log2_lookup[BLOCK_SIZES]; -extern const int b_height_log2_lookup[BLOCK_SIZES]; -extern const int mi_width_log2_lookup[BLOCK_SIZES]; -extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZES]; -extern const int num_8x8_blocks_high_lookup[BLOCK_SIZES]; -extern const int num_4x4_blocks_high_lookup[BLOCK_SIZES]; -extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZES]; -extern const int size_group_lookup[BLOCK_SIZES]; -extern const int num_pels_log2_lookup[BLOCK_SIZES]; +extern const uint8_t b_width_log2_lookup[BLOCK_SIZES]; +extern const uint8_t b_height_log2_lookup[BLOCK_SIZES]; +extern const uint8_t mi_width_log2_lookup[BLOCK_SIZES]; +extern const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES]; +extern const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES]; +extern const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES]; +extern const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES]; +extern const uint8_t size_group_lookup[BLOCK_SIZES]; +extern const uint8_t num_pels_log2_lookup[BLOCK_SIZES]; extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES]; extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES]; extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES]; diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c index 5f8ee0fcc..77d1ff459 100644 --- a/vp9/common/vp9_mvref_common.c +++ 
b/vp9/common/vp9_mvref_common.c @@ -14,7 +14,6 @@ // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, int block, int mi_row, int mi_col, @@ -27,6 +26,7 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, int context_counter = 0; const MV_REF *const prev_frame_mvs = cm->use_prev_frame_mvs ? cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL; + const TileInfo *const tile = &xd->tile; // Blank the reference vector list memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); @@ -147,13 +147,12 @@ static void find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd, } void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, int mi_row, int mi_col, find_mv_refs_sync sync, void *const data, uint8_t *mode_context) { - find_mv_refs_idx(cm, xd, tile, mi, ref_frame, mv_ref_list, -1, + find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col, sync, data, mode_context); } @@ -181,7 +180,6 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, } void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *const tile, int block, int ref, int mi_row, int mi_col, int_mv *nearest_mv, int_mv *near_mv, uint8_t *mode_context) { @@ -192,7 +190,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, assert(MAX_MV_REF_CANDIDATES == 2); - find_mv_refs_idx(cm, xd, tile, mi, mi->mbmi.ref_frame[ref], mv_list, block, + find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block, mi_row, mi_col, NULL, NULL, mode_context); near_mv->as_int = 0; diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 621dc14be..bd216d433 
100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -209,7 +209,6 @@ static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { typedef void (*find_mv_refs_sync)(void *const data, int mi_row); void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, - const TileInfo *const tile, MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, int mi_row, int mi_col, find_mv_refs_sync sync, void *const data, @@ -222,7 +221,6 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp, int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv); void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, - const TileInfo *const tile, int block, int ref, int mi_row, int mi_col, int_mv *nearest_mv, int_mv *near_mv, uint8_t *mode_context); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index c97b5d76d..22a5efdd5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1077,7 +1077,7 @@ add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const stru specialize qw/vp9_full_range_search/; add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; -specialize qw/vp9_temporal_filter_apply sse2/; +specialize qw/vp9_temporal_filter_apply sse2 msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 9311d8dad..659b84848 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -698,7 +698,6 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, } static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, - const TileInfo *const tile, BLOCK_SIZE bsize, int mi_row, int mi_col) { const int bw = num_8x8_blocks_wide_lookup[bsize]; const int bh = 
num_8x8_blocks_high_lookup[bsize]; @@ -706,6 +705,7 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, const int y_mis = MIN(bh, cm->mi_rows - mi_row); const int offset = mi_row * cm->mi_stride + mi_col; int x, y; + const TileInfo *const tile = &xd->tile; xd->mi = cm->mi_grid_visible + offset; xd->mi[0] = &cm->mi[offset]; @@ -726,12 +726,11 @@ static MB_MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, } static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, - const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &pbi->common; const int less8x8 = bsize < BLOCK_8X8; - MB_MODE_INFO *mbmi = set_offsets(cm, xd, tile, bsize, mi_row, mi_col); + MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col); if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { const BLOCK_SIZE uv_subsize = @@ -741,7 +740,7 @@ static void decode_block(VP9Decoder *const pbi, MACROBLOCKD *const xd, VPX_CODEC_CORRUPT_FRAME, "Invalid block size."); } - vp9_read_mode_info(pbi, xd, tile, mi_row, mi_col, r); + vp9_read_mode_info(pbi, xd, mi_row, mi_col, r); if (less8x8) bsize = BLOCK_8X8; @@ -795,7 +794,6 @@ static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col, } static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd, - const TileInfo *const tile, int mi_row, int mi_col, vp9_reader* r, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &pbi->common; @@ -811,27 +809,27 @@ static void decode_partition(VP9Decoder *const pbi, MACROBLOCKD *const xd, partition = read_partition(xd, mi_row, mi_col, bsize, r, has_rows, has_cols); subsize = get_subsize(bsize, partition); if (bsize == BLOCK_8X8) { - decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); + decode_block(pbi, xd, mi_row, mi_col, r, subsize); } else { switch (partition) { case PARTITION_NONE: - decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); + decode_block(pbi, 
xd, mi_row, mi_col, r, subsize); break; case PARTITION_HORZ: - decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); + decode_block(pbi, xd, mi_row, mi_col, r, subsize); if (has_rows) - decode_block(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize); + decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize); break; case PARTITION_VERT: - decode_block(pbi, xd, tile, mi_row, mi_col, r, subsize); + decode_block(pbi, xd, mi_row, mi_col, r, subsize); if (has_cols) - decode_block(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize); + decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize); break; case PARTITION_SPLIT: - decode_partition(pbi, xd, tile, mi_row, mi_col, r, subsize); - decode_partition(pbi, xd, tile, mi_row, mi_col + hbs, r, subsize); - decode_partition(pbi, xd, tile, mi_row + hbs, mi_col, r, subsize); - decode_partition(pbi, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize); + decode_partition(pbi, xd, mi_row, mi_col, r, subsize); + decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize); + decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize); + decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize); break; default: assert(0 && "Invalid partition type"); @@ -1315,7 +1313,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, // Load all tile information into tile_data. for (tile_row = 0; tile_row < tile_rows; ++tile_row) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileInfo tile; const TileBuffer *const buf = &tile_buffers[tile_row][tile_col]; tile_data = pbi->tile_data + tile_cols * tile_row + tile_col; tile_data->cm = cm; @@ -1323,7 +1320,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, tile_data->xd.corrupted = 0; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? 
NULL : &cm->counts; - vp9_tile_init(&tile, tile_data->cm, tile_row, tile_col); + vp9_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col); setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &tile_data->bit_reader, pbi->decrypt_cb, pbi->decrypt_state); @@ -1345,8 +1342,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(pbi, &tile_data->xd, &tile, mi_row, - mi_col, &tile_data->bit_reader, BLOCK_64X64); + decode_partition(pbi, &tile_data->xd, mi_row, mi_col, + &tile_data->bit_reader, BLOCK_64X64); } pbi->mb.corrupted |= tile_data->xd.corrupted; if (pbi->mb.corrupted) @@ -1419,7 +1416,7 @@ static int tile_worker_hook(TileWorkerData *const tile_data, for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) { decode_partition(tile_data->pbi, &tile_data->xd, - tile, mi_row, mi_col, &tile_data->bit_reader, + mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64); } } @@ -1543,6 +1540,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, tile_data->xd.counts = cm->frame_parallel_decoding_mode ? 
0 : &tile_data->counts; vp9_tile_init(tile, cm, 0, buf->col); + vp9_tile_init(&tile_data->xd.tile, cm, 0, buf->col); setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &tile_data->bit_reader, pbi->decrypt_cb, pbi->decrypt_state); diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index 8a8d8ddd8..cd20c84cf 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -464,7 +464,6 @@ static void fpm_sync(void *const data, int mi_row) { static void read_inter_block_mode_info(VP9Decoder *const pbi, MACROBLOCKD *const xd, - const TileInfo *const tile, MODE_INFO *const mi, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; @@ -482,13 +481,14 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME]; + xd->block_refs[ref] = ref_buf; if ((!vp9_is_valid_scale(&ref_buf->sf))) vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); vp9_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col, &ref_buf->sf); - vp9_find_mv_refs(cm, xd, tile, mi, frame, ref_mvs[frame], + vp9_find_mv_refs(cm, xd, mi, frame, ref_mvs[frame], mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx); } @@ -531,7 +531,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, if (b_mode == NEARESTMV || b_mode == NEARMV) { uint8_t dummy_mode_ctx[MAX_REF_FRAMES]; for (ref = 0; ref < 1 + is_compound; ++ref) - vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col, + vp9_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col, &nearest_sub8x8[ref], &near_sub8x8[ref], dummy_mode_ctx); @@ -567,7 +567,6 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, static void read_inter_frame_mode_info(VP9Decoder *const pbi, MACROBLOCKD *const xd, - const TileInfo *const tile, int mi_row, int mi_col, 
vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MODE_INFO *const mi = xd->mi[0]; @@ -582,13 +581,12 @@ static void read_inter_frame_mode_info(VP9Decoder *const pbi, mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r); if (inter_block) - read_inter_block_mode_info(pbi, xd, tile, mi, mi_row, mi_col, r); + read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r); else read_intra_block_mode_info(cm, xd, mi, r); } void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, - const TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MODE_INFO *const mi = xd->mi[0]; @@ -602,7 +600,7 @@ void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, if (frame_is_intra_only(cm)) { read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r); } else { - read_inter_frame_mode_info(pbi, xd, tile, mi_row, mi_col, r); + read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r); for (h = 0; h < y_mis; ++h) { MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols; diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h index dd97d8da0..db57b4023 100644 --- a/vp9/decoder/vp9_decodemv.h +++ b/vp9/decoder/vp9_decodemv.h @@ -18,10 +18,7 @@ extern "C" { #endif -struct TileInfo; - void vp9_read_mode_info(VP9Decoder *const pbi, MACROBLOCKD *xd, - const struct TileInfo *const tile, int mi_row, int mi_col, vp9_reader *r); #ifdef __cplusplus diff --git a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c new file mode 100644 index 000000000..4053bffae --- /dev/null +++ b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/mips/msa/vp9_macros_msa.h" + +static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, + uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, + int32_t filt_wgt, + uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + uint64_t f0, f1, f2, f3; + v16i8 frm2, frm1 = { 0 }; + v16i8 frm4, frm3 = { 0 }; + v16u8 frm_r, frm_l; + v8i16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 2; row--;) { + LD4(frm1_ptr, stride, f0, f1, f2, f3); + frm1_ptr += (4 * stride); + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 32; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + INSERT_D2_SB(f0, f1, frm1); + INSERT_D2_SB(f2, f3, frm3); + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + 
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + UNPCK_UB_SH(frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, + diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + UNPCK_UB_SH(frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, 
mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + } +} + +static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, + uint32_t stride, + uint8_t *frm2_ptr, + int32_t filt_sth, + int32_t filt_wgt, + uint32_t *acc, + uint16_t *cnt) { + uint32_t row; + v16i8 frm1, frm2, frm3, frm4; + v16u8 frm_r, frm_l; + v16i8 zero = { 0 }; + v8u16 frm2_r, frm2_l; + v8i16 diff0, diff1, mod0_h, mod1_h; + v4i32 cnst3, cnst16, filt_wt, strength; + v4i32 mod0_w, mod1_w, mod2_w, mod3_w; + v4i32 diff0_r, diff0_l, diff1_r, diff1_l; + v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll; + v4i32 acc0, acc1, acc2, acc3; + v8i16 cnt0, cnt1; + + filt_wt = __msa_fill_w(filt_wgt); + strength = __msa_fill_w(filt_sth); + cnst3 = __msa_ldi_w(3); + cnst16 = __msa_ldi_w(16); + + for (row = 8; row--;) { + LD_SB2(frm1_ptr, stride, frm1, frm3); + frm1_ptr += stride; + + LD_SB2(frm2_ptr, 16, frm2, frm4); + frm2_ptr += 16; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm1, frm2, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & 
mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + LD_SW2(acc, 4, acc0, acc1); + LD_SW2(acc + 8, 4, acc2, acc3); + LD_SH2(cnt, 8, cnt0, cnt1); + + ILVRL_B2_UB(frm3, frm4, frm_r, frm_l); + HSUB_UB2_SH(frm_r, frm_l, diff0, diff1); + UNPCK_SH_SW(diff0, diff0_r, diff0_l); + UNPCK_SH_SW(diff1, diff1_r, diff1_l); + MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, + mod0_w, mod1_w, mod2_w, mod3_w); + MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, + mod0_w, mod1_w, mod2_w, mod3_w); + SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); + + diff0_r = (mod0_w < cnst16); + diff0_l = (mod1_w < cnst16); + diff1_r = (mod2_w < cnst16); + diff1_l = (mod3_w < cnst16); + + SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, + mod0_w, mod1_w, mod2_w, mod3_w); + + mod0_w = diff0_r & mod0_w; + mod1_w = diff0_l & mod1_w; + mod2_w = diff1_r & mod2_w; + mod3_w = diff1_l & mod3_w; + + MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt, + mod0_w, mod1_w, mod2_w, mod3_w); + PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); + ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); + ST_SH2(mod0_h, mod1_h, cnt, 8); + cnt += 16; + + ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l); + UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl); + UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll); + 
MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll, + mod0_w, mod1_w, mod2_w, mod3_w); + ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, + mod0_w, mod1_w, mod2_w, mod3_w); + ST_SW2(mod0_w, mod1_w, acc, 4); + acc += 8; + ST_SW2(mod2_w, mod3_w, acc, 4); + acc += 8; + + frm1_ptr += stride; + frm2_ptr += 16; + } +} + +void vp9_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, + uint8_t *frame2_ptr, uint32_t blk_w, + uint32_t blk_h, int32_t strength, + int32_t filt_wgt, uint32_t *accu, + uint16_t *cnt) { + if (8 == (blk_w * blk_h)) { + temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, + strength, filt_wgt, accu, cnt); + } else if (16 == (blk_w * blk_h)) { + temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr, + strength, filt_wgt, accu, cnt); + } else { + vp9_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h, + strength, filt_wgt, accu, cnt); + } +} diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index b4059752e..cd8c4e17d 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -230,6 +230,9 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, mbmi->segment_id = 0; x->encode_breakout = cpi->encode_breakout; } + + // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs() + xd->tile = *tile; } static void duplicate_mode_info_in_sb(VP9_COMMON *cm, MACROBLOCKD *xd, @@ -2929,7 +2932,7 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); else - vp9_pick_inter_mode_sub8x8(cpi, x, tile_data, mi_row, mi_col, + vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 3d7843ea7..16640fed9 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -1031,8 +1031,10 
@@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { fps.mvr_abs = (double)sum_mvr_abs / mvcount; fps.MVc = (double)sum_mvc / mvcount; fps.mvc_abs = (double)sum_mvc_abs / mvcount; - fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / mvcount)) / mvcount; - fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / mvcount)) / mvcount; + fps.MVrv = ((double)sum_mvrs - + ((double)sum_mvr * sum_mvr / mvcount)) / mvcount; + fps.MVcv = ((double)sum_mvcs - + ((double)sum_mvc * sum_mvc / mvcount)) / mvcount; fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); fps.new_mv_count = new_mv_count; fps.pcnt_motion = (double)mvcount / num_mbs; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 3eaa99054..a6271362d 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -1179,7 +1179,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, sf, sf); if (cm->use_prev_frame_mvs) - vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame, + vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, NULL, NULL, xd->mi[0]->mbmi.mode_context); else @@ -1623,11 +1623,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, - TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; - TileInfo *const tile_info = &tile_data->tile_info; SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi; @@ -1659,7 +1657,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - vp9_find_mv_refs(cm, xd, tile_info, xd->mi[0], ref_frame, + vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, NULL, NULL, xd->mi[0]->mbmi.mode_context); @@ -1733,7 +1731,7 @@ void 
vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, b_mv[ZEROMV].as_int = 0; b_mv[NEWMV].as_int = INVALID_MV; - vp9_append_sub8x8_mvs_for_idx(cm, xd, tile_info, i, 0, mi_row, mi_col, + vp9_append_sub8x8_mvs_for_idx(cm, xd, i, 0, mi_row, mi_col, &b_mv[NEARESTMV], &b_mv[NEARMV], xd->mi[0]->mbmi.mode_context); diff --git a/vp9/encoder/vp9_pickmode.h b/vp9/encoder/vp9_pickmode.h index 11f44099c..a43bb8126 100644 --- a/vp9/encoder/vp9_pickmode.h +++ b/vp9/encoder/vp9_pickmode.h @@ -27,7 +27,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, PICK_MODE_CONTEXT *ctx); void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, - TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 162d4de5f..3f9b2eb0c 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1732,7 +1732,6 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo * const tile, int_mv *best_ref_mv, int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate, @@ -1802,7 +1801,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, for (ref = 0; ref < 1 + has_second_rf; ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; frame_mv[ZEROMV][frame].as_int = 0; - vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col, + vp9_append_sub8x8_mvs_for_idx(cm, xd, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], &frame_mv[NEARMV][frame], xd->mi[0]->mbmi.mode_context); @@ -2199,7 +2198,6 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, } static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, - const TileInfo *const tile, MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE block_size, int mi_row, int mi_col, @@ -2220,7 +2218,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, 
vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col, + vp9_find_mv_refs(cm, xd, mi, ref_frame, candidates, mi_row, mi_col, NULL, NULL, xd->mi[0]->mbmi.mode_context); // Candidate refinement carried out at encoder and decoder @@ -2982,7 +2980,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, x->pred_mv_sad[ref_frame] = INT_MAX; if (cpi->ref_frame_flags & flag_list[ref_frame]) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); - setup_buffer_inter(cpi, x, tile_info, ref_frame, bsize, mi_row, mi_col, + setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; @@ -3714,7 +3712,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) { VP9_COMMON *const cm = &cpi->common; - TileInfo *const tile_info = &tile_data->tile_info; RD_OPT *const rd_opt = &cpi->rd; SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCKD *const xd = &x->e_mbd; @@ -3778,8 +3775,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - setup_buffer_inter(cpi, x, tile_info, - ref_frame, bsize, mi_row, mi_col, + setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } else { @@ -3971,7 +3967,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, int newbest, rs; int64_t rs_rd; mbmi->interp_filter = switchable_filter_index; - tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info, + tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, &rate, &rate_y, &distortion, @@ -4037,7 +4033,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, if (!pred_exists) { // Handles the 
special case when a filter that is not in the // switchable list (bilinear, 6-tap) is indicated at the frame level - tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, tile_info, + tmp_rd = rd_pick_best_sub8x8_mode(cpi, x, &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, &rate, &rate_y, &distortion, &skippable, &total_sse, diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 96422d7e1..e78c111f0 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -160,5 +160,6 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c +VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) |