Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_blockd.h                 |   11
-rw-r--r--  vp9/common/vp9_idct.c                   | 1706
-rw-r--r--  vp9/common/vp9_idct.h                   |  144
-rw-r--r--  vp9/common/vp9_loopfilter.c             |    7
-rw-r--r--  vp9/common/vp9_loopfilter.h             |    8
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl             |  996
-rw-r--r--  vp9/common/x86/vp9_asm_stubs.c          |   20
-rw-r--r--  vp9/common/x86/vp9_idct_intrin_sse2.c   |   11
-rw-r--r--  vp9/decoder/vp9_decodeframe.c           |   37
-rw-r--r--  vp9/decoder/vp9_decoder.c               |    2
-rw-r--r--  vp9/decoder/vp9_decoder.h               |    1
-rw-r--r--  vp9/decoder/vp9_detokenize.c            |    2
-rw-r--r--  vp9/decoder/vp9_dthread.c               |   16
-rw-r--r--  vp9/decoder/vp9_dthread.h               |    4
-rw-r--r--  vp9/encoder/vp9_block.h                 |   12
-rw-r--r--  vp9/encoder/vp9_context_tree.c          |    8
-rw-r--r--  vp9/encoder/vp9_context_tree.h          |   12
-rw-r--r--  vp9/encoder/vp9_dct.c                   |  348
-rw-r--r--  vp9/encoder/vp9_denoiser.c              |    6
-rw-r--r--  vp9/encoder/vp9_denoiser.h              |    2
-rw-r--r--  vp9/encoder/vp9_encodeframe.c           |    3
-rw-r--r--  vp9/encoder/vp9_encodemb.c              |   47
-rw-r--r--  vp9/encoder/vp9_encoder.c               |   22
-rw-r--r--  vp9/encoder/vp9_encoder.h               |    3
-rw-r--r--  vp9/encoder/vp9_firstpass.c             |   53
-rw-r--r--  vp9/encoder/vp9_mcomp.c                 |  328
-rw-r--r--  vp9/encoder/vp9_picklpf.c               |   21
-rw-r--r--  vp9/encoder/vp9_quantize.c              |   54
-rw-r--r--  vp9/encoder/vp9_quantize.h              |   20
-rw-r--r--  vp9/encoder/vp9_rdopt.c                 |   16
-rw-r--r--  vp9/encoder/vp9_sad.c                   |  138
-rw-r--r--  vp9/encoder/vp9_speed_features.c        |   10
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.h      |    1
-rw-r--r--  vp9/encoder/vp9_tokenize.c              |    2
-rw-r--r--  vp9/encoder/vp9_variance.c              |  372
-rw-r--r--  vp9/encoder/vp9_variance.h              |   26
-rw-r--r--  vp9/vp9_cx_iface.c                      |   28
37 files changed, 3916 insertions(+), 577 deletions(-)
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 7f8c5f215..b9094ed61 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -21,6 +21,7 @@
#include "vp9/common/vp9_common_data.h"
#include "vp9/common/vp9_enums.h"
#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_scale.h"
#include "vp9/common/vp9_seg_common.h"
@@ -179,7 +180,7 @@ struct buf_2d {
};
struct macroblockd_plane {
- int16_t *dqcoeff;
+ tran_low_t *dqcoeff;
PLANE_TYPE plane_type;
int subsampling_x;
int subsampling_y;
@@ -226,11 +227,17 @@ typedef struct macroblockd {
/* mc buffer */
DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
+#if CONFIG_VP9_HIGHBITDEPTH
+ /* Bit depth: 8, 10, 12 */
+ int bd;
+ DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
+#endif
+
int lossless;
int corrupted;
- DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
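
The dqcoeff change above relies on the tran_low_t/tran_high_t pair introduced in vp9/common/vp9_idct.h, whose hunks are not reproduced in this excerpt. A minimal sketch of those typedefs, assuming the usual libvpx arrangement in which only a high-bitdepth build widens the coefficient types:

// Sketch (assumed, not quoted from this diff):
// tran_low_t  holds final transform coefficients,
// tran_high_t holds intermediate transform stages.
#if CONFIG_VP9_HIGHBITDEPTH
typedef int64_t tran_high_t;
typedef int32_t tran_low_t;
#else
typedef int32_t tran_high_t;
typedef int16_t tran_low_t;
#endif
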
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 856d41e70..b196fc527 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,14 +18,47 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
+#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
+// When CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH is 1 the transform performs
+// strict overflow wrapping to match expected hardware implementations.
+// bd of 8 uses tran_low_t with 16 bits, need to remove 16 bits
+// bd of 10 uses tran_low_t with 18 bits, need to remove 14 bits
+// bd of 12 uses tran_low_t with 20 bits, need to remove 12 bits
+// bd of x uses tran_low_t with 8 + x bits, need to remove 24 - x bits
+#define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x) (x)
+#endif // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
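
As a concrete illustration, the following standalone sketch (not part of the patch) performs the same wrapping; the left shift is done on an unsigned value to sidestep the signed-overflow undefined behavior the macro itself tolerates:

#include <stdio.h>
#include <stdint.h>

/* Same idea as WRAPLOW: keep only the low (8 + bd) bits, as a signed value. */
static int32_t wraplow(int32_t x, int bd) {
  return ((int32_t)((uint32_t)x << (24 - bd))) >> (24 - bd);
}

int main(void) {
  printf("%d\n", (int)wraplow(40000, 8));   /* -25536: wrapped into 16 bits */
  printf("%d\n", (int)wraplow(40000, 10));  /* 40000: fits in 18 bits */
  return 0;
}
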
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,
+ tran_low_t high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
+ tran_high_t trans, int bd) {
+ trans = WRAPLOW(trans);
+ switch (bd) {
+ case 8:
+ default:
+ return clamp_high(WRAPLOW(dest + trans), 0, 255);
+ case 10:
+ return clamp_high(WRAPLOW(dest + trans), 0, 1023);
+ case 12:
+ return clamp_high(WRAPLOW(dest + trans), 0, 4095);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
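
The hard-coded bounds in the switch above are just (1 << bd) - 1 for bd = 8, 10, 12; a generic equivalent (a sketch, not code from the patch) would be:

static INLINE tran_low_t clip_pixel_generic(tran_high_t value, int bd) {
  const tran_high_t max = (1 << bd) - 1;  /* 255, 1023, 4095 */
  return (tran_low_t)(value < 0 ? 0 : (value > max ? max : value));
}

The switch form keeps the common bd == 8 bounds as compile-time constants, which presumably helps the default path.
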
+
+void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
0.5 shifts per pixel. */
int i;
- int16_t output[16];
- int a1, b1, c1, d1, e1;
- const int16_t *ip = input;
- int16_t *op = output;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
for (i = 0; i < 4; i++) {
a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -70,12 +103,12 @@ void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
int i;
- int a1, e1;
- int16_t tmp[4];
- const int16_t *ip = in;
- int16_t *op = tmp;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
a1 = ip[0] >> UNIT_QUANT_SHIFT;
e1 = a1 >> 1;
@@ -96,9 +129,9 @@ void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
}
}
-static void idct4(const int16_t *input, int16_t *output) {
- int16_t step[4];
- int temp1, temp2;
+static void idct4(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
// stage 1
temp1 = (input[0] + input[2]) * cospi_16_64;
temp2 = (input[0] - input[2]) * cospi_16_64;
@@ -116,11 +149,11 @@ static void idct4(const int16_t *input, int16_t *output) {
output[3] = step[0] - step[3];
}
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[4 * 4];
- int16_t *outptr = out;
+void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[4], temp_out[4];
+ tran_low_t temp_in[4], temp_out[4];
// Rows
for (i = 0; i < 4; ++i) {
@@ -140,10 +173,11 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+ int dest_stride) {
int i;
- int a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ tran_high_t a1;
+ tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 4);
@@ -156,9 +190,9 @@ void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
}
}
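
dct_const_round_shift(), used throughout these functions, is a round-to-nearest right shift by DCT_CONST_BITS; the cospi_*_64 constants are cosines scaled by 2^14, so each multiplication adds 14 fractional bits that the shift then removes. A sketch of the usual libvpx definitions (reproduced from memory, so treat the exact form as an assumption):

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static INLINE tran_high_t dct_const_round_shift(tran_high_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}
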
-static void idct8(const int16_t *input, int16_t *output) {
- int16_t step1[8], step2[8];
- int temp1, temp2;
+static void idct8(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
// stage 1
step1[0] = input[0];
step1[2] = input[4];
@@ -201,11 +235,11 @@ static void idct8(const int16_t *input, int16_t *output) {
output[7] = step1[0] - step1[7];
}
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[8 * 8];
- int16_t *outptr = out;
+void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[8], temp_out[8];
+ tran_low_t temp_in[8], temp_out[8];
// First transform rows
for (i = 0; i < 8; ++i) {
@@ -225,10 +259,10 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
- int a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ tran_high_t a1;
+ tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 5);
for (j = 0; j < 8; ++j) {
@@ -238,13 +272,13 @@ void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-static void iadst4(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
+static void iadst4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- int x0 = input[0];
- int x1 = input[1];
- int x2 = input[2];
- int x3 = input[3];
+ tran_high_t x0 = input[0];
+ tran_high_t x1 = input[1];
+ tran_high_t x2 = input[2];
+ tran_high_t x3 = input[3];
if (!(x0 | x1 | x2 | x3)) {
output[0] = output[1] = output[2] = output[3] = 0;
@@ -280,7 +314,7 @@ static void iadst4(const int16_t *input, int16_t *output) {
output[3] = dct_const_round_shift(s3);
}
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
const transform_2d IHT_4[] = {
{ idct4, idct4 }, // DCT_DCT = 0
@@ -290,9 +324,9 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
};
int i, j;
- int16_t out[4 * 4];
- int16_t *outptr = out;
- int16_t temp_in[4], temp_out[4];
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[4], temp_out[4];
// inverse transform row vectors
for (i = 0; i < 4; ++i) {
@@ -311,17 +345,17 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+ dest[j * stride + i]);
}
}
-static void iadst8(const int16_t *input, int16_t *output) {
+static void iadst8(const tran_low_t *input, tran_low_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
- int x0 = input[7];
- int x1 = input[0];
- int x2 = input[5];
- int x3 = input[2];
- int x4 = input[3];
- int x5 = input[4];
- int x6 = input[1];
- int x7 = input[6];
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
output[0] = output[1] = output[2] = output[3] = output[4]
@@ -395,12 +429,12 @@ static const transform_2d IHT_8[] = {
{ iadst8, iadst8 } // ADST_ADST = 3
};
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
- int16_t out[8 * 8];
- int16_t *outptr = out;
- int16_t temp_in[8], temp_out[8];
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = IHT_8[tx_type];
// inverse transform row vectors
@@ -421,11 +455,11 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
}
}
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[8 * 8] = { 0 };
- int16_t *outptr = out;
+void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[8], temp_out[8];
+ tran_low_t temp_in[8], temp_out[8];
// First transform rows
// only first 4 rows have non-zero coefs
@@ -446,9 +480,9 @@ void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-static void idct16(const int16_t *input, int16_t *output) {
- int16_t step1[16], step2[16];
- int temp1, temp2;
+static void idct16(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
// stage 1
step1[0] = input[0/2];
@@ -611,11 +645,12 @@ static void idct16(const int16_t *input, int16_t *output) {
output[15] = step2[0] - step2[15];
}
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[16 * 16];
- int16_t *outptr = out;
+void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[16], temp_out[16];
+ tran_low_t temp_in[16], temp_out[16];
// First transform rows
for (i = 0; i < 16; ++i) {
@@ -635,25 +670,26 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-static void iadst16(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
- int x0 = input[15];
- int x1 = input[0];
- int x2 = input[13];
- int x3 = input[2];
- int x4 = input[11];
- int x5 = input[4];
- int x6 = input[9];
- int x7 = input[6];
- int x8 = input[7];
- int x9 = input[8];
- int x10 = input[5];
- int x11 = input[10];
- int x12 = input[3];
- int x13 = input[12];
- int x14 = input[1];
- int x15 = input[14];
+static void iadst16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
| x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
@@ -813,12 +849,12 @@ static const transform_2d IHT_16[] = {
{ iadst16, iadst16 } // ADST_ADST = 3
};
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
int i, j;
- int16_t out[16 * 16];
- int16_t *outptr = out;
- int16_t temp_in[16], temp_out[16];
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = IHT_16[tx_type];
// Rows
@@ -839,11 +875,12 @@ void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
}
}
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[16 * 16] = { 0 };
- int16_t *outptr = out;
+void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[16], temp_out[16];
+ tran_low_t temp_in[16], temp_out[16];
// First transform rows. Since all non-zero dct coefficients are in
// upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -864,10 +901,10 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
- int a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ tran_high_t a1;
+ tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
for (j = 0; j < 16; ++j) {
@@ -877,9 +914,9 @@ void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-static void idct32(const int16_t *input, int16_t *output) {
- int16_t step1[32], step2[32];
- int temp1, temp2;
+static void idct32(const tran_low_t *input, tran_low_t *output) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
// stage 1
step1[0] = input[0];
@@ -1244,11 +1281,12 @@ static void idct32(const int16_t *input, int16_t *output) {
output[31] = step1[0] - step1[31];
}
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[32 * 32];
- int16_t *outptr = out;
+void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[32], temp_out[32];
+ tran_low_t temp_in[32], temp_out[32];
// Rows
for (i = 0; i < 32; ++i) {
@@ -1265,7 +1303,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
if (zero_coeff[0] | zero_coeff[1])
idct32(input, outptr);
else
- vpx_memset(outptr, 0, sizeof(int16_t) * 32);
+ vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
input += 32;
outptr += 32;
}
@@ -1281,11 +1319,12 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
- int16_t out[32 * 32] = {0};
- int16_t *outptr = out;
+void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ tran_low_t out[32 * 32] = {0};
+ tran_low_t *outptr = out;
int i, j;
- int16_t temp_in[32], temp_out[32];
+ tran_low_t temp_in[32], temp_out[32];
// Rows
// only upper-left 8x8 has non-zero coeff
@@ -1306,11 +1345,11 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
- int a1;
+ tran_high_t a1;
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
a1 = ROUND_POWER_OF_TWO(out, 6);
@@ -1322,7 +1361,8 @@ void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
// idct
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
if (eob > 1)
vp9_idct4x4_16_add(input, dest, stride);
else
@@ -1330,14 +1370,16 @@ void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
}
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
if (eob > 1)
vp9_iwht4x4_16_add(input, dest, stride);
else
vp9_iwht4x4_1_add(input, dest, stride);
}
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob) {
// If dc is 1, then input[0] is the reconstructed value and does not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.
@@ -1354,7 +1396,7 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
vp9_idct8x8_64_add(input, dest, stride);
}
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
@@ -1367,7 +1409,7 @@ void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
vp9_idct16x16_256_add(input, dest, stride);
}
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob) {
if (eob == 1)
vp9_idct32x32_1_add(input, dest, stride);
@@ -1379,7 +1421,7 @@ void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
}
// iht
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob) {
if (tx_type == DCT_DCT)
vp9_idct4x4_add(input, dest, stride, eob);
@@ -1387,7 +1429,7 @@ void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
vp9_iht4x4_16_add(input, dest, stride, tx_type);
}
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob) {
if (tx_type == DCT_DCT) {
vp9_idct8x8_add(input, dest, stride, eob);
@@ -1396,7 +1438,7 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
}
}
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob) {
if (tx_type == DCT_DCT) {
vp9_idct16x16_add(input, dest, stride, eob);
@@ -1404,3 +1446,1433 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
vp9_iht16x16_256_add(input, dest, stride, tx_type);
}
}
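
Every high-bitdepth function added below takes a uint8_t *dest8 so its prototype matches the 8-bit version, then immediately recovers the real uint16_t frame buffer via CONVERT_TO_SHORTPTR. A sketch of that pointer-punning pair as libvpx conventionally defines it (an assumption here; the macros are not shown in this diff):

/* High-bitdepth buffers are uint16_t arrays carried around as uint8_t*
 * with the address pre-halved, so a shift recovers the true pointer. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
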
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+ 0.5 shifts per pixel. */
+ int i;
+ tran_low_t output[16];
+ tran_high_t a1, b1, c1, d1, e1;
+ const tran_low_t *ip = input;
+ tran_low_t *op = output;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ c1 = ip[1] >> UNIT_QUANT_SHIFT;
+ d1 = ip[2] >> UNIT_QUANT_SHIFT;
+ b1 = ip[3] >> UNIT_QUANT_SHIFT;
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ op[0] = WRAPLOW(a1);
+ op[1] = WRAPLOW(b1);
+ op[2] = WRAPLOW(c1);
+ op[3] = WRAPLOW(d1);
+ ip += 4;
+ op += 4;
+ }
+
+ ip = output;
+ for (i = 0; i < 4; i++) {
+ a1 = ip[4 * 0];
+ c1 = ip[4 * 1];
+ d1 = ip[4 * 2];
+ b1 = ip[4 * 3];
+ a1 += c1;
+ d1 -= b1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= b1;
+ d1 += c1;
+ dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);
+ dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);
+ dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);
+ dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);
+
+ ip++;
+ dest++;
+ }
+}
+
+static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step[4];
+ tran_high_t temp1, temp2;
+ (void) bd;
+ // stage 1
+ temp1 = (input[0] + input[2]) * cospi_16_64;
+ temp2 = (input[0] - input[2]) * cospi_16_64;
+ step[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step[3] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ output[0] = WRAPLOW(step[0] + step[3]);
+ output[1] = WRAPLOW(step[1] + step[2]);
+ output[2] = WRAPLOW(step[1] - step[2]);
+ output[3] = WRAPLOW(step[0] - step[3]);
+}
+
+void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1, e1;
+ tran_low_t tmp[4];
+ const tran_low_t *ip = in;
+ tran_low_t *op = tmp;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ (void) bd;
+
+ a1 = ip[0] >> UNIT_QUANT_SHIFT;
+ e1 = a1 >> 1;
+ a1 -= e1;
+ op[0] = WRAPLOW(a1);
+ op[1] = op[2] = op[3] = WRAPLOW(e1);
+
+ ip = tmp;
+ for (i = 0; i < 4; i++) {
+ e1 = ip[0] >> 1;
+ a1 = ip[0] - e1;
+ dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);
+ dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);
+ dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);
+ dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);
+ ip++;
+ dest++;
+ }
+}
+
+void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[4], temp_out[4];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 4; ++i) {
+ high_idct4(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Columns
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ high_idct4(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+}
+
+void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int dest_stride, int bd) {
+ int i;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ for (i = 0; i < 4; i++) {
+ dest[0] = clip_pixel_bd_high(dest[0], a1, bd);
+ dest[1] = clip_pixel_bd_high(dest[1], a1, bd);
+ dest[2] = clip_pixel_bd_high(dest[2], a1, bd);
+ dest[3] = clip_pixel_bd_high(dest[3], a1, bd);
+ dest += dest_stride;
+ }
+}
+
+static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[8], step2[8];
+ tran_high_t temp1, temp2;
+ // stage 1
+ step1[0] = input[0];
+ step1[2] = input[4];
+ step1[1] = input[2];
+ step1[3] = input[6];
+ temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2 & stage 3 - even half
+ high_idct4(step1, step1, bd);
+
+ // stage 2 - odd half
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ // stage 3 - odd half
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ // stage 4
+ output[0] = WRAPLOW(step1[0] + step1[7]);
+ output[1] = WRAPLOW(step1[1] + step1[6]);
+ output[2] = WRAPLOW(step1[2] + step1[5]);
+ output[3] = WRAPLOW(step1[3] + step1[4]);
+ output[4] = WRAPLOW(step1[3] - step1[4]);
+ output[5] = WRAPLOW(step1[2] - step1[5]);
+ output[6] = WRAPLOW(step1[1] - step1[6]);
+ output[7] = WRAPLOW(step1[0] - step1[7]);
+}
+
+void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ for (i = 0; i < 8; ++i) {
+ high_idct8(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ high_idct8(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5),
+ bd);
+ }
+}
+
+void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i)
+ dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_high_t x0 = input[0];
+ tran_high_t x1 = input[1];
+ tran_high_t x2 = input[2];
+ tran_high_t x3 = input[3];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3)) {
+ vpx_memset(output, 0, 4 * sizeof(*output));
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ x0 = s0 + s3 + s5;
+ x1 = s1 - s4 - s6;
+ x2 = sinpi_3_9 * s7;
+ x3 = s2;
+
+ s0 = x0 + x3;
+ s1 = x1 + x3;
+ s2 = x2;
+ s3 = x0 + x1 - x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // dct_const_round_shift() below drops DCT_CONST_BITS (14b) of that range,
+ // hence the output bit depth is 29b - 14b = 15b.
+ output[0] = WRAPLOW(dct_const_round_shift(s0));
+ output[1] = WRAPLOW(dct_const_round_shift(s1));
+ output[2] = WRAPLOW(dct_const_round_shift(s2));
+ output[3] = WRAPLOW(dct_const_round_shift(s3));
+}
+
+void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ const high_transform_2d IHT_4[] = {
+ { high_idct4, high_idct4 }, // DCT_DCT = 0
+ { high_iadst4, high_idct4 }, // ADST_DCT = 1
+ { high_idct4, high_iadst4 }, // DCT_ADST = 2
+ { high_iadst4, high_iadst4 } // ADST_ADST = 3
+ };
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ int i, j;
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[4], temp_out[4];
+
+ // Inverse transform row vectors.
+ for (i = 0; i < 4; ++i) {
+ IHT_4[tx_type].rows(input, outptr, bd);
+ input += 4;
+ outptr += 4;
+ }
+
+ // Inverse transform column vectors.
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ IHT_4[tx_type].cols(temp_in, temp_out, bd);
+ for (j = 0; j < 4; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+ }
+}
+
+static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ vpx_memset(output, 0, 8 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+ x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+ x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+ x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x4);
+ output[2] = WRAPLOW(x6);
+ output[3] = WRAPLOW(-x2);
+ output[4] = WRAPLOW(x3);
+ output[5] = WRAPLOW(-x7);
+ output[6] = WRAPLOW(x5);
+ output[7] = WRAPLOW(-x1);
+}
+
+static const high_transform_2d HIGH_IHT_8[] = {
+ { high_idct8, high_idct8 }, // DCT_DCT = 0
+ { high_iadst8, high_idct8 }, // ADST_DCT = 1
+ { high_idct8, high_iadst8 }, // DCT_ADST = 2
+ { high_iadst8, high_iadst8 } // ADST_ADST = 3
+};
+
+void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ int i, j;
+ tran_low_t out[8 * 8];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
+ const high_transform_2d ht = HIGH_IHT_8[tx_type];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Inverse transform row vectors.
+ for (i = 0; i < 8; ++i) {
+ ht.rows(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Inverse transform column vectors.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ ht.cols(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+}
+
+void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[8], temp_out[8];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ // Only the first 4 rows have non-zero coefs.
+ for (i = 0; i < 4; ++i) {
+ high_idct8(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+ // Then transform columns.
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+ high_idct8(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+}
+
+static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[16], step2[16];
+ tran_high_t temp1, temp2;
+ (void) bd;
+
+ // stage 1
+ step1[0] = input[0/2];
+ step1[1] = input[16/2];
+ step1[2] = input[8/2];
+ step1[3] = input[24/2];
+ step1[4] = input[4/2];
+ step1[5] = input[20/2];
+ step1[6] = input[12/2];
+ step1[7] = input[28/2];
+ step1[8] = input[2/2];
+ step1[9] = input[18/2];
+ step1[10] = input[10/2];
+ step1[11] = input[26/2];
+ step1[12] = input[6/2];
+ step1[13] = input[22/2];
+ step1[14] = input[14/2];
+ step1[15] = input[30/2];
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ output[0] = WRAPLOW(step2[0] + step2[15]);
+ output[1] = WRAPLOW(step2[1] + step2[14]);
+ output[2] = WRAPLOW(step2[2] + step2[13]);
+ output[3] = WRAPLOW(step2[3] + step2[12]);
+ output[4] = WRAPLOW(step2[4] + step2[11]);
+ output[5] = WRAPLOW(step2[5] + step2[10]);
+ output[6] = WRAPLOW(step2[6] + step2[9]);
+ output[7] = WRAPLOW(step2[7] + step2[8]);
+ output[8] = WRAPLOW(step2[7] - step2[8]);
+ output[9] = WRAPLOW(step2[6] - step2[9]);
+ output[10] = WRAPLOW(step2[5] - step2[10]);
+ output[11] = WRAPLOW(step2[4] - step2[11]);
+ output[12] = WRAPLOW(step2[3] - step2[12]);
+ output[13] = WRAPLOW(step2[2] - step2[13]);
+ output[14] = WRAPLOW(step2[1] - step2[14]);
+ output[15] = WRAPLOW(step2[0] - step2[15]);
+}
+
+void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ for (i = 0; i < 16; ++i) {
+ high_idct16(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ high_idct16(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
+static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
+ (void) bd;
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ vpx_memset(output, 0, 16 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+ x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+ x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+ x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+ x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+ x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+ x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+ x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = WRAPLOW(s0 + s4);
+ x1 = WRAPLOW(s1 + s5);
+ x2 = WRAPLOW(s2 + s6);
+ x3 = WRAPLOW(s3 + s7);
+ x4 = WRAPLOW(s0 - s4);
+ x5 = WRAPLOW(s1 - s5);
+ x6 = WRAPLOW(s2 - s6);
+ x7 = WRAPLOW(s3 - s7);
+ x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+ x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+ x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+ x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+ x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = WRAPLOW(s0 + s2);
+ x1 = WRAPLOW(s1 + s3);
+ x2 = WRAPLOW(s0 - s2);
+ x3 = WRAPLOW(s1 - s3);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+ x8 = WRAPLOW(s8 + s10);
+ x9 = WRAPLOW(s9 + s11);
+ x10 = WRAPLOW(s8 - s10);
+ x11 = WRAPLOW(s9 - s11);
+ x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+ x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+ x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+ x10 = WRAPLOW(dct_const_round_shift(s10));
+ x11 = WRAPLOW(dct_const_round_shift(s11));
+ x14 = WRAPLOW(dct_const_round_shift(s14));
+ x15 = WRAPLOW(dct_const_round_shift(s15));
+
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x8);
+ output[2] = WRAPLOW(x12);
+ output[3] = WRAPLOW(-x4);
+ output[4] = WRAPLOW(x6);
+ output[5] = WRAPLOW(x14);
+ output[6] = WRAPLOW(x10);
+ output[7] = WRAPLOW(x2);
+ output[8] = WRAPLOW(x3);
+ output[9] = WRAPLOW(x11);
+ output[10] = WRAPLOW(x15);
+ output[11] = WRAPLOW(x7);
+ output[12] = WRAPLOW(x5);
+ output[13] = WRAPLOW(-x13);
+ output[14] = WRAPLOW(x9);
+ output[15] = WRAPLOW(-x1);
+}
+
+static const high_transform_2d HIGH_IHT_16[] = {
+ { high_idct16, high_idct16 }, // DCT_DCT = 0
+ { high_iadst16, high_idct16 }, // ADST_DCT = 1
+ { high_idct16, high_iadst16 }, // DCT_ADST = 2
+ { high_iadst16, high_iadst16 } // ADST_ADST = 3
+};
+
+void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int tx_type, int bd) {
+ int i, j;
+ tran_low_t out[16 * 16];
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+ const high_transform_2d ht = HIGH_IHT_16[tx_type];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 16; ++i) {
+ ht.rows(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ ht.cols(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
+void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[16], temp_out[16];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows. Since all non-zero dct coefficients are in
+ // upper-left 4x4 area, we only need to calculate first 4 rows here.
+ for (i = 0; i < 4; ++i) {
+ high_idct16(input, outptr, bd);
+ input += 16;
+ outptr += 16;
+ }
+
+ // Then transform columns.
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j*16 + i];
+ high_idct16(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
+void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i)
+ dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_low_t step1[32], step2[32];
+ tran_high_t temp1, temp2;
+ (void) bd;
+
+ // stage 1
+ step1[0] = input[0];
+ step1[1] = input[16];
+ step1[2] = input[8];
+ step1[3] = input[24];
+ step1[4] = input[4];
+ step1[5] = input[20];
+ step1[6] = input[12];
+ step1[7] = input[28];
+ step1[8] = input[2];
+ step1[9] = input[18];
+ step1[10] = input[10];
+ step1[11] = input[26];
+ step1[12] = input[6];
+ step1[13] = input[22];
+ step1[14] = input[14];
+ step1[15] = input[30];
+
+ temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+ temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+ step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[31] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+ temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+ temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+ temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+ temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+ temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+ temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+ temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+
+ temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+ temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+ step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+ temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+ temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+ temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+ temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step2[16] = WRAPLOW(step1[16] + step1[17]);
+ step2[17] = WRAPLOW(step1[16] - step1[17]);
+ step2[18] = WRAPLOW(-step1[18] + step1[19]);
+ step2[19] = WRAPLOW(step1[18] + step1[19]);
+ step2[20] = WRAPLOW(step1[20] + step1[21]);
+ step2[21] = WRAPLOW(step1[20] - step1[21]);
+ step2[22] = WRAPLOW(-step1[22] + step1[23]);
+ step2[23] = WRAPLOW(step1[22] + step1[23]);
+ step2[24] = WRAPLOW(step1[24] + step1[25]);
+ step2[25] = WRAPLOW(step1[24] - step1[25]);
+ step2[26] = WRAPLOW(-step1[26] + step1[27]);
+ step2[27] = WRAPLOW(step1[26] + step1[27]);
+ step2[28] = WRAPLOW(step1[28] + step1[29]);
+ step2[29] = WRAPLOW(step1[28] - step1[29]);
+ step2[30] = WRAPLOW(-step1[30] + step1[31]);
+ step2[31] = WRAPLOW(step1[30] + step1[31]);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+
+ temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+ temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+ step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+ temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+ step1[8] = WRAPLOW(step2[8] + step2[9]);
+ step1[9] = WRAPLOW(step2[8] - step2[9]);
+ step1[10] = WRAPLOW(-step2[10] + step2[11]);
+ step1[11] = WRAPLOW(step2[10] + step2[11]);
+ step1[12] = WRAPLOW(step2[12] + step2[13]);
+ step1[13] = WRAPLOW(step2[12] - step2[13]);
+ step1[14] = WRAPLOW(-step2[14] + step2[15]);
+ step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[31] = step2[31];
+ temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+ temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+ step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+ temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+ temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+ temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+
+ // stage 4
+ temp1 = (step1[0] + step1[1]) * cospi_16_64;
+ temp2 = (step1[0] - step1[1]) * cospi_16_64;
+ step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+ temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+ step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[4] = WRAPLOW(step1[4] + step1[5]);
+ step2[5] = WRAPLOW(step1[4] - step1[5]);
+ step2[6] = WRAPLOW(-step1[6] + step1[7]);
+ step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+ step2[8] = step1[8];
+ step2[15] = step1[15];
+ temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+ temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+ step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+ temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+
+ step2[16] = WRAPLOW(step1[16] + step1[19]);
+ step2[17] = WRAPLOW(step1[17] + step1[18]);
+ step2[18] = WRAPLOW(step1[17] - step1[18]);
+ step2[19] = WRAPLOW(step1[16] - step1[19]);
+ step2[20] = WRAPLOW(-step1[20] + step1[23]);
+ step2[21] = WRAPLOW(-step1[21] + step1[22]);
+ step2[22] = WRAPLOW(step1[21] + step1[22]);
+ step2[23] = WRAPLOW(step1[20] + step1[23]);
+
+ step2[24] = WRAPLOW(step1[24] + step1[27]);
+ step2[25] = WRAPLOW(step1[25] + step1[26]);
+ step2[26] = WRAPLOW(step1[25] - step1[26]);
+ step2[27] = WRAPLOW(step1[24] - step1[27]);
+ step2[28] = WRAPLOW(-step1[28] + step1[31]);
+ step2[29] = WRAPLOW(-step1[29] + step1[30]);
+ step2[30] = WRAPLOW(step1[29] + step1[30]);
+ step2[31] = WRAPLOW(step1[28] + step1[31]);
+
+ // stage 5
+ step1[0] = WRAPLOW(step2[0] + step2[3]);
+ step1[1] = WRAPLOW(step2[1] + step2[2]);
+ step1[2] = WRAPLOW(step2[1] - step2[2]);
+ step1[3] = WRAPLOW(step2[0] - step2[3]);
+ step1[4] = step2[4];
+ temp1 = (step2[6] - step2[5]) * cospi_16_64;
+ temp2 = (step2[5] + step2[6]) * cospi_16_64;
+ step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[7] = step2[7];
+
+ step1[8] = WRAPLOW(step2[8] + step2[11]);
+ step1[9] = WRAPLOW(step2[9] + step2[10]);
+ step1[10] = WRAPLOW(step2[9] - step2[10]);
+ step1[11] = WRAPLOW(step2[8] - step2[11]);
+ step1[12] = WRAPLOW(-step2[12] + step2[15]);
+ step1[13] = WRAPLOW(-step2[13] + step2[14]);
+ step1[14] = WRAPLOW(step2[13] + step2[14]);
+ step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+ temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+ step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+ temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+ step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+ temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+ temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[22] = step2[22];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[25];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // stage 6
+ step2[0] = WRAPLOW(step1[0] + step1[7]);
+ step2[1] = WRAPLOW(step1[1] + step1[6]);
+ step2[2] = WRAPLOW(step1[2] + step1[5]);
+ step2[3] = WRAPLOW(step1[3] + step1[4]);
+ step2[4] = WRAPLOW(step1[3] - step1[4]);
+ step2[5] = WRAPLOW(step1[2] - step1[5]);
+ step2[6] = WRAPLOW(step1[1] - step1[6]);
+ step2[7] = WRAPLOW(step1[0] - step1[7]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+ temp2 = (step1[10] + step1[13]) * cospi_16_64;
+ step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+ temp2 = (step1[11] + step1[12]) * cospi_16_64;
+ step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+ step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+ step2[14] = WRAPLOW(step1[14]);
+ step2[15] = WRAPLOW(step1[15]);
+
+ step2[16] = WRAPLOW(step1[16] + step1[23]);
+ step2[17] = WRAPLOW(step1[17] + step1[22]);
+ step2[18] = WRAPLOW(step1[18] + step1[21]);
+ step2[19] = WRAPLOW(step1[19] + step1[20]);
+ step2[20] = WRAPLOW(step1[19] - step1[20]);
+ step2[21] = WRAPLOW(step1[18] - step1[21]);
+ step2[22] = WRAPLOW(step1[17] - step1[22]);
+ step2[23] = WRAPLOW(step1[16] - step1[23]);
+
+ step2[24] = WRAPLOW(-step1[24] + step1[31]);
+ step2[25] = WRAPLOW(-step1[25] + step1[30]);
+ step2[26] = WRAPLOW(-step1[26] + step1[29]);
+ step2[27] = WRAPLOW(-step1[27] + step1[28]);
+ step2[28] = WRAPLOW(step1[27] + step1[28]);
+ step2[29] = WRAPLOW(step1[26] + step1[29]);
+ step2[30] = WRAPLOW(step1[25] + step1[30]);
+ step2[31] = WRAPLOW(step1[24] + step1[31]);
+
+ // stage 7
+ step1[0] = WRAPLOW(step2[0] + step2[15]);
+ step1[1] = WRAPLOW(step2[1] + step2[14]);
+ step1[2] = WRAPLOW(step2[2] + step2[13]);
+ step1[3] = WRAPLOW(step2[3] + step2[12]);
+ step1[4] = WRAPLOW(step2[4] + step2[11]);
+ step1[5] = WRAPLOW(step2[5] + step2[10]);
+ step1[6] = WRAPLOW(step2[6] + step2[9]);
+ step1[7] = WRAPLOW(step2[7] + step2[8]);
+ step1[8] = WRAPLOW(step2[7] - step2[8]);
+ step1[9] = WRAPLOW(step2[6] - step2[9]);
+ step1[10] = WRAPLOW(step2[5] - step2[10]);
+ step1[11] = WRAPLOW(step2[4] - step2[11]);
+ step1[12] = WRAPLOW(step2[3] - step2[12]);
+ step1[13] = WRAPLOW(step2[2] - step2[13]);
+ step1[14] = WRAPLOW(step2[1] - step2[14]);
+ step1[15] = WRAPLOW(step2[0] - step2[15]);
+
+ step1[16] = step2[16];
+ step1[17] = step2[17];
+ step1[18] = step2[18];
+ step1[19] = step2[19];
+ temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+ temp2 = (step2[20] + step2[27]) * cospi_16_64;
+ step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+ temp2 = (step2[21] + step2[26]) * cospi_16_64;
+ step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+ temp2 = (step2[22] + step2[25]) * cospi_16_64;
+ step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+ temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+ temp2 = (step2[23] + step2[24]) * cospi_16_64;
+ step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+ step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+ step1[28] = step2[28];
+ step1[29] = step2[29];
+ step1[30] = step2[30];
+ step1[31] = step2[31];
+
+ // final stage
+ output[0] = WRAPLOW(step1[0] + step1[31]);
+ output[1] = WRAPLOW(step1[1] + step1[30]);
+ output[2] = WRAPLOW(step1[2] + step1[29]);
+ output[3] = WRAPLOW(step1[3] + step1[28]);
+ output[4] = WRAPLOW(step1[4] + step1[27]);
+ output[5] = WRAPLOW(step1[5] + step1[26]);
+ output[6] = WRAPLOW(step1[6] + step1[25]);
+ output[7] = WRAPLOW(step1[7] + step1[24]);
+ output[8] = WRAPLOW(step1[8] + step1[23]);
+ output[9] = WRAPLOW(step1[9] + step1[22]);
+ output[10] = WRAPLOW(step1[10] + step1[21]);
+ output[11] = WRAPLOW(step1[11] + step1[20]);
+ output[12] = WRAPLOW(step1[12] + step1[19]);
+ output[13] = WRAPLOW(step1[13] + step1[18]);
+ output[14] = WRAPLOW(step1[14] + step1[17]);
+ output[15] = WRAPLOW(step1[15] + step1[16]);
+ output[16] = WRAPLOW(step1[15] - step1[16]);
+ output[17] = WRAPLOW(step1[14] - step1[17]);
+ output[18] = WRAPLOW(step1[13] - step1[18]);
+ output[19] = WRAPLOW(step1[12] - step1[19]);
+ output[20] = WRAPLOW(step1[11] - step1[20]);
+ output[21] = WRAPLOW(step1[10] - step1[21]);
+ output[22] = WRAPLOW(step1[9] - step1[22]);
+ output[23] = WRAPLOW(step1[8] - step1[23]);
+ output[24] = WRAPLOW(step1[7] - step1[24]);
+ output[25] = WRAPLOW(step1[6] - step1[25]);
+ output[26] = WRAPLOW(step1[5] - step1[26]);
+ output[27] = WRAPLOW(step1[4] - step1[27]);
+ output[28] = WRAPLOW(step1[3] - step1[28]);
+ output[29] = WRAPLOW(step1[2] - step1[29]);
+ output[30] = WRAPLOW(step1[1] - step1[30]);
+ output[31] = WRAPLOW(step1[0] - step1[31]);
+}
+
+void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[32 * 32];
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ tran_low_t zero_coeff[16];
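+    // OR-reduce the 32 coefficients of this row in successive halving
+    // passes; if the final OR is zero the whole row is zero and the row
+    // transform can be skipped.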
+ for (j = 0; j < 16; ++j)
+ zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+ for (j = 0; j < 8; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 4; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+ for (j = 0; j < 2; ++j)
+ zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+ if (zero_coeff[0] | zero_coeff[1])
+ high_idct32(input, outptr, bd);
+ else
+ vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
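+  // clip_pixel_bd_high is expected to add the rounded residual to the
+  // prediction and clamp the result to the bd-bit pixel range.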
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ high_idct32(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
+void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ tran_low_t out[32 * 32] = {0};
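+  // out[] must be zero-initialized: only the first 8 rows are transformed
+  // below, but the column pass reads all 32.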
+ tran_low_t *outptr = out;
+ int i, j;
+ tran_low_t temp_in[32], temp_out[32];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // Rows
+ // Only upper-left 8x8 has non-zero coeff.
+ for (i = 0; i < 8; ++i) {
+ high_idct32(input, outptr, bd);
+ input += 32;
+ outptr += 32;
+ }
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ high_idct32(temp_in, temp_out, bd);
+ for (j = 0; j < 32; ++j)
+ dest[j * stride + i] = clip_pixel_bd_high(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ }
+}
+
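+// For a DC-only block every output pixel receives the same offset a1:
+// input[0] is scaled by cospi_16_64 once per pass and rounded by the final
+// shift of 6, so the per-pixel butterflies can be skipped entirely.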
+void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ int a1;
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+
+ for (j = 0; j < 32; ++j) {
+ for (i = 0; i < 32; ++i)
+ dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+ dest += stride;
+ }
+}
+
+// idct
+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+ if (eob > 1)
+ vp9_high_idct4x4_16_add(input, dest, stride, bd);
+ else
+ vp9_high_idct4x4_1_add(input, dest, stride, bd);
+}
+
+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+ if (eob > 1)
+ vp9_high_iwht4x4_16_add(input, dest, stride, bd);
+ else
+ vp9_high_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+  // If dc is 1, then input[0] is the reconstructed value and does not need
+  // dequantization. Also, when dc is 1, it is counted in eobs, namely
+  // eobs >= 1.
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): The "eobs = 1" case is also handled in
+  // vp9_short_idct8x8_c. Combine that code with this.
+ // DC only DCT coefficient
+ if (eob == 1) {
+ vp9_high_idct8x8_1_add(input, dest, stride, bd);
+ } else if (eob <= 10) {
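+    // With the default scan the first 10 coefficients fall within the
+    // upper-left 4x4 of the 8x8 block, which the _10 variant exploits.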
+ vp9_high_idct8x8_10_add(input, dest, stride, bd);
+ } else {
+ vp9_high_idct8x8_64_add(input, dest, stride, bd);
+ }
+}
+
+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
+ // The calculation can be simplified if there are not many non-zero dct
+ // coefficients. Use eobs to separate different cases.
+ // DC only DCT coefficient.
+ if (eob == 1) {
+ vp9_high_idct16x16_1_add(input, dest, stride, bd);
+ } else if (eob <= 10) {
+ vp9_high_idct16x16_10_add(input, dest, stride, bd);
+ } else {
+ vp9_high_idct16x16_256_add(input, dest, stride, bd);
+ }
+}
+
+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd) {
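+  // eob counts coefficients in scan order, so a small eob bounds how far the
+  // non-zero coefficients can extend into the block.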
+  if (eob == 1) {
+    vp9_high_idct32x32_1_add(input, dest, stride, bd);
+  } else if (eob <= 34) {
+    // Non-zero coeff only in upper-left 8x8.
+    vp9_high_idct32x32_34_add(input, dest, stride, bd);
+ } else {
+ vp9_high_idct32x32_1024_add(input, dest, stride, bd);
+ }
+}
+
+// iht
+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd) {
+ if (tx_type == DCT_DCT)
+ vp9_high_idct4x4_add(input, dest, stride, eob, bd);
+ else
+ vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd);
+}
+
+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd) {
+ if (tx_type == DCT_DCT) {
+ vp9_high_idct8x8_add(input, dest, stride, eob, bd);
+ } else {
+ vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd);
+ }
+}
+
+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd) {
+ if (tx_type == DCT_DCT) {
+ vp9_high_idct16x16_add(input, dest, stride, eob, bd);
+ } else {
+ vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 7f595e1cc..694be3cf9 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -36,52 +36,69 @@ extern "C" {
#define dual_set_epi16(a, b) \
_mm_set_epi16(b, b, b, b, a, a, a, a)
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif
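+// With 12-bit input the coefficients reach 20 bits; products with the
+// ~15-bit cosine constants below can exceed 32 bits, hence the 64-bit
+// tran_high_t for intermediate stages.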
+
// Constants:
// for (int i = 1; i< 32; ++i)
// printf("static const int cospi_%d_64 = %.0f;\n", i,
// round(16384 * cos(i*M_PI/64)));
// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const int cospi_1_64 = 16364;
-static const int cospi_2_64 = 16305;
-static const int cospi_3_64 = 16207;
-static const int cospi_4_64 = 16069;
-static const int cospi_5_64 = 15893;
-static const int cospi_6_64 = 15679;
-static const int cospi_7_64 = 15426;
-static const int cospi_8_64 = 15137;
-static const int cospi_9_64 = 14811;
-static const int cospi_10_64 = 14449;
-static const int cospi_11_64 = 14053;
-static const int cospi_12_64 = 13623;
-static const int cospi_13_64 = 13160;
-static const int cospi_14_64 = 12665;
-static const int cospi_15_64 = 12140;
-static const int cospi_16_64 = 11585;
-static const int cospi_17_64 = 11003;
-static const int cospi_18_64 = 10394;
-static const int cospi_19_64 = 9760;
-static const int cospi_20_64 = 9102;
-static const int cospi_21_64 = 8423;
-static const int cospi_22_64 = 7723;
-static const int cospi_23_64 = 7005;
-static const int cospi_24_64 = 6270;
-static const int cospi_25_64 = 5520;
-static const int cospi_26_64 = 4756;
-static const int cospi_27_64 = 3981;
-static const int cospi_28_64 = 3196;
-static const int cospi_29_64 = 2404;
-static const int cospi_30_64 = 1606;
-static const int cospi_31_64 = 804;
+static const tran_high_t cospi_1_64 = 16364;
+static const tran_high_t cospi_2_64 = 16305;
+static const tran_high_t cospi_3_64 = 16207;
+static const tran_high_t cospi_4_64 = 16069;
+static const tran_high_t cospi_5_64 = 15893;
+static const tran_high_t cospi_6_64 = 15679;
+static const tran_high_t cospi_7_64 = 15426;
+static const tran_high_t cospi_8_64 = 15137;
+static const tran_high_t cospi_9_64 = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const int sinpi_1_9 = 5283;
-static const int sinpi_2_9 = 9929;
-static const int sinpi_3_9 = 13377;
-static const int sinpi_4_9 = 15212;
-
-static INLINE int dct_const_round_shift(int input) {
- int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
+
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
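+  // ROUND_POWER_OF_TWO adds 1 << (DCT_CONST_BITS - 1) before shifting right
+  // by DCT_CONST_BITS, i.e. a rounded descale of the fixed-point product.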
+#if CONFIG_VP9_HIGHBITDEPTH
+ // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+ // stay within the ranges:
+ // - 8 bit: signed 16 bit integer
+ // - 10 bit: signed 18 bit integer
+ // - 12 bit: signed 20 bit integer
+#elif CONFIG_COEFFICIENT_RANGE_CHECKING
// For valid VP9 input streams, intermediate stage coefficients should always
// stay within the range of a signed 16 bit integer. Coefficients can go out
// of this range for invalid/corrupt VP9 streams. However, strictly checking
@@ -91,32 +108,59 @@ static INLINE int dct_const_round_shift(int input) {
assert(INT16_MIN <= rv);
assert(rv <= INT16_MAX);
#endif
- return (int16_t)rv;
+ return (tran_low_t)rv;
}
-typedef void (*transform_1d)(const int16_t*, int16_t*);
+typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
typedef struct {
transform_1d cols, rows; // vertical and horizontal
} transform_2d;
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*high_transform_1d)(const tran_low_t*, tran_low_t*, int bd);
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int
+typedef struct {
+ high_transform_1d cols, rows; // vertical and horizontal
+} high_transform_2d;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob);
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int
eob);
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
int eob);
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
-
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd);
+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd);
+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+ uint8_t *dest, int stride, int eob, int bd);
+#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 3b39d4274..4d03c4dcb 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1193,7 +1193,7 @@ void vp9_filter_block_plane(VP9_COMMON *const cm,
}
}
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
VP9_COMMON *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only) {
@@ -1247,9 +1247,8 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
y_only);
}
-int vp9_loop_filter_worker(void *arg1, void *arg2) {
- LFWorkerData *const lf_data = (LFWorkerData*)arg1;
- (void)arg2;
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+ (void)unused;
vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only);
return 1;
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index 6fa2773e5..69e7dd08c 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -111,13 +111,13 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
int y_only, int partial_frame);
// Apply the loop filter to [start, stop) macro block rows in frame_buffer.
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
struct VP9Common *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only);
typedef struct LoopFilterWorkerData {
- const YV12_BUFFER_CONFIG *frame_buffer;
+ YV12_BUFFER_CONFIG *frame_buffer;
struct VP9Common *cm;
struct macroblockd_plane planes[MAX_MB_PLANE];
@@ -129,8 +129,8 @@ typedef struct LoopFilterWorkerData {
int num_lf_workers;
} LFWorkerData;
-// Operates on the rows described by LFWorkerData passed as 'arg1'.
-int vp9_loop_filter_worker(void *arg1, void *arg2);
+// Operates on the rows described by 'lf_data'.
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 667e057b6..32bcf9a77 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -6,6 +6,7 @@ print <<EOF
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_idct.h"
struct macroblockd;
@@ -45,6 +46,13 @@ if ($opts{arch} eq "x86_64") {
$avx_x86_64 = $avx2_x86_64 = '';
}
+# optimizations which depend on multiple features
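+# (the AVX2 convolve implementations fall back to SSSE3 kernels for narrow
+# blocks, so they are only usable when both extensions are present)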
+if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
+ $avx2_ssse3 = 'avx2';
+} else {
+ $avx2_ssse3 = '';
+}
+
#
# RECON
#
@@ -296,15 +304,15 @@ specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
$vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2 avx2/;
+specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
$vp9_convolve8_neon_asm=vp9_convolve8_neon;
add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2 avx2/;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
$vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2 avx2/;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
$vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
@@ -322,68 +330,177 @@ $vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon;
#
# dct
#
-add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct4x4_1_add/;
+
+ add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct4x4_16_add/;
+
+ add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_1_add/;
+
+ add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_64_add/;
+
+ add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_12_add/;
+
+ add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_1_add/;
+
+ add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_256_add/;
+
+ add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_10_add/;
+
+ add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_1024_add/;
+
+ add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_34_add/;
+
+ add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_1_add/;
+
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp9_iht4x4_16_add/;
+
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp9_iht8x8_64_add/;
+
+ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/vp9_iht16x16_256_add/;
+
+ # dct and add
+
+ add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_iwht4x4_1_add/;
-add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+ add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_iwht4x4_16_add/;
+} else {
+ add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
+ $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+
+ add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
+ $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+
+ add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
+ $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+
+ add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+ $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+
+ add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+ $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+
+ add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
+ $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+
+ add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
+ $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+
+ add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
+ $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+
+ add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
+ $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
-add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
-$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+ add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
+ $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
-add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+ add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
+ $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
-add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
+ $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
-add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
-$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+ add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+ specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
+ $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
-add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+ specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+
+ # dct and add
+
+ add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_iwht4x4_1_add/;
+
+ add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+ specialize qw/vp9_iwht4x4_16_add/;
+}
+
+
+# High bitdepth functions
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+#
+# dct
+#
+add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct4x4_1_add/;
-add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct4x4_16_add/;
-add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_1_add/;
-add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_64_add/;
-add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_10_add/;
-add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
-$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_1_add/;
-add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
-$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_256_add/;
-add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type";
-specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_10_add/;
+
+add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_1024_add/;
+
+add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_34_add/;
+
+add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_1_add/;
+
+add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+specialize qw/vp9_high_iht4x4_16_add/;
+
+add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+specialize qw/vp9_high_iht8x8_64_add/;
+
+add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+specialize qw/vp9_high_iht16x16_256_add/;
# dct and add
-add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_1_add/;
+add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_iwht4x4_1_add/;
-add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_16_add/;
+add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_iwht4x4_16_add/;
+}
#
# Encoder functions below this point.
@@ -699,23 +816,42 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
# ENCODEMB INVOKE
-add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
-
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";
-add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+# With CONFIG_VP9_HIGHBITDEPTH the transform coefficients are held in 32-bit
+# values, so the existing assembly for vp9_block_error can no longer be used.
+ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/vp9_block_error/;
+
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp/;
-add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp_32x32/;
-add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_b/;
-add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_b_32x32/;
+} else {
+ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/vp9_block_error avx2/;
+
+ add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
+}
#
# Structured Similarity (SSIM)
@@ -729,44 +865,86 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
}
# fdct functions
-add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht4x4 sse2/;
-add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht8x8 sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht4x4/;
+
+ add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht8x8/;
-add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht16x16 sse2/;
+ add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht16x16/;
-add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+ add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fwht4x4/;
-add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4_1 sse2/;
+ add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4_1/;
-add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4 sse2/;
+ add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4/;
-add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8_1 sse2 neon/;
+ add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8_1/;
-add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
+ add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8/;
-add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16_1 sse2/;
+ add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16_1/;
-add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16 sse2/;
+ add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16/;
-add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_1 sse2/;
+ add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_1/;
-add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32 sse2 avx2/;
+ add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32/;
+
+ add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_rd/;
+} else {
+ add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht4x4 sse2/;
-add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+ add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht8x8 sse2/;
+
+ add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_fht16x16 sse2/;
+
+ add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+
+ add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4_1 sse2/;
+
+ add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct4x4 sse2/;
+
+ add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8_1 sse2 neon/;
+
+ add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
+
+ add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16_1 sse2/;
+
+ add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct16x16 sse2/;
+
+ add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_1 sse2/;
+
+ add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32 sse2 avx2/;
+
+ add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+}
#
# Motion search
@@ -788,6 +966,654 @@ specialize qw/vp9_full_range_search/;
add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
specialize qw/vp9_temporal_filter_apply sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+
+ # variance
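+  # The high-bitdepth variants keep uint8_t pointers for interface
+  # compatibility; the buffers hold 16-bit pixels, presumably accessed via
+  # CONVERT_TO_SHORTPTR as in the transform code above.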
+ add_proto qw/unsigned int vp9_high_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_variance4x4/;
+
+ add_proto qw/void vp9_high_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_get8x8var/;
+
+ add_proto qw/void vp9_high_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_get16x16var/;
+
+ add_proto qw/unsigned int vp9_high_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_variance4x4/;
+
+ add_proto qw/void vp9_high_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_10_get8x8var/;
+
+ add_proto qw/void vp9_high_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_10_get16x16var/;
+
+ add_proto qw/unsigned int vp9_high_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_variance4x4/;
+
+ add_proto qw/void vp9_high_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_12_get8x8var/;
+
+ add_proto qw/void vp9_high_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+ specialize qw/vp9_high_12_get16x16var/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_sub_pixel_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_sub_pixel_avg_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_sub_pixel_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_10_sub_pixel_avg_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance64x64/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance32x64/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance64x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance32x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance16x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance32x32/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance16x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance8x16/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance16x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance8x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance8x4/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance4x8/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_sub_pixel_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+ specialize qw/vp9_high_12_sub_pixel_avg_variance4x4/;
+
+ add_proto qw/unsigned int vp9_high_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad64x64/;
+
+ add_proto qw/unsigned int vp9_high_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad32x64/;
+
+ add_proto qw/unsigned int vp9_high_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad64x32/;
+
+ add_proto qw/unsigned int vp9_high_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad32x16/;
+
+ add_proto qw/unsigned int vp9_high_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad16x32/;
+
+ add_proto qw/unsigned int vp9_high_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad32x32/;
+
+ add_proto qw/unsigned int vp9_high_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad16x16/;
+
+ add_proto qw/unsigned int vp9_high_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad16x8/;
+
+ add_proto qw/unsigned int vp9_high_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad8x16/;
+
+ add_proto qw/unsigned int vp9_high_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad8x8/;
+
+ add_proto qw/unsigned int vp9_high_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad8x4/;
+
+ add_proto qw/unsigned int vp9_high_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad4x8/;
+
+ add_proto qw/unsigned int vp9_high_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vp9_high_sad4x4/;
+
+ add_proto qw/unsigned int vp9_high_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad64x64_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad32x64_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad64x32_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad32x16_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad16x32_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad32x32_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad16x16_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad16x8_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad8x16_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad8x8_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad8x4_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad4x8_avg/;
+
+ add_proto qw/unsigned int vp9_high_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vp9_high_sad4x4_avg/;
+
+ add_proto qw/void vp9_high_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad64x64x3/;
+
+ add_proto qw/void vp9_high_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad32x32x3/;
+
+ add_proto qw/void vp9_high_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x16x3/;
+
+ add_proto qw/void vp9_high_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x8x3/;
+
+ add_proto qw/void vp9_high_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x16x3/;
+
+ add_proto qw/void vp9_high_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x8x3/;
+
+ add_proto qw/void vp9_high_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad4x4x3/;
+
+ add_proto qw/void vp9_high_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad64x64x8/;
+
+ add_proto qw/void vp9_high_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad32x32x8/;
+
+ add_proto qw/void vp9_high_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad16x16x8/;
+
+ add_proto qw/void vp9_high_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad16x8x8/;
+
+ add_proto qw/void vp9_high_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad8x16x8/;
+
+ add_proto qw/void vp9_high_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad8x8x8/;
+
+ add_proto qw/void vp9_high_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad8x4x8/;
+
+ add_proto qw/void vp9_high_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad4x8x8/;
+
+ add_proto qw/void vp9_high_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+ specialize qw/vp9_high_sad4x4x8/;
+
+ add_proto qw/void vp9_high_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad64x64x4d/;
+
+ add_proto qw/void vp9_high_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad32x64x4d/;
+
+ add_proto qw/void vp9_high_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad64x32x4d/;
+
+ add_proto qw/void vp9_high_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad32x16x4d/;
+
+ add_proto qw/void vp9_high_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x32x4d/;
+
+ add_proto qw/void vp9_high_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad32x32x4d/;
+
+ add_proto qw/void vp9_high_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x16x4d/;
+
+ add_proto qw/void vp9_high_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad16x8x4d/;
+
+ add_proto qw/void vp9_high_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x16x4d/;
+
+ add_proto qw/void vp9_high_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x8x4d/;
+
+ # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
+ add_proto qw/void vp9_high_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad8x4x4d/;
+
+ add_proto qw/void vp9_high_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad4x8x4d/;
+
+ add_proto qw/void vp9_high_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+ specialize qw/vp9_high_sad4x4x4d/;
+
+ add_proto qw/unsigned int vp9_high_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_mse16x16/;
+
+ add_proto qw/unsigned int vp9_high_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_mse8x16/;
+
+ add_proto qw/unsigned int vp9_high_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_mse16x8/;
+
+ add_proto qw/unsigned int vp9_high_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_mse8x8/;
+
+ add_proto qw/unsigned int vp9_high_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_mse16x16/;
+
+ add_proto qw/unsigned int vp9_high_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_mse8x16/;
+
+ add_proto qw/unsigned int vp9_high_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_mse16x8/;
+
+ add_proto qw/unsigned int vp9_high_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_10_mse8x8/;
+
+ add_proto qw/unsigned int vp9_high_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_mse16x16/;
+
+ add_proto qw/unsigned int vp9_high_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_mse8x16/;
+
+ add_proto qw/unsigned int vp9_high_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_mse16x8/;
+
+ add_proto qw/unsigned int vp9_high_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ specialize qw/vp9_high_12_mse8x8/;
+
+ # ENCODEMB INVOKE
+
+ add_proto qw/int64_t vp9_high_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/vp9_high_block_error/;
+
+ add_proto qw/void vp9_high_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/vp9_high_subtract_block/;
+
+ add_proto qw/void vp9_high_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_high_quantize_fp/;
+
+ add_proto qw/void vp9_high_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_high_quantize_fp_32x32/;
+
+ add_proto qw/void vp9_high_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_high_quantize_b/;
+
+ add_proto qw/void vp9_high_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_high_quantize_b_32x32/;
+
+ #
+ # Structured Similarity (SSIM)
+ #
+ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void vp9_high_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/vp9_high_ssim_parms_8x8/;
+
+ add_proto qw/void vp9_high_ssim_parms_8x8_shift/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr, unsigned int bd, unsigned int shift";
+ specialize qw/vp9_high_ssim_parms_8x8_shift/;
+ }
+
+ # fdct functions
+ add_proto qw/void vp9_high_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_high_fht4x4/;
+
+ add_proto qw/void vp9_high_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_high_fht8x8/;
+
+ add_proto qw/void vp9_high_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_high_fht16x16/;
+
+ add_proto qw/void vp9_high_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fwht4x4/;
+
+ add_proto qw/void vp9_high_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct4x4/;
+
+ add_proto qw/void vp9_high_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct8x8_1/;
+
+ add_proto qw/void vp9_high_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct8x8/;
+
+ add_proto qw/void vp9_high_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct16x16_1/;
+
+ add_proto qw/void vp9_high_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct16x16/;
+
+ add_proto qw/void vp9_high_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct32x32_1/;
+
+ add_proto qw/void vp9_high_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct32x32/;
+
+ add_proto qw/void vp9_high_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vp9_high_fdct32x32_rd/;
+
+ add_proto qw/void vp9_high_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+ specialize qw/vp9_high_temporal_filter_apply/;
+
+}
+# End vp9_high encoder functions
+
}
# end encoder functions
1;
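Note: each add_proto/specialize pair above registers an entry with the RTCD (run-time CPU detection) generator, and since none of the new high-bitdepth prototypes list SIMD variants yet, every symbol resolves directly to its C implementation. A sketch of the vp9_rtcd.h fragment the generator presumably emits for one of these prototypes (illustrative only; the exact generated output may differ):

unsigned int vp9_high_sad64x64_c(const uint8_t *src_ptr, int source_stride,
                                 const uint8_t *ref_ptr, int ref_stride);
#define vp9_high_sad64x64 vp9_high_sad64x64_c

Once a SIMD version lands, listing its suffix on the specialize line (e.g. specialize qw/vp9_high_sad64x64 sse2/) is all that is needed for the generator to emit the dispatch logic.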
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 1b4904c39..b6847b92e 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -139,25 +139,25 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
} \
}
-#if HAVE_AVX2
+#if HAVE_AVX2 && HAVE_SSSE3
filter8_1dfunction vp9_filter_block1d16_v8_avx2;
filter8_1dfunction vp9_filter_block1d16_h8_avx2;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-#if (ARCH_X86_64)
+#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
-#else
+#else // ARCH_X86
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
-#endif
+#endif // ARCH_X86_64 / ARCH_X86
filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
@@ -190,9 +190,9 @@ FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_2D(, avx2);
-#endif
+#endif // HAVE_AVX2 && HAVE_SSSE3
#if HAVE_SSSE3
-#if (ARCH_X86_64)
+#if ARCH_X86_64
filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
@@ -204,14 +204,14 @@ filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
-#else
+#else // ARCH_X86
filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#endif
+#endif // ARCH_X86_64 / ARCH_X86
filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
@@ -270,7 +270,7 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
// int w, int h);
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_ , ssse3);
-#endif
+#endif // HAVE_SSSE3
#if HAVE_SSE2
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
@@ -336,4 +336,4 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
// int w, int h);
FUN_CONV_2D(, sse2);
FUN_CONV_2D(avg_ , sse2);
-#endif
+#endif // HAVE_SSE2
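For orientation, the FUN_CONV_1D/FUN_CONV_2D macros patched above assemble a 2-D 8-tap convolve from a horizontal pass into an intermediate buffer followed by a vertical pass out of it. A minimal C sketch of that separable structure, under simplifying assumptions (FILTER_BITS assumed to be 7, integer taps, and border context assumed present, as libvpx guarantees via frame-edge extension):

#include <stdint.h>

#define SK_FILTER_BITS 7  /* assumption: matches libvpx FILTER_BITS */

static uint8_t sk_clip8(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* Horizontal 8-tap: output column c uses src[c - 3 .. c + 4]. */
static void sk_horiz(const uint8_t *src, int src_stride, uint8_t *dst,
                     int dst_stride, const int16_t *f, int w, int h) {
  int r, c, k;
  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c) {
      int sum = 0;
      for (k = 0; k < 8; ++k)
        sum += src[r * src_stride + c - 3 + k] * f[k];
      dst[r * dst_stride + c] =
          sk_clip8((sum + (1 << (SK_FILTER_BITS - 1))) >> SK_FILTER_BITS);
    }
}

/* Vertical 8-tap over pre-filtered rows: output row r uses rows r .. r+7. */
static void sk_vert(const uint8_t *src, int src_stride, uint8_t *dst,
                    int dst_stride, const int16_t *f, int w, int h) {
  int r, c, k;
  for (r = 0; r < h; ++r)
    for (c = 0; c < w; ++c) {
      int sum = 0;
      for (k = 0; k < 8; ++k)
        sum += src[(r + k) * src_stride + c] * f[k];
      dst[r * dst_stride + c] =
          sk_clip8((sum + (1 << (SK_FILTER_BITS - 1))) >> SK_FILTER_BITS);
    }
}

/* 2-D = horizontal into a temp block of h + 7 rows starting 3 rows above the
 * output, then vertical out of it -- the same shape as the
 * "src - src_stride * 3" offsets visible in the FUN_CONV_1D invocations. */
static void sk_convolve8_2d(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, const int16_t *fx,
                            const int16_t *fy, int w, int h) {
  uint8_t temp[64 * 71];  /* max 64-wide block, 64 + 7 rows */
  sk_horiz(src - 3 * src_stride, src_stride, temp, 64, fx, w, h + 7);
  sk_vert(temp, 64, dst, dst_stride, fy, w, h);
}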
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index b60f8a06d..df609872b 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -3573,6 +3573,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
+ const __m128i zero = _mm_setzero_si128();
// idct constants for each stage
const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
@@ -3635,7 +3636,6 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i, j, i32;
- int zero_flag[2];
for (i = 0; i < 4; i++) {
i32 = (i << 5);
@@ -3710,13 +3710,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
- zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
- zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
- zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
- zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
-
- if (!zero_flag[0] && !zero_flag[1]) {
+ if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
col[i32 + 0] = _mm_setzero_si128();
col[i32 + 1] = _mm_setzero_si128();
col[i32 + 2] = _mm_setzero_si128();
@@ -3795,7 +3789,6 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
}
for (i = 0; i < 4; i++) {
- const __m128i zero = _mm_setzero_si128();
// Second 1-D idct
j = i << 3;
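The rewritten zero-block test above replaces a five-instruction unpack/or/shift/extract sequence with a single branch: _mm_cmpeq_epi32 produces all-ones in every 32-bit lane of the OR-accumulated register that equals zero, and _mm_movemask_epi8 collects the 16 byte sign bits, so a result of 0xFFFF means the whole register is zero. The idiom in isolation:

#include <emmintrin.h>  /* SSE2 */

/* Returns nonzero iff all 128 bits of v are zero. */
static int sk_is_zero_128(__m128i v) {
  const __m128i zero = _mm_setzero_si128();
  return _mm_movemask_epi8(_mm_cmpeq_epi32(v, zero)) == 0xFFFF;
}

Hoisting the zero constant to function scope (the declaration removed from the second loop at the bottom of the hunk) simply lets both passes reuse the same register.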
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index a9c03f0cc..7615cddda 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -195,7 +195,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
struct macroblockd_plane *const pd = &xd->plane[plane];
if (eob > 0) {
TX_TYPE tx_type = DCT_DCT;
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
if (xd->lossless) {
tx_type = DCT_DCT;
vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
@@ -668,6 +668,15 @@ static void setup_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
+ cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
+}
+
+static INLINE int valid_ref_frame_img_fmt(vpx_bit_depth_t ref_bit_depth,
+ int ref_xss, int ref_yss,
+ vpx_bit_depth_t this_bit_depth,
+ int this_xss, int this_yss) {
+ return ref_bit_depth == this_bit_depth && ref_xss == this_xss &&
+ ref_yss == this_yss;
}
static void setup_frame_size_with_refs(VP9_COMMON *cm,
@@ -707,6 +716,18 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
if (!has_valid_ref_frame)
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Referenced frame has invalid size");
+ for (i = 0; i < REFS_PER_FRAME; ++i) {
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ if (!valid_ref_frame_img_fmt(
+ ref_frame->buf->bit_depth,
+ ref_frame->buf->uv_crop_width < ref_frame->buf->y_crop_width,
+ ref_frame->buf->uv_crop_height < ref_frame->buf->y_crop_height,
+ cm->bit_depth,
+ cm->subsampling_x,
+ cm->subsampling_y))
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Referenced frame has incompatible color space");
+ }
resize_context_buffers(cm, width, height);
setup_display_size(cm, rb);
@@ -723,6 +744,7 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm,
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
+ cm->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth;
}
static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -938,9 +960,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
return vp9_reader_find_end(&tile_data->bit_reader);
}
-static int tile_worker_hook(void *arg1, void *arg2) {
- TileWorkerData *const tile_data = (TileWorkerData*)arg1;
- const TileInfo *const tile = (TileInfo*)arg2;
+static int tile_worker_hook(TileWorkerData *const tile_data,
+ const TileInfo *const tile) {
int mi_row, mi_col;
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
@@ -1201,6 +1222,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
}
setup_frame_size(cm, rb);
+ pbi->need_resync = 0;
} else {
cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
@@ -1224,6 +1246,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
setup_frame_size(cm, rb);
+ pbi->need_resync = 0;
} else {
pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
for (i = 0; i < REFS_PER_FRAME; ++i) {
@@ -1252,6 +1275,12 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
}
}
+ if (pbi->need_resync) {
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Keyframe / intra-only frame required to reset decoder"
+ " state");
+ }
+
if (!cm->error_resilient_mode) {
cm->refresh_frame_context = vp9_rb_read_bit(rb);
cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
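A note on the validation added above: references do not carry stored subsampling flags, so the caller recovers them from crop sizes -- a chroma crop dimension strictly smaller than the luma one implies subsampling in that axis -- and then requires bit depth and both flags to match the current frame. The comparison in isolation (hypothetical helper; field meanings taken from the hunk):

/* 1 when the reference's bit depth and chroma layout match the frame's. */
static int sk_ref_fmt_matches(unsigned int ref_bd, int ref_y_w, int ref_uv_w,
                              int ref_y_h, int ref_uv_h, unsigned int cur_bd,
                              int cur_ss_x, int cur_ss_y) {
  const int ref_ss_x = ref_uv_w < ref_y_w;  /* chroma narrower => ss_x = 1 */
  const int ref_ss_y = ref_uv_h < ref_y_h;  /* chroma shorter  => ss_y = 1 */
  return ref_bd == cur_bd && ref_ss_x == cur_ss_x && ref_ss_y == cur_ss_y;
}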
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 9106b0d14..6ee3d7037 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -60,6 +60,7 @@ VP9Decoder *vp9_decoder_create() {
}
cm->error.setjmp = 1;
+ pbi->need_resync = 1;
initialize_dec();
// Initialize the references to not point to any frame buffers.
@@ -238,6 +239,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
cm->new_fb_idx = get_free_fb(cm);
if (setjmp(cm->error.jmp)) {
+ pbi->need_resync = 1;
cm->error.setjmp = 0;
vp9_clear_system_state();
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 848d212e6..4f52bb9c4 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -58,6 +58,7 @@ typedef struct VP9Decoder {
int max_threads;
int inv_tile_order;
+ int need_resync; // wait for key/intra-only frame
} VP9Decoder;
int vp9_receive_compressed_data(struct VP9Decoder *pbi,
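Taken together with the decodeframe.c hunks, the new need_resync flag implements a small resync gate: it is set at decoder creation and again on the setjmp error path, cleared once a key or intra-only frame carries a complete frame size, and while it is set any inter frame triggers the "Keyframe / intra-only frame required" error. Condensed into one predicate (hypothetical helper, not the literal decoder code):

/* Frames are decodable only once decoder state has been re-established. */
static int sk_frame_decodable(int need_resync, int is_key_or_intra_only) {
  return is_key_or_intra_only || !need_resync;
}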
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 91cdf3860..76ca1ae8f 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -51,7 +51,7 @@
} while (0)
static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
- int16_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+ tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
int ctx, const int16_t *scan, const int16_t *nb,
vp9_reader *r) {
const int max_eob = 16 << (tx_size << 1);
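Worked through, the max_eob expression in the context line above gives 16 << 0 = 16, 16 << 2 = 64, 16 << 4 = 256, and 16 << 6 = 1024 for tx_size 0 through 3 (TX_4X4 through TX_32X32): each step doubles both transform dimensions, quadrupling the coefficient count.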
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index b82ea6a79..6635880e5 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -121,10 +121,10 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
}
// Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(void *arg1, void *arg2) {
- TileWorkerData *const tile_data = (TileWorkerData*)arg1;
+static int loop_filter_row_worker(TileWorkerData *const tile_data,
+ void *unused) {
LFWorkerData *const lf_data = &tile_data->lfdata;
- (void) arg2;
+ (void)unused;
loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
lf_data->start, lf_data->stop, lf_data->y_only,
lf_data->lf_sync, lf_data->num_lf_workers);
@@ -145,15 +145,13 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
int i;
- // Allocate memory used in thread synchronization.
- // This always needs to be done even if frame_filter_level is 0.
+ if (!frame_filter_level) return;
+
if (!lf_sync->sync_range || cm->last_height != cm->height) {
vp9_loop_filter_dealloc(lf_sync);
- vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width);
+ vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
}
- if (!frame_filter_level) return;
-
vp9_loop_filter_frame_init(cm, frame_filter_level);
// Initialize cur_sb_col to -1 for all SB rows.
@@ -216,7 +214,7 @@ static int get_sync_range(int width) {
}
// Allocate memory for lf row synchronization
-void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows,
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
int width) {
lf_sync->rows = rows;
#if CONFIG_MULTITHREAD
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index 8b02ef71e..b1fbdeb74 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -42,8 +42,8 @@ typedef struct VP9LfSyncData {
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
-void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
- int rows, int width);
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+ int width);
// Deallocate loopfilter synchronization related mutex and data.
void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
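For context on the lf_sync state these declarations manage: worker threads filter superblock rows in parallel, and before filtering column c of row r a worker must wait until row r - 1 has advanced roughly sync_range columns past c, since the filter reads pixels above the current row. A schematic of that dependency (spin-wait shown only for brevity; the real code blocks on mutex/condition pairs under CONFIG_MULTITHREAD, and the exact wait condition may differ):

/* cur_sb_col[r] = last superblock column completed in row r (-1 = none). */
static void sk_wait_for_row_above(const volatile int *cur_sb_col, int r,
                                  int c, int sync_range) {
  if (r == 0) return;  /* top row has no dependency */
  while (cur_sb_col[r - 1] < c + sync_range) {
    /* busy-wait, illustrative only */
  }
}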
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index b72638322..767bd7f91 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -28,8 +28,8 @@ typedef struct {
struct macroblock_plane {
DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
- int16_t *qcoeff;
- int16_t *coeff;
+ tran_low_t *qcoeff;
+ tran_low_t *coeff;
uint16_t *eobs;
struct buf_2d src;
@@ -119,8 +119,12 @@ struct macroblock {
// Used to store sub partition's choices.
MV pred_mv[MAX_REF_FRAMES];
- void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
- void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
+ void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
+ void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+ void (*high_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+#endif
};
#ifdef __cplusplus
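The int16_t -> tran_low_t migrations in this and the surrounding files all hinge on typedefs introduced in vp9_idct.h (that hunk is not reproduced in this excerpt). Judging from how the code pairs tran_low_t storage with tran_high_t intermediates, the definitions are presumably along these lines (an assumption, shown for orientation only):

#if CONFIG_VP9_HIGHBITDEPTH
typedef int64_t tran_high_t;  /* intermediate precision inside transforms */
typedef int32_t tran_low_t;   /* coefficient storage, wide enough for bd=12 */
#else
typedef int32_t tran_high_t;
typedef int16_t tran_low_t;   /* unchanged layout for 8-bit builds */
#endif

In a non-high-bitdepth build the types collapse back to the old widths, which is why these hunks are layout-neutral for existing 8-bit code.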
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index 9b7a93267..12acc5114 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -30,13 +30,13 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
for (i = 0; i < MAX_MB_PLANE; ++i) {
for (k = 0; k < 3; ++k) {
CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
- vpx_memalign(16, num_pix * sizeof(int16_t)));
+ vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
- vpx_memalign(16, num_pix * sizeof(int16_t)));
+ vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
- vpx_memalign(16, num_pix * sizeof(int16_t)));
+ vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
- vpx_memalign(16, num_pix * sizeof(uint16_t)));
+ vpx_memalign(16, num_pix * sizeof(*ctx->eobs[i][k])));
ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
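The sizeof changes above are the standard defense against exactly this kind of type migration: sizing each allocation from the pointer it fills means a future element-type change (like this int16_t -> tran_low_t switch) cannot silently under-allocate. The pattern in isolation:

/* Allocation tracks the pointee type automatically. */
tran_low_t *buf = vpx_memalign(16, num_pix * sizeof(*buf));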
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index 236389b6f..97f074148 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -19,15 +19,15 @@ struct VP9_COMP;
typedef struct {
MODE_INFO mic;
uint8_t *zcoeff_blk;
- int16_t *coeff[MAX_MB_PLANE][3];
- int16_t *qcoeff[MAX_MB_PLANE][3];
- int16_t *dqcoeff[MAX_MB_PLANE][3];
+ tran_low_t *coeff[MAX_MB_PLANE][3];
+ tran_low_t *qcoeff[MAX_MB_PLANE][3];
+ tran_low_t *dqcoeff[MAX_MB_PLANE][3];
uint16_t *eobs[MAX_MB_PLANE][3];
// dual buffer pointers, 0: in use, 1: best in store
- int16_t *coeff_pbuf[MAX_MB_PLANE][3];
- int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
- int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+ tran_low_t *coeff_pbuf[MAX_MB_PLANE][3];
+ tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+ tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
int is_coded;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 59222f0a9..eff899610 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -18,15 +18,17 @@
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"
-static INLINE int fdct_round_shift(int input) {
- int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- assert(INT16_MIN <= rv && rv <= INT16_MAX);
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ // TODO(debargha, peter.derivaz): Find new bounds for this assert
+ // and make the bounds consts.
+ // assert(INT16_MIN <= rv && rv <= INT16_MAX);
return rv;
}
-static void fdct4(const int16_t *input, int16_t *output) {
- int16_t step[4];
- int temp1, temp2;
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t step[4];
+ tran_high_t temp1, temp2;
step[0] = input[0] + input[3];
step[1] = input[1] + input[2];
@@ -43,9 +45,9 @@ static void fdct4(const int16_t *input, int16_t *output) {
output[3] = fdct_round_shift(temp2);
}
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- int16_t sum = 0;
+ tran_low_t sum = 0;
for (r = 0; r < 4; ++r)
for (c = 0; c < 4; ++c)
sum += input[r * stride + c];
@@ -54,7 +56,7 @@ void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
output[1] = 0;
}
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
@@ -63,22 +65,23 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
// in normal/row positions).
int pass;
// We need an intermediate buffer between passes.
- int16_t intermediate[4 * 4];
- const int16_t *in = input;
- int16_t *out = intermediate;
+ tran_low_t intermediate[4 * 4];
+ const int16_t *in_pass0 = input;
+ const tran_low_t *in = NULL;
+ tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
- /*canbe16*/ int input[4];
- /*canbe16*/ int step[4];
- /*needs32*/ int temp1, temp2;
+ tran_high_t input[4]; // canbe16
+ tran_high_t step[4]; // canbe16
+ tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 4; ++i) {
// Load inputs.
if (0 == pass) {
- input[0] = in[0 * stride] * 16;
- input[1] = in[1 * stride] * 16;
- input[2] = in[2 * stride] * 16;
- input[3] = in[3 * stride] * 16;
+ input[0] = in_pass0[0 * stride] * 16;
+ input[1] = in_pass0[1 * stride] * 16;
+ input[2] = in_pass0[2 * stride] * 16;
+ input[3] = in_pass0[3 * stride] * 16;
if (i == 0 && input[0]) {
input[0] += 1;
}
@@ -102,6 +105,7 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
out[1] = fdct_round_shift(temp1);
out[3] = fdct_round_shift(temp2);
// Do next column (which is a transposed row in second/horizontal pass)
+ in_pass0++;
in++;
out += 4;
}
@@ -119,9 +123,9 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
}
}
-static void fadst4(const int16_t *input, int16_t *output) {
- int x0, x1, x2, x3;
- int s0, s1, s2, s3, s4, s5, s6, s7;
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
x0 = input[0];
x1 = input[1];
@@ -166,15 +170,15 @@ static const transform_2d FHT_4[] = {
{ fadst4, fadst4 } // ADST_ADST = 3
};
-void vp9_fht4x4_c(const int16_t *input, int16_t *output,
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct4x4_c(input, output, stride);
} else {
- int16_t out[4 * 4];
- int16_t *outptr = &out[0];
+ tran_low_t out[4 * 4];
+ tran_low_t *outptr = &out[0];
int i, j;
- int16_t temp_in[4], temp_out[4];
+ tran_low_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
// Columns
@@ -199,10 +203,10 @@ void vp9_fht4x4_c(const int16_t *input, int16_t *output,
}
}
-static void fdct8(const int16_t *input, int16_t *output) {
- /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
- /*needs32*/ int t0, t1, t2, t3;
- /*canbe16*/ int x0, x1, x2, x3;
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
s0 = input[0] + input[7];
@@ -251,9 +255,9 @@ static void fdct8(const int16_t *input, int16_t *output) {
output[7] = fdct_round_shift(t3);
}
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- int16_t sum = 0;
+ tran_low_t sum = 0;
for (r = 0; r < 8; ++r)
for (c = 0; c < 8; ++c)
sum += input[r * stride + c];
@@ -262,16 +266,16 @@ void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
output[1] = 0;
}
-void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
+void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
int i, j;
- int16_t intermediate[64];
+ tran_low_t intermediate[64];
// Transform columns
{
- int16_t *output = intermediate;
- /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
- /*needs32*/ int t0, t1, t2, t3;
- /*canbe16*/ int x0, x1, x2, x3;
+ tran_low_t *output = intermediate;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
int i;
for (i = 0; i < 8; i++) {
@@ -333,9 +337,9 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
}
}
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- int16_t sum = 0;
+ tran_low_t sum = 0;
for (r = 0; r < 16; ++r)
for (c = 0; c < 16; ++c)
sum += input[r * stride + c];
@@ -344,7 +348,7 @@ void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
output[1] = 0;
}
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
@@ -353,37 +357,38 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
// in normal/row positions).
int pass;
// We need an intermediate buffer between passes.
- int16_t intermediate[256];
- const int16_t *in = input;
- int16_t *out = intermediate;
+ tran_low_t intermediate[256];
+ const int16_t *in_pass0 = input;
+ const tran_low_t *in = NULL;
+ tran_low_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
- /*canbe16*/ int step1[8];
- /*canbe16*/ int step2[8];
- /*canbe16*/ int step3[8];
- /*canbe16*/ int input[8];
- /*needs32*/ int temp1, temp2;
+ tran_high_t step1[8]; // canbe16
+ tran_high_t step2[8]; // canbe16
+ tran_high_t step3[8]; // canbe16
+ tran_high_t input[8]; // canbe16
+ tran_high_t temp1, temp2; // needs32
int i;
for (i = 0; i < 16; i++) {
if (0 == pass) {
// Calculate input for the first 8 results.
- input[0] = (in[0 * stride] + in[15 * stride]) * 4;
- input[1] = (in[1 * stride] + in[14 * stride]) * 4;
- input[2] = (in[2 * stride] + in[13 * stride]) * 4;
- input[3] = (in[3 * stride] + in[12 * stride]) * 4;
- input[4] = (in[4 * stride] + in[11 * stride]) * 4;
- input[5] = (in[5 * stride] + in[10 * stride]) * 4;
- input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
- input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
+ input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
+ input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
+ input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
+ input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
+ input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
+ input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
+ input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
+ input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
// Calculate input for the next 8 results.
- step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
- step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
- step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
- step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
- step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
- step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
- step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
- step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
+ step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
+ step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+ step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
+ step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
+ step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
+ step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
+ step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
+ step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
} else {
// Calculate input for the first 8 results.
input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
@@ -406,9 +411,9 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
}
// Work on the first eight values; fdct8(input, even_results);
{
- /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
- /*needs32*/ int t0, t1, t2, t3;
- /*canbe16*/ int x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
s0 = input[0] + input[7];
@@ -514,6 +519,7 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
}
// Do next column (which is a transposed row in second/horizontal pass)
in++;
+ in_pass0++;
out += 16;
}
// Setup in/out for next pass.
@@ -522,17 +528,17 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
}
}
-static void fadst8(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- int x0 = input[7];
- int x1 = input[0];
- int x2 = input[5];
- int x3 = input[2];
- int x4 = input[3];
- int x5 = input[4];
- int x6 = input[1];
- int x7 = input[6];
+ tran_high_t x0 = input[7];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
// stage 1
s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
@@ -600,15 +606,15 @@ static const transform_2d FHT_8[] = {
{ fadst8, fadst8 } // ADST_ADST = 3
};
-void vp9_fht8x8_c(const int16_t *input, int16_t *output,
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct8x8_c(input, output, stride);
} else {
- int16_t out[64];
- int16_t *outptr = &out[0];
+ tran_low_t out[64];
+ tran_low_t *outptr = &out[0];
int i, j;
- int16_t temp_in[8], temp_out[8];
+ tran_low_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
// Columns
@@ -633,17 +639,18 @@ void vp9_fht8x8_c(const int16_t *input, int16_t *output,
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
pixel. */
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
int i;
- int a1, b1, c1, d1, e1;
- const int16_t *ip = input;
- int16_t *op = output;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
for (i = 0; i < 4; i++) {
- a1 = ip[0 * stride];
- b1 = ip[1 * stride];
- c1 = ip[2 * stride];
- d1 = ip[3 * stride];
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
a1 += b1;
d1 = d1 - c1;
@@ -657,7 +664,7 @@ void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
op[8] = d1;
op[12] = b1;
- ip++;
+ ip_pass0++;
op++;
}
ip = output;
@@ -687,12 +694,12 @@ void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
}
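
The "3.5 adds, 0.5 shifts per pixel" figure in the comment above falls out of the 1-D kernel: 7 adds and 1 shift per 4-sample column, applied once per pass over two passes (7/4 x 2 = 3.5 adds, 1/4 x 2 = 0.5 shifts). A sketch of that kernel, with the output ordering matching the op[0]/op[4]/op[8]/op[12] stores in the code:

#include <stdint.h>

static void wht4_1d(int16_t a1, int16_t b1, int16_t c1, int16_t d1,
                    int16_t out[4]) {
  int16_t e1;
  a1 += b1;             /* add 1 */
  d1 -= c1;             /* add 2 */
  e1 = (a1 - d1) >> 1;  /* add 3, shift 1 */
  b1 = e1 - b1;         /* add 4 */
  c1 = e1 - c1;         /* add 5 */
  a1 -= c1;             /* add 6 */
  d1 += b1;             /* add 7 */
  out[0] = a1;
  out[1] = c1;
  out[2] = d1;
  out[3] = b1;
}
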
// Rewrote to use same algorithm as others.
-static void fdct16(const int16_t in[16], int16_t out[16]) {
- /*canbe16*/ int step1[8];
- /*canbe16*/ int step2[8];
- /*canbe16*/ int step3[8];
- /*canbe16*/ int input[8];
- /*needs32*/ int temp1, temp2;
+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+ tran_high_t step1[8]; // canbe16
+ tran_high_t step2[8]; // canbe16
+ tran_high_t step3[8]; // canbe16
+ tran_high_t input[8]; // canbe16
+ tran_high_t temp1, temp2; // needs32
// step 1
input[0] = in[0] + in[15];
@@ -715,9 +722,9 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
// fdct8(step, step);
{
- /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
- /*needs32*/ int t0, t1, t2, t3;
- /*canbe16*/ int x0, x1, x2, x3;
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
// stage 1
s0 = input[0] + input[7];
@@ -828,25 +835,26 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
out[15] = fdct_round_shift(temp2);
}
-static void fadst16(const int16_t *input, int16_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
- int x0 = input[15];
- int x1 = input[0];
- int x2 = input[13];
- int x3 = input[2];
- int x4 = input[11];
- int x5 = input[4];
- int x6 = input[9];
- int x7 = input[6];
- int x8 = input[7];
- int x9 = input[8];
- int x10 = input[5];
- int x11 = input[10];
- int x12 = input[3];
- int x13 = input[12];
- int x14 = input[1];
- int x15 = input[14];
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+ tran_high_t x0 = input[15];
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[13];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[11];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[9];
+ tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
// stage 1
s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
@@ -997,15 +1005,15 @@ static const transform_2d FHT_16[] = {
{ fadst16, fadst16 } // ADST_ADST = 3
};
-void vp9_fht16x16_c(const int16_t *input, int16_t *output,
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct16x16_c(input, output, stride);
} else {
- int16_t out[256];
- int16_t *outptr = &out[0];
+ tran_low_t out[256];
+ tran_low_t *outptr = &out[0];
int i, j;
- int16_t temp_in[16], temp_out[16];
+ tran_low_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
// Columns
@@ -1028,19 +1036,21 @@ void vp9_fht16x16_c(const int16_t *input, int16_t *output,
}
}
-static INLINE int dct_32_round(int input) {
- int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- assert(-131072 <= rv && rv <= 131071);
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ // TODO(debargha, peter.derivaz): Find new bounds for this assert,
+ // and make the bounds consts.
+ // assert(-131072 <= rv && rv <= 131071);
return rv;
}
-static INLINE int half_round_shift(int input) {
- int rv = (input + 1 + (input < 0)) >> 2;
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+ tran_high_t rv = (input + 1 + (input < 0)) >> 2;
return rv;
}
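
half_round_shift is a signed divide by 4 whose rounding is symmetric about zero (assuming arithmetic right shift of negative values, which libvpx relies on), so shedding magnitude between transform passes introduces no sign-dependent bias. A standalone sanity check of that symmetry:

#include <assert.h>

static int half_round_shift_check(int input) {
  return (input + 1 + (input < 0)) >> 2;  /* same expression as above */
}

int main(void) {
  assert(half_round_shift_check(5) == 1 && half_round_shift_check(-5) == -1);
  assert(half_round_shift_check(7) == 2 && half_round_shift_check(-7) == -2);
  assert(half_round_shift_check(6) == 1 && half_round_shift_check(-6) == -1);
  return 0;
}
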
-static void fdct32(const int *input, int *output, int round) {
- int step[32];
+static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+ tran_high_t step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
step[1] = input[1] + input[(32 - 2)];
@@ -1362,9 +1372,9 @@ static void fdct32(const int *input, int *output, int round) {
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
int r, c;
- int16_t sum = 0;
+ tran_low_t sum = 0;
for (r = 0; r < 32; ++r)
for (c = 0; c < 32; ++c)
sum += input[r * stride + c];
@@ -1373,13 +1383,13 @@ void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
output[1] = 0;
}
-void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
int i, j;
- int output[32 * 32];
+ tran_high_t output[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
@@ -1389,7 +1399,7 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
// Rows
for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 0);
@@ -1401,13 +1411,13 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
// Note that although we use dct_32_round in dct32 computation flow,
// this 2d fdct32x32 for rate-distortion optimization loop is operating
// within 16 bits precision.
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
int i, j;
- int output[32 * 32];
+ tran_high_t output[32 * 32];
// Columns
for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = input[j * stride + i] * 4;
fdct32(temp_in, temp_out, 0);
@@ -1420,7 +1430,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
// Rows
for (i = 0; i < 32; ++i) {
- int temp_in[32], temp_out[32];
+ tran_high_t temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
fdct32(temp_in, temp_out, 1);
@@ -1428,3 +1438,61 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
out[j + i * 32] = temp_out[j];
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ vp9_fdct4x4_c(input, output, stride);
+}
+
+void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp9_fht4x4_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ vp9_fdct8x8_1_c(input, final_output, stride);
+}
+
+void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ vp9_fdct8x8_c(input, final_output, stride);
+}
+
+void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp9_fdct16x16_1_c(input, output, stride);
+}
+
+void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ vp9_fdct16x16_c(input, output, stride);
+}
+
+void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp9_fht8x8_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ vp9_fwht4x4_c(input, output, stride);
+}
+
+void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ vp9_fht16x16_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
+ vp9_fdct32x32_1_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+ vp9_fdct32x32_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+ int stride) {
+ vp9_fdct32x32_rd_c(input, out, stride);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index c4cf5eeb6..75b94499d 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -89,9 +89,9 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
int total_adj = 0;
int shift_inc = 1;
- /* If motion_magnitude is small, making the denoiser more aggressive by
- * increasing the adjustment for each level. Add another increment for
- * blocks that are labeled for increase denoising. */
+  // If motion_magnitude is small, make the denoiser more aggressive by
+  // increasing the adjustment for each level. Add another increment for
+  // blocks that are labeled for increased denoising.
if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
if (increase_denoising) {
shift_inc = 2;
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index a913add86..fa714b132 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -18,7 +18,7 @@
extern "C" {
#endif
-#define MOTION_MAGNITUDE_THRESHOLD (8*3)
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
typedef enum vp9_denoiser_decision {
COPY_BLOCK,
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 72ced0529..c62b52fb5 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2466,6 +2466,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
vp9_zero(cpi->mb.pred_mv);
cpi->pc_root->index = 0;
+  // TODO(yunqingwang): use_lastframe_partitioning is no longer used in good-
+  // quality encoding. Evaluate it for real-time encoding later to decide
+  // whether it can be removed there as well, and then clean up the code.
if ((sf->partition_search_type == SEARCH_PARTITION &&
sf->use_lastframe_partitioning) ||
sf->partition_search_type == FIXED_PARTITION ||
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 667845072..794e6d0e3 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -107,9 +107,9 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
vp9_token_state tokens[1025][2];
unsigned best_index[1025][2];
uint8_t token_cache[1024];
- const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
- int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int eob = p->eobs[block];
const PLANE_TYPE type = pd->plane_type;
const int default_eob = 16 << (tx_size << 1);
@@ -294,22 +294,33 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
}
static INLINE void fdct32x32(int rd_transform,
- const int16_t *src, int16_t *dst, int src_stride) {
+ const int16_t *src, tran_low_t *dst,
+ int src_stride) {
if (rd_transform)
vp9_fdct32x32_rd(src, dst, src_stride);
else
vp9_fdct32x32(src, dst, src_stride);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void high_fdct32x32(int rd_transform, const int16_t *src,
+ tran_low_t *dst, int src_stride) {
+ if (rd_transform)
+ vp9_high_fdct32x32_rd(src, dst, src_stride);
+ else
+ vp9_high_fdct32x32(src, dst, src_stride);
+}
+#endif
+
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
- int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
@@ -357,9 +368,9 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
@@ -405,9 +416,9 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
- int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
uint16_t *const eob = &p->eobs[block];
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
int i, j;
@@ -458,7 +469,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
struct optimize_ctx *const ctx = args->ctx;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
int i, j;
uint8_t *dst;
ENTROPY_CONTEXT *a, *l;
@@ -538,7 +549,7 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
MACROBLOCKD *const xd = &x->e_mbd;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
int i, j;
uint8_t *dst;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
@@ -587,9 +598,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
- int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const scan_order *scan_order;
TX_TYPE tx_type;
PREDICTION_MODE mode;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index b6e606d78..b3884d056 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -556,6 +556,9 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
cm->profile = oxcf->profile;
cm->bit_depth = oxcf->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+ cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
cm->color_space = UNKNOWN;
cm->width = oxcf->width;
@@ -613,6 +616,11 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
assert(cm->bit_depth > VPX_BITS_8);
cpi->oxcf = *oxcf;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->oxcf.use_highbitdepth) {
+ cpi->mb.e_mbd.bd = (int)cm->bit_depth;
+ }
+#endif
rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
@@ -1272,7 +1280,10 @@ static void generate_psnr_packet(VP9_COMP *cpi) {
pkt.data.psnr.psnr[i] = psnr.psnr[i];
}
pkt.kind = VPX_CODEC_PSNR_PKT;
- vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+ if (is_two_pass_svc(cpi))
+ cpi->svc.layer_context[cpi->svc.spatial_layer_id].psnr_pkt = pkt.data.psnr;
+ else
+ vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
}
int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
@@ -2768,7 +2779,16 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
if (oxcf->pass == 1 &&
(!cpi->use_svc || is_two_pass_svc(cpi))) {
const int lossless = is_lossless_requested(oxcf);
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (cpi->oxcf.use_highbitdepth)
+ cpi->mb.fwd_txm4x4 = lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4;
+ else
+ cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+ cpi->mb.high_itxm_add = lossless ? vp9_high_iwht4x4_add :
+ vp9_high_idct4x4_add;
+#else
cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+#endif
cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
vp9_first_pass(cpi, source);
} else if (oxcf->pass == 2 &&
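
Downstream, the selected pointer is invoked without per-block branching; a minimal sketch of the call shape (the wrapper itself is illustrative, the field and its signature come from this patch):

static void fwd_txm4x4_sketch(MACROBLOCK *x, const int16_t *src_diff,
                              tran_low_t *coeff, int diff_stride) {
  /* One indirection covers lossless (WHT) vs. DCT and, when built with
   * CONFIG_VP9_HIGHBITDEPTH, the bit-depth variants chosen above. */
  x->fwd_txm4x4(src_diff, coeff, diff_stride);
}
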
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 0d3c4c19a..80774de92 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -217,6 +217,9 @@ typedef struct VP9EncoderConfig {
vp8e_tuning tuning;
vp9e_tune_content content;
+#if CONFIG_VP9_HIGHBITDEPTH
+ int use_highbitdepth;
+#endif
} VP9EncoderConfig;
static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 8041b59cf..54b57cf88 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -437,41 +437,51 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
vp9_set_quantizer(cm, find_fp_qindex());
if (lc != NULL) {
- MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
twopass = &lc->twopass;
- if (cpi->common.current_video_frame == 0) {
- cpi->ref_frame_flags = 0;
+ cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+ cpi->ref_frame_flags = VP9_LAST_FLAG;
+
+ if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
+ REF_FRAMES) {
+ cpi->gld_fb_idx =
+ cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
+ cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+ cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
} else {
- if (lc->current_video_frame_in_layer <
- (unsigned int)cpi->svc.number_temporal_layers)
- cpi->ref_frame_flags = VP9_GOLD_FLAG;
- else
- cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+ cpi->refresh_golden_frame = 0;
}
+ if (lc->current_video_frame_in_layer == 0)
+ cpi->ref_frame_flags = 0;
+
vp9_scale_references(cpi);
// Use either last frame or alt frame for motion search.
if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
- ref_frame = LAST_FRAME;
if (first_ref_buf == NULL)
first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
- } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
- first_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
- ref_frame = GOLDEN_FRAME;
- if (first_ref_buf == NULL)
- first_ref_buf = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ }
+
+ if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+ const int ref_idx =
+ cm->ref_frame_map[get_ref_frame_idx(cpi, GOLDEN_FRAME)];
+ const int scaled_idx = cpi->scaled_ref_idx[GOLDEN_FRAME - 1];
+
+ gld_yv12 = (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf :
+ get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ } else {
+ gld_yv12 = NULL;
}
recon_y_stride = new_yv12->y_stride;
recon_uv_stride = new_yv12->uv_stride;
uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
- // Disable golden frame for svc first pass for now.
- gld_yv12 = NULL;
- set_ref_ptrs(cm, xd, ref_frame, NONE);
+ set_ref_ptrs(cm, xd,
+                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE,
+ (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
&cpi->scaled_source);
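
The index arithmetic above encodes a fixed slot layout for the SVC first pass: LAST occupies slot spatial_layer_id, GOLDEN occupies slot number_spatial_layers + spatial_layer_id, and GOLDEN is dropped for the top layers once that sum reaches REF_FRAMES (8 in VP9). A hedged helper stating the same rule:

/* Illustrative only; mirrors the gld_fb_idx assignment above. */
static int svc_fp_golden_slot(int num_spatial_layers, int spatial_layer_id) {
  const int slot = num_spatial_layers + spatial_layer_id;
  return slot < 8 /* REF_FRAMES */ ? slot : -1;  /* -1: no GOLDEN slot */
}
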
@@ -581,7 +591,8 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
// Other than for the first frame do a motion search.
- if (cm->current_video_frame > 0) {
+ if ((lc == NULL && cm->current_video_frame > 0) ||
+ (lc != NULL && lc->current_video_frame_in_layer > 0)) {
int tmp_err, motion_error, raw_motion_error;
// Assume 0,0 motion with no mv overhead.
MV mv = {0, 0} , tmp_mv = {0, 0};
@@ -628,7 +639,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
}
// Search in an older reference frame.
- if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+ if (((lc == NULL && cm->current_video_frame > 1) ||
+ (lc != NULL && lc->current_video_frame_in_layer > 1))
+ && gld_yv12 != NULL) {
// Assume 0,0 motion with no mv overhead.
int gf_motion_error;
@@ -893,7 +906,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
// Special case for the first frame. Copy into the GF buffer as a second
// reference.
- if (cm->current_video_frame == 0 && gld_yv12 != NULL) {
+ if (cm->current_video_frame == 0 && gld_yv12 != NULL && lc == NULL) {
vp8_yv12_copy_frame(lst_yv12, gld_yv12);
}
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index d6f6b2563..a25dc61aa 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -320,23 +320,23 @@ int vp9_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
switch (whichdir) {
case 0:
CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
break;
case 1:
CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(up, tr - hstep, tc);
- CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
break;
case 2:
CHECK_BETTER(left, tr, tc - hstep);
- CHECK_BETTER(down, tr + hstep, tc);
- CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
break;
case 3:
CHECK_BETTER(right, tr, tc + hstep);
- CHECK_BETTER(down, tr + hstep, tc);
- CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
break;
}
} else {
@@ -648,11 +648,11 @@ static int vp9_pattern_search(const MACROBLOCK *x,
// Returns the one-away integer pel sad values around the best as follows:
// sad_list[0]: sad at the best integer pel
// sad_list[1]: sad at delta {0, -1} (left) from the best integer pel
- // sad_list[2]: sad at delta {-1, 0} (top) from the best integer pel
+ // sad_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
// sad_list[3]: sad at delta { 0, 1} (right) from the best integer pel
- // sad_list[4]: sad at delta { 1, 0} (bottom) from the best integer pel
+ // sad_list[4]: sad at delta {-1, 0} (top) from the best integer pel
if (sad_list) {
- static const MV neighbors[4] = {{0, -1}, {-1, 0}, {0, 1}, {1, 0}};
+ static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
sad_list[0] = bestsad;
if (check_bounds(x, br, bc, 1)) {
for (i = 0; i < 4; i++) {
@@ -660,7 +660,10 @@ static int vp9_pattern_search(const MACROBLOCK *x,
bc + neighbors[i].col};
sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
get_buf_from_mv(in_what, &this_mv),
- in_what->stride);
+ in_what->stride) +
+ (use_mvcost ?
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit) :
+ 0);
}
} else {
for (i = 0; i < 4; i++) {
@@ -671,7 +674,300 @@ static int vp9_pattern_search(const MACROBLOCK *x,
else
sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
get_buf_from_mv(in_what, &this_mv),
+ in_what->stride) +
+ (use_mvcost ?
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit) :
+ 0);
+ }
+ }
+ }
+ best_mv->row = br;
+ best_mv->col = bc;
+ return bestsad;
+}
+
+// A specialized version of vp9_pattern_search where the smallest-scale
+// search candidates are the four 1-away neighbors, and sad_list is non-NULL.
+// TODO(debargha): Merge this function with the one above. Also remove
+// use_mvcost option since it is always 1, to save unnecessary branches.
+static int vp9_pattern_search_sad(const MACROBLOCK *x,
+ MV *ref_mv,
+ int search_param,
+ int sad_per_bit,
+ int do_init_search,
+ int *sad_list,
+ const vp9_variance_fn_ptr_t *vfp,
+ int use_mvcost,
+ const MV *center_mv,
+ MV *best_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES]
+ [MAX_PATTERN_CANDIDATES]) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ int br, bc;
+ int bestsad = INT_MAX;
+ int thissad;
+ int k = -1;
+ const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+ int best_init_s = search_param_to_steps[search_param];
+  // Adjust ref_mv to make sure it is within the MV range.
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ br = ref_mv->row;
+ bc = ref_mv->col;
+ if (sad_list != NULL) {
+ sad_list[0] = sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] =
+ INT_MAX;
+ }
+
+ // Work out the start point for the search
+ bestsad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, ref_mv), in_what->stride) +
+ mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+
+  // Search all possible scales up to the search param around the center
+  // point; pick the scale of the best point as the starting scale for
+  // further steps around it.
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ int best_site = -1;
+ if (check_bounds(x, br, bc, 1 << t)) {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = {br + candidates[t][i].row,
+ bc + candidates[t][i].col};
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = {br + candidates[t][i].row,
+ bc + candidates[t][i].col};
+ if (!is_mv_in(x, &this_mv))
+ continue;
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
+ }
+ if (best_init_s != -1) {
+ br += candidates[best_init_s][k].row;
+ bc += candidates[best_init_s][k].col;
+ }
+ }
+
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ int do_sad = (num_candidates[0] == 4 && sad_list != NULL);
+ int best_site = -1;
+ s = best_init_s;
+
+ for (; s >= do_sad; s--) {
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(x, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = {br + candidates[s][i].row,
+ bc + candidates[s][i].col};
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = {br + candidates[s][i].row,
+ bc + candidates[s][i].col};
+ if (!is_mv_in(x, &this_mv))
+ continue;
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+ if (check_bounds(x, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col};
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col};
+ if (!is_mv_in(x, &this_mv))
+ continue;
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ } while (best_site != -1);
+ }
+
+ // Note: If we enter the if below, then sad_list must be non-NULL.
+ if (s == 0) {
+ sad_list[0] = bestsad;
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(x, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = {br + candidates[s][i].row,
+ bc + candidates[s][i].col};
+ sad_list[i + 1] =
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = {br + candidates[s][i].row,
+ bc + candidates[s][i].col};
+ if (!is_mv_in(x, &this_mv))
+ continue;
+ sad_list[i + 1] =
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+ while (best_site != -1) {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+ sad_list[1] = sad_list[2] = sad_list[3] = sad_list[4] = INT_MAX;
+ sad_list[((k + 2) % 4) + 1] = sad_list[0];
+ sad_list[0] = bestsad;
+
+ if (check_bounds(x, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col};
+ sad_list[next_chkpts_indices[i] + 1] =
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col};
+ if (!is_mv_in(x, &this_mv)) {
+ sad_list[next_chkpts_indices[i] + 1] = INT_MAX;
+ continue;
+ }
+ sad_list[next_chkpts_indices[i] + 1] =
+ thissad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ }
+ }
+ }
+
+ // Returns the one-away integer pel sad values around the best as follows:
+ // sad_list[0]: sad at the best integer pel
+ // sad_list[1]: sad at delta {0, -1} (left) from the best integer pel
+ // sad_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
+ // sad_list[3]: sad at delta { 0, 1} (right) from the best integer pel
+ // sad_list[4]: sad at delta {-1, 0} (top) from the best integer pel
+ if (sad_list) {
+ static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+ if (sad_list[0] == INT_MAX) {
+ sad_list[0] = bestsad;
+ if (check_bounds(x, br, bc, 1)) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = {br + neighbors[i].row,
+ bc + neighbors[i].col};
+ sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
in_what->stride);
+ }
+ } else {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = {br + neighbors[i].row,
+ bc + neighbors[i].col};
+ if (!is_mv_in(x, &this_mv))
+ sad_list[i + 1] = INT_MAX;
+ else
+ sad_list[i + 1] = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv),
+ in_what->stride);
+ }
+ }
+ } else {
+ if (use_mvcost) {
+ for (i = 0; i < 4; i++) {
+ const MV this_mv = {br + neighbors[i].row,
+ bc + neighbors[i].col};
+ if (sad_list[i + 1] != INT_MAX) {
+ sad_list[i + 1] +=
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ }
+ }
}
}
}
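
The reordered sad_list layout (best, left, bottom, right, top) is what the whichdir switch in vp9_find_best_sub_pixel_tree_pruned keys on: the pruned sub-pel search keeps only the quadrant whose two axis-aligned neighbors are cheapest. A hedged sketch of that selection (the helper is illustrative; the actual derivation lives in the sub-pel code):

/* sad_list layout: [0] best, [1] left, [2] bottom, [3] right, [4] top.
 * Bit 0 picks left (0) vs. right (1); bit 1 picks down (0) vs. up (2).
 * The 0..3 result lines up with the whichdir switch cases earlier in
 * this file. */
static int pick_quadrant(const int sad_list[5]) {
  return (sad_list[1] < sad_list[3] ? 0 : 1) +
         (sad_list[2] < sad_list[4] ? 0 : 2);
}
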
@@ -784,10 +1080,10 @@ int vp9_bigdia_search(const MACROBLOCK *x,
{{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
{-512, 512}, {-1024, 0}},
};
- return vp9_pattern_search(x, ref_mv, search_param, sad_per_bit,
- do_init_search, sad_list, vfp, use_mvcost,
- center_mv, best_mv,
- bigdia_num_candidates, bigdia_candidates);
+ return vp9_pattern_search_sad(x, ref_mv, search_param, sad_per_bit,
+ do_init_search, sad_list, vfp, use_mvcost,
+ center_mv, best_mv,
+ bigdia_num_candidates, bigdia_candidates);
}
int vp9_square_search(const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index d36548996..5557d7fe7 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -77,7 +77,6 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
while (filter_step > 0) {
const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
- int filt_err;
// Bias against raising loop filter in favor of lowering it.
int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
@@ -92,17 +91,14 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
if (filt_direction <= 0 && filt_low != filt_mid) {
// Get Low filter error score
if (ss_err[filt_low] < 0) {
- filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame);
- ss_err[filt_low] = filt_err;
- } else {
- filt_err = ss_err[filt_low];
+ ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
}
// If value is close to the best so far then bias towards a lower loop
// filter value.
- if ((filt_err - bias) < best_err) {
+ if ((ss_err[filt_low] - bias) < best_err) {
// Was it actually better than the previous best?
- if (filt_err < best_err)
- best_err = filt_err;
+ if (ss_err[filt_low] < best_err)
+ best_err = ss_err[filt_low];
filt_best = filt_low;
}
@@ -111,14 +107,11 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
// Now look at filt_high
if (filt_direction >= 0 && filt_high != filt_mid) {
if (ss_err[filt_high] < 0) {
- filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame);
- ss_err[filt_high] = filt_err;
- } else {
- filt_err = ss_err[filt_high];
+ ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
}
// Was it better than the previous best?
- if (filt_err < (best_err - bias)) {
- best_err = filt_err;
+ if (ss_err[filt_high] < (best_err - bias)) {
+ best_err = ss_err[filt_high];
filt_best = filt_high;
}
}
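
The restructuring above folds the temporary filt_err into the ss_err[] cache directly; the pattern is plain memoization over filter levels, as in this illustrative helper (try_level stands in for try_filter_frame):

/* ss_err[] entries start out negative, meaning "not yet evaluated". */
static int eval_filter_level_once(int level, int *ss_err,
                                  int (*try_level)(int)) {
  if (ss_err[level] < 0)
    ss_err[level] = try_level(level);
  return ss_err[level];
}
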
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index eababdbca..d49eb956f 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -19,9 +19,9 @@
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rd.h"
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
@@ -40,9 +40,9 @@ void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
*eob_ptr = eob + 1;
}
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr) {
const int rc = 0;
const int coeff = coeff_ptr[rc];
@@ -62,11 +62,11 @@ void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
*eob_ptr = eob + 1;
}
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -78,13 +78,13 @@ void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
(void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
- for (i = 0; i < count; i++) {
+ for (i = 0; i < n_coeffs; i++) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
@@ -105,12 +105,12 @@ void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
// TODO(jingning) Refactor this file and combine functions with similar
// operations.
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -120,8 +120,8 @@ void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
(void)zbin_oq_value;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+ vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
for (i = 0; i < n_coeffs; i++) {
@@ -146,27 +146,27 @@ void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = eob + 1;
}
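
For reference, the per-coefficient step inside these loops is a rounded multiply-shift with the sign folded back in afterwards; quant is roughly a Q16 reciprocal of the step size and dequant the step size itself. A one-coefficient sketch (minimal, omitting the clamping the real code performs):

#include <stdint.h>

static void quantize_one(int coeff, int round, int quant, int dequant,
                         int16_t *qcoeff, int16_t *dqcoeff) {
  const int coeff_sign = (coeff >> 31);               /* 0 or -1 */
  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
  const int tmp = ((abs_coeff + round) * quant) >> 16;
  *qcoeff = (int16_t)((tmp ^ coeff_sign) - coeff_sign);
  *dqcoeff = (int16_t)(*qcoeff * dequant);
}
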
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
+void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- int i, non_zero_count = (int)count, eob = -1;
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
zbin_ptr[1] + zbin_oq_value };
const int nzbins[2] = { zbins[0] * -1,
zbins[1] * -1 };
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
- for (i = (int)count - 1; i >= 0; i--) {
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
const int rc = scan[i];
const int coeff = coeff_ptr[rc];
@@ -199,12 +199,12 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
*eob_ptr = eob + 1;
}
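
The "Pre-scan pass" walks the scan order backwards, trimming trailing coefficients that sit inside the dead zone (|coeff| < zbin) so the main quantize loop can stop early. The idea, as an illustrative standalone helper:

#include <stdint.h>

static int prescan_nonzero_count(const int16_t *coeff, const int16_t *scan,
                                 int n_coeffs, const int zbins[2]) {
  int i;
  for (i = n_coeffs - 1; i >= 0; i--) {
    const int rc = scan[i];
    const int band = (rc != 0);  /* DC uses zbins[0], AC uses zbins[1] */
    if (coeff[rc] >= zbins[band] || coeff[rc] <= -zbins[band])
      break;                     /* first coefficient outside the dead zone */
  }
  return i + 1;                  /* count that can still be non-zero */
}
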
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
const int16_t *quant_shift_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
@@ -217,8 +217,8 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
int i, eob = -1;
(void)iscan;
- vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+ vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (!skip_block) {
// Pre-scan pass
@@ -280,13 +280,19 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) {
*shift = 1 << (16 - l);
}
+static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
+ int quant = vp9_dc_quant(q, 0);
+ (void) bit_depth;
+ return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+}
+
void vp9_init_quantizer(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
QUANTS *const quants = &cpi->quants;
int i, q, quant;
for (q = 0; q < QINDEX_RANGE; q++) {
- const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);
+ const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
const int qrounding_factor = q == 0 ? 64 : 48;
for (i = 0; i < 2; ++i) {
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 262529b05..d7edb0bdc 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -37,17 +37,29 @@ typedef struct {
DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
} QUANTS;
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
const int16_t *round_ptr, const int16_t quant_ptr,
- int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t dequant_ptr, uint16_t *eob_ptr);
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr, const int16_t quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+ const int16_t *round_ptr,
+ const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t dequant_ptr, uint16_t *eob_ptr);
+#endif
+
struct VP9_COMP;
struct VP9Common;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 939351d2d..9df85defd 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -249,7 +249,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
*out_dist_sum = dist_sum << 4;
}
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
intptr_t block_size, int64_t *ssz) {
int i;
int64_t error = 0, sqcoeff = 0;
@@ -288,7 +288,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
const PLANE_TYPE type = pd->plane_type;
const int16_t *band_count = &band_counts[tx_size][1];
const int eob = p->eobs[block];
- const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
x->token_costs[tx_size][type][is_inter_block(mbmi)];
uint8_t token_cache[32 * 32];
@@ -358,8 +358,8 @@ static void dist_block(int plane, int block, TX_SIZE tx_size,
const struct macroblockd_plane *const pd = &xd->plane[plane];
int64_t this_sse;
int shift = tx_size == TX_32X32 ? 0 : 2;
- int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
&this_sse) >> shift;
args->sse = this_sse >> shift;
@@ -405,8 +405,8 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
dist_block(plane, block, tx_size, args);
} else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
// compute DC coefficient
- int16_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
- int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
args->sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
args->dist = args->sse;
@@ -690,7 +690,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
p->src_diff);
- int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, block, 1,
TX_4X4, mode,
@@ -1137,7 +1137,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
for (idy = 0; idy < height / 4; ++idy) {
for (idx = 0; idx < width / 4; ++idx) {
int64_t ssz, rd, rd1, rd2;
- int16_t* coeff;
+ tran_low_t* coeff;
k += (idy * 2 + idx);
coeff = BLOCK_OFFSET(p->coeff, k);
diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c
index d06263676..cee6ce140 100644
--- a/vp9/encoder/vp9_sad.c
+++ b/vp9/encoder/vp9_sad.c
@@ -14,6 +14,9 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
#include "vp9/encoder/vp9_variance.h"
static INLINE unsigned int sad(const uint8_t *a, int a_stride,
@@ -131,3 +134,138 @@ sadMxN(4, 4)
sadMxNxK(4, 4, 3)
sadMxNxK(4, 4, 8)
sadMxNx4D(4, 4)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int high_sad(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+static INLINE unsigned int high_sadb(const uint8_t *a8, int a_stride,
+ const uint16_t *b, int b_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += abs(a[x] - b[x]);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+#define high_sadMxN(m, n) \
+unsigned int vp9_high_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return high_sad(src, src_stride, ref, ref_stride, m, n); \
+} \
+unsigned int vp9_high_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint16_t comp_pred[m * n]; \
+ vp9_high_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ return high_sadb(src, src_stride, comp_pred, m, m, n); \
+}
+
+#define high_sadMxNxK(m, n, k) \
+void vp9_high_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sads) { \
+ int i; \
+ for (i = 0; i < k; ++i) \
+ sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
+}
+
+#define high_sadMxNx4D(m, n) \
+void vp9_high_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const refs[], \
+ int ref_stride, unsigned int *sads) { \
+ int i; \
+ for (i = 0; i < 4; ++i) \
+ sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
+}
+
+// 64x64
+high_sadMxN(64, 64)
+high_sadMxNxK(64, 64, 3)
+high_sadMxNxK(64, 64, 8)
+high_sadMxNx4D(64, 64)
+
+// 64x32
+high_sadMxN(64, 32)
+high_sadMxNx4D(64, 32)
+
+// 32x64
+high_sadMxN(32, 64)
+high_sadMxNx4D(32, 64)
+
+// 32x32
+high_sadMxN(32, 32)
+high_sadMxNxK(32, 32, 3)
+high_sadMxNxK(32, 32, 8)
+high_sadMxNx4D(32, 32)
+
+// 32x16
+high_sadMxN(32, 16)
+high_sadMxNx4D(32, 16)
+
+// 16x32
+high_sadMxN(16, 32)
+high_sadMxNx4D(16, 32)
+
+// 16x16
+high_sadMxN(16, 16)
+high_sadMxNxK(16, 16, 3)
+high_sadMxNxK(16, 16, 8)
+high_sadMxNx4D(16, 16)
+
+// 16x8
+high_sadMxN(16, 8)
+high_sadMxNxK(16, 8, 3)
+high_sadMxNxK(16, 8, 8)
+high_sadMxNx4D(16, 8)
+
+// 8x16
+high_sadMxN(8, 16)
+high_sadMxNxK(8, 16, 3)
+high_sadMxNxK(8, 16, 8)
+high_sadMxNx4D(8, 16)
+
+// 8x8
+high_sadMxN(8, 8)
+high_sadMxNxK(8, 8, 3)
+high_sadMxNxK(8, 8, 8)
+high_sadMxNx4D(8, 8)
+
+// 8x4
+high_sadMxN(8, 4)
+high_sadMxNxK(8, 4, 8)
+high_sadMxNx4D(8, 4)
+
+// 4x8
+high_sadMxN(4, 8)
+high_sadMxNxK(4, 8, 8)
+high_sadMxNx4D(4, 8)
+
+// 4x4
+high_sadMxN(4, 4)
+high_sadMxNxK(4, 4, 3)
+high_sadMxNxK(4, 4, 8)
+high_sadMxNx4D(4, 4)
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
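
All of these entry points keep the uint8_t* prototypes even though high-bitdepth pixels are uint16_t; CONVERT_TO_SHORTPTR undoes the pointer packing on the way in. A caller holding real uint16_t buffers would apply the inverse macro, along these lines (assuming CONVERT_TO_BYTEPTR, the counterpart declared next to CONVERT_TO_SHORTPTR in vp9_common.h):

static unsigned int sad8x8_from_shorts(uint16_t *src16, int src_stride,
                                       uint16_t *ref16, int ref_stride) {
  /* Pack the uint16_t pointers into the uint8_t* convention expected by
   * the vp9_high_sad* entry points generated above. */
  return vp9_high_sad8x8_c(CONVERT_TO_BYTEPTR(src16), src_stride,
                           CONVERT_TO_BYTEPTR(ref16), ref_stride);
}
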
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index d9ac12262..889a8be21 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -67,15 +67,11 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
: USE_LARGESTALL;
if (MIN(cm->width, cm->height) >= 720) {
- sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
- sf->last_partitioning_redo_frequency = 3;
sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
: DISABLE_ALL_INTER_SPLIT;
sf->adaptive_pred_interp_filter = 0;
} else {
sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
- sf->last_partitioning_redo_frequency = 2;
- sf->lf_motion_threshold = NO_MOTION_THRESHOLD;
}
sf->reference_masking = 1;
@@ -86,7 +82,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->disable_filter_search_var_thresh = 100;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->auto_min_max_partition_size = CONSTRAIN_NEIGHBORING_MIN_MAX;
- sf->adjust_partitioning_from_last_frame = 1;
if (MIN(cm->width, cm->height) >= 720)
sf->partition_search_breakout_dist_thr = (1 << 24);
@@ -110,8 +105,6 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->cb_pred_filter_search = 1;
sf->alt_ref_search_fp = 1;
sf->motion_field_mode_search = !boosted;
- sf->lf_motion_threshold = LOW_MOTION_THRESHOLD;
- sf->last_partitioning_redo_frequency = 2;
sf->recode_loop = ALLOW_RECODE_KFMAXBW;
sf->adaptive_rd_thresh = 3;
sf->mode_skip_start = 6;
@@ -130,11 +123,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
sf->use_square_partition_only = 1;
sf->tx_size_search_method = USE_LARGESTALL;
sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->mv.search_method = BIGDIA;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
sf->adaptive_rd_thresh = 4;
sf->mode_search_skip_flags |= FLAG_SKIP_COMP_REFMISMATCH |
FLAG_EARLY_TERMINATE;
sf->disable_filter_search_var_thresh = 200;
- sf->use_lastframe_partitioning = LAST_FRAME_PARTITION_ALL;
sf->use_lp32x32fdct = 1;
sf->use_fast_coef_updates = ONE_LOOP_REDUCED;
sf->use_fast_coef_costing = 1;
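
These presets are not set directly by applications; they are keyed off the cpu-used value passed through the control interface, which decides how far down set_good_speed_feature() execution walks. A caller-side sketch (set_speed is a hypothetical helper; the control id is the standard one):

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Higher cpu-used values reach the preset above that now selects BIGDIA
 * motion search and the pruned subpel tree. */
static vpx_codec_err_t set_speed(vpx_codec_ctx_t *codec, int cpu_used) {
  return vpx_codec_control(codec, VP8E_SET_CPUUSED, cpu_used);
}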
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 1fc43a427..d180d1a8c 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -36,6 +36,7 @@ typedef struct {
int gold_ref_idx;
int has_alt_frame;
size_t layer_size;
+ struct vpx_psnr_pkt psnr_pkt;
} LAYER_CONTEXT;
typedef struct {
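
The new psnr_pkt field holds the per-layer samples/SSE/PSNR values (struct vpx_psnr_pkt from vpx/vpx_encoder.h) so they can be emitted as a VPX_CODEC_SPATIAL_SVC_LAYER_PSNR packet in vp9_cx_iface.c further down. The SSE-to-dB conversion behind those numbers has the usual form; a sketch, assuming the 100 dB cap libvpx applies to lossless matches (sse_to_psnr is a hypothetical helper):

#include <math.h>
#include <stdint.h>

static double sse_to_psnr(uint64_t samples, uint64_t sse, double peak) {
  if (sse == 0)
    return 100.0;  /* identical content: clamp instead of dividing by zero */
  return 10.0 * log10((peak * peak) / ((double)sse / (double)samples));
}

/* e.g. sse_to_psnr(w * h, sse, 255.0) for 8-bit, or 1023.0 at 10-bit depth */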
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 6068b85a0..263883f66 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -212,7 +212,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
TOKENEXTRA *t = *tp; /* store tokens starting here */
int eob = p->eobs[block];
const PLANE_TYPE type = pd->plane_type;
- const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
const int segment_id = mbmi->segment_id;
const int16_t *scan, *nb;
const scan_order *so;
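
This is the coefficient-type switch seen throughout the patch: tran_low_t widens quantized/dequantized coefficient storage only when high bit depth is compiled in. The intent, sketched (the actual typedef lives in vp9/common/vp9_idct.h):

#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t;  /* 12-bit input needs up to 20 significant bits */
#else
typedef int16_t tran_low_t;  /* 8-bit input fits the original 16 bits */
#endif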
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index afbb191ad..c97f93fda 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -267,3 +267,375 @@ void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
ref += ref_stride;
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, uint64_t *sse,
+ uint64_t *sum) {
+ int i, j;
+
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+void high_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse,
+ int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (unsigned int)sse_long;
+ *sum = (int)sum_long;
+}
+
+void high_10_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse,
+ int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+void high_12_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h, unsigned int *sse,
+ int *sum) {
+ uint64_t sse_long = 0;
+ uint64_t sum_long = 0;
+ high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+static void high_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ unsigned int i, j;
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ output_ptr[j] =
+ ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+ (int)src_ptr[pixel_step] * vp9_filter[1],
+ FILTER_BITS);
+
+ src_ptr++;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+static void high_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr,
+ uint16_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; i++) {
+ for (j = 0; j < output_width; j++) {
+ output_ptr[j] =
+ ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+ (int)src_ptr[pixel_step] * vp9_filter[1],
+ FILTER_BITS);
+ src_ptr++;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+#define HIGH_VAR(W, H) \
+unsigned int vp9_high_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_high_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_high_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define HIGH_SUBPIX_VAR(W, H) \
+unsigned int vp9_high_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+ dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+ dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+ dst_stride, sse); \
+}
+
+#define HIGH_SUBPIX_AVG_VAR(W, H) \
+unsigned int vp9_high_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+ W); \
+\
+ return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+ W); \
+\
+ return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, \
+ int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, \
+ unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+ high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+ W, BILINEAR_FILTERS_2TAP(xoffset)); \
+ high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+ vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+ W); \
+\
+ return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+}
+
+#define HIGH_GET_VAR(S) \
+void vp9_high_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse, int *sum) { \
+ high_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse, int *sum) { \
+ high_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse, int *sum) { \
+ high_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+
+#define HIGH_MSE(W, H) \
+unsigned int vp9_high_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+} \
+\
+unsigned int vp9_high_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+} \
+\
+unsigned int vp9_high_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ high_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+}
+
+HIGH_GET_VAR(8)
+HIGH_GET_VAR(16)
+
+HIGH_MSE(16, 16)
+HIGH_MSE(16, 8)
+HIGH_MSE(8, 16)
+HIGH_MSE(8, 8)
+
+HIGH_VAR(4, 4)
+HIGH_SUBPIX_VAR(4, 4)
+HIGH_SUBPIX_AVG_VAR(4, 4)
+
+HIGH_VAR(4, 8)
+HIGH_SUBPIX_VAR(4, 8)
+HIGH_SUBPIX_AVG_VAR(4, 8)
+
+HIGH_VAR(8, 4)
+HIGH_SUBPIX_VAR(8, 4)
+HIGH_SUBPIX_AVG_VAR(8, 4)
+
+HIGH_VAR(8, 8)
+HIGH_SUBPIX_VAR(8, 8)
+HIGH_SUBPIX_AVG_VAR(8, 8)
+
+HIGH_VAR(8, 16)
+HIGH_SUBPIX_VAR(8, 16)
+HIGH_SUBPIX_AVG_VAR(8, 16)
+
+HIGH_VAR(16, 8)
+HIGH_SUBPIX_VAR(16, 8)
+HIGH_SUBPIX_AVG_VAR(16, 8)
+
+HIGH_VAR(16, 16)
+HIGH_SUBPIX_VAR(16, 16)
+HIGH_SUBPIX_AVG_VAR(16, 16)
+
+HIGH_VAR(16, 32)
+HIGH_SUBPIX_VAR(16, 32)
+HIGH_SUBPIX_AVG_VAR(16, 32)
+
+HIGH_VAR(32, 16)
+HIGH_SUBPIX_VAR(32, 16)
+HIGH_SUBPIX_AVG_VAR(32, 16)
+
+HIGH_VAR(32, 32)
+HIGH_SUBPIX_VAR(32, 32)
+HIGH_SUBPIX_AVG_VAR(32, 32)
+
+HIGH_VAR(32, 64)
+HIGH_SUBPIX_VAR(32, 64)
+HIGH_SUBPIX_AVG_VAR(32, 64)
+
+HIGH_VAR(64, 32)
+HIGH_SUBPIX_VAR(64, 32)
+HIGH_SUBPIX_AVG_VAR(64, 32)
+
+HIGH_VAR(64, 64)
+HIGH_SUBPIX_VAR(64, 64)
+HIGH_SUBPIX_AVG_VAR(64, 64)
+
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ int i, j;
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
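
The shifts in high_10_variance() and high_12_variance() are scale normalization: samples at bit depth bd carry (bd - 8) extra bits, so sums grow by 2^(bd-8) and squared sums by 2^(2*(bd-8)), and shifting them back keeps sse/sum in the range the existing 8-bit rate-distortion code expects. The pattern, factored out as a sketch (normalize_to_8bit is hypothetical; bd must be 10 or 12 here because ROUND_POWER_OF_TWO needs a positive shift count):

static void normalize_to_8bit(uint64_t sum_long, uint64_t sse_long, int bd,
                              int *sum, unsigned int *sse) {
  const int shift = bd - 8;  /* 2 at 10-bit, 4 at 12-bit */
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, shift);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 2 * shift);
}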
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 4a194b72c..c51d08d04 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -22,6 +22,23 @@ void variance(const uint8_t *a, int a_stride,
int w, int h,
unsigned int *sse, int *sum);
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h,
+ unsigned int *sse, int *sum);
+
+void high_10_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h,
+ unsigned int *sse, int *sum);
+
+void high_12_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int w, int h,
+ unsigned int *sse, int *sum);
+#endif
+
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -81,6 +98,11 @@ typedef struct vp9_variance_vtable {
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride);
+#endif
+
#ifdef __cplusplus
} // extern "C"
#endif
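
These prototypes are consumed through the encoder's per-block-size function table. A sketch of what one registration might look like (field names follow the vp9_variance_vtable declaration above; the typedef name vp9_variance_fn_ptr_t and the exact wiring in the encoder setup code are assumptions):

static void use_high_bitdepth_16x16(vp9_variance_fn_ptr_t *fp) {
  fp->sdf = vp9_high_sad16x16_c;               /* plain SAD */
  fp->sdaf = vp9_high_sad16x16_avg_c;          /* SAD against averaged pred */
  fp->vf = vp9_high_variance16x16_c;
  fp->svf = vp9_high_sub_pixel_variance16x16_c;
  fp->svaf = vp9_high_sub_pixel_avg_variance16x16_c;
  fp->sdx3f = vp9_high_sad16x16x3_c;           /* 3 candidate refs at once */
  fp->sdx8f = vp9_high_sad16x16x8_c;
  fp->sdx4df = vp9_high_sad16x16x4d_c;
}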
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 0f0b7a5ab..4b24960ef 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -555,7 +555,7 @@ static vpx_codec_err_t ctrl_set_enable_auto_alt_ref(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx,
va_list args) {
struct vp9_extracfg extra_cfg = ctx->extra_cfg;
- extra_cfg.noise_sensitivity = CAST(VP8E_SET_NOISE_SENSITIVITY, args);
+ extra_cfg.noise_sensitivity = CAST(VP9E_SET_NOISE_SENSITIVITY, args);
return update_extra_cfg(ctx, &extra_cfg);
}
@@ -686,6 +686,10 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
if (res == VPX_CODEC_OK) {
set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+#if CONFIG_VP9_HIGHBITDEPTH
+ priv->oxcf.use_highbitdepth =
+ (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+#endif
priv->cpi = vp9_create_compressor(&priv->oxcf);
if (priv->cpi == NULL)
res = VPX_CODEC_MEM_ERROR;
@@ -981,15 +985,20 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
cx_data_sz -= size;
#if CONFIG_SPATIAL_SVC
if (is_two_pass_svc(cpi)) {
- vpx_codec_cx_pkt_t pkt;
+ vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
int i;
- vp9_zero(pkt);
- pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+ vp9_zero(pkt_sizes);
+ vp9_zero(pkt_psnr);
+ pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+ pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR;
for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
- pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size;
- cpi->svc.layer_context[i].layer_size = 0;
+ LAYER_CONTEXT *lc = &cpi->svc.layer_context[i];
+ pkt_sizes.data.layer_sizes[i] = lc->layer_size;
+ pkt_psnr.data.layer_psnr[i] = lc->psnr_pkt;
+ lc->layer_size = 0;
}
- vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
+ vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
}
#endif
}
@@ -1231,7 +1240,6 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{VP8E_SET_ACTIVEMAP, ctrl_set_active_map},
{VP8E_SET_SCALEMODE, ctrl_set_scale_mode},
{VP8E_SET_CPUUSED, ctrl_set_cpuused},
- {VP8E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity},
{VP8E_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref},
{VP8E_SET_SHARPNESS, ctrl_set_sharpness},
{VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh},
@@ -1251,6 +1259,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{VP9E_SET_SVC_PARAMETERS, ctrl_set_svc_parameters},
{VP9E_SET_SVC_LAYER_ID, ctrl_set_svc_layer_id},
{VP9E_SET_TUNE_CONTENT, ctrl_set_tune_content},
+ {VP9E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity},
// Getters
{VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer},
@@ -1333,6 +1342,9 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
CODEC_INTERFACE(vpx_codec_vp9_cx) = {
"WebM Project VP9 Encoder" VERSION_STRING,
VPX_CODEC_INTERNAL_ABI_VERSION,
+#if CONFIG_VP9_HIGHBITDEPTH
+ VPX_CODEC_CAP_HIGHBITDEPTH |
+#endif
VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR, // vpx_codec_caps_t
encoder_init, // vpx_codec_init_fn_t
encoder_destroy, // vpx_codec_destroy_fn_t
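
Caller-side effect of the interface changes above, sketched: the high-bitdepth path is reachable only when the encoder is opened with the VPX_CODEC_USE_HIGHBITDEPTH init flag (checked in encoder_init() above), and noise sensitivity now answers to the VP9-specific control id rather than the VP8 one (open_hbd_encoder is a hypothetical helper):

#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

static vpx_codec_err_t open_hbd_encoder(vpx_codec_ctx_t *codec,
                                        vpx_codec_enc_cfg_t *cfg) {
  vpx_codec_err_t res = vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), cfg,
                                           VPX_CODEC_USE_HIGHBITDEPTH);
  if (res != VPX_CODEC_OK)
    return res;
  /* VP8E_SET_NOISE_SENSITIVITY no longer maps to a VP9 handler. */
  return vpx_codec_control(codec, VP9E_SET_NOISE_SENSITIVITY, 0);
}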