Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/mips/dspr2/vp9_common_dspr2.h | 3
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans16_dspr2.c | 1315
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c | 1073
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans32_dspr2.c | 1013
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans4_dspr2.c | 438
-rw-r--r--  vp9/common/mips/dspr2/vp9_itrans8_dspr2.c | 745
-rw-r--r--  vp9/common/vp9_alloccommon.c | 28
-rw-r--r--  vp9/common/vp9_blockd.h | 42
-rw-r--r--  vp9/common/vp9_common.h | 8
-rw-r--r--  vp9/common/vp9_entropy.c | 18
-rw-r--r--  vp9/common/vp9_entropy.h | 16
-rw-r--r--  vp9/common/vp9_entropymode.c | 167
-rw-r--r--  vp9/common/vp9_entropymode.h | 4
-rw-r--r--  vp9/common/vp9_entropymv.c | 61
-rw-r--r--  vp9/common/vp9_enums.h | 11
-rw-r--r--  vp9/common/vp9_filter.c | 24
-rw-r--r--  vp9/common/vp9_filter.h | 5
-rw-r--r--  vp9/common/vp9_findnearmv.c | 3
-rw-r--r--  vp9/common/vp9_findnearmv.h | 46
-rw-r--r--  vp9/common/vp9_idct.c | 28
-rw-r--r--  vp9/common/vp9_loopfilter.c | 44
-rw-r--r--  vp9/common/vp9_loopfilter.h | 13
-rw-r--r--  vp9/common/vp9_mvref_common.c | 16
-rw-r--r--  vp9/common/vp9_mvref_common.h | 4
-rw-r--r--  vp9/common/vp9_onyxc_int.h | 106
-rw-r--r--  vp9/common/vp9_pred_common.c | 8
-rw-r--r--  vp9/common/vp9_pred_common.h | 14
-rw-r--r--  vp9/common/vp9_reconinter.c | 78
-rw-r--r--  vp9/common/vp9_reconinter.h | 8
-rw-r--r--  vp9/common/vp9_reconintra.c | 2
-rw-r--r--  vp9/common/vp9_reconintra.h | 8
-rw-r--r--  vp9/common/vp9_rtcd_defs.sh | 72
-rw-r--r--  vp9/common/vp9_scale.c | 115
-rw-r--r--  vp9/common/vp9_scale.h | 28
-rw-r--r--  vp9/common/vp9_scan.h | 3
-rw-r--r--  vp9/common/vp9_tile_common.c | 21
-rw-r--r--  vp9/common/vp9_tile_common.h | 12
-rw-r--r--  vp9/common/vp9_treecoder.c | 23
-rw-r--r--  vp9/common/vp9_treecoder.h | 41
-rw-r--r--  vp9/common/x86/vp9_idct_intrin_sse2.c | 1041
-rw-r--r--  vp9/common/x86/vp9_intrapred_ssse3.asm | 2
-rw-r--r--  vp9/common/x86/vp9_loopfilter_intrin_avx2.c | 943
-rw-r--r--  vp9/decoder/vp9_dboolhuff.h | 2
-rw-r--r--  vp9/decoder/vp9_decodemv.c | 42
-rw-r--r--  vp9/decoder/vp9_decodemv.h | 3
-rw-r--r--  vp9/decoder/vp9_decodframe.c | 736
-rw-r--r--  vp9/decoder/vp9_detokenize.c | 61
-rw-r--r--  vp9/decoder/vp9_detokenize.h | 7
-rw-r--r--  vp9/decoder/vp9_onyxd_if.c | 39
-rw-r--r--  vp9/decoder/vp9_onyxd_int.h | 8
-rw-r--r--  vp9/encoder/vp9_bitstream.c | 200
-rw-r--r--  vp9/encoder/vp9_block.h | 9
-rw-r--r--  vp9/encoder/vp9_dct.c | 67
-rw-r--r--  vp9/encoder/vp9_dct.h | 24
-rw-r--r--  vp9/encoder/vp9_encodeframe.c | 253
-rw-r--r--  vp9/encoder/vp9_encodemb.c | 53
-rw-r--r--  vp9/encoder/vp9_encodemv.c | 44
-rw-r--r--  vp9/encoder/vp9_firstpass.c | 17
-rw-r--r--  vp9/encoder/vp9_mbgraph.c | 4
-rw-r--r--  vp9/encoder/vp9_modecosts.c | 2
-rw-r--r--  vp9/encoder/vp9_onyx_if.c | 68
-rw-r--r--  vp9/encoder/vp9_onyx_int.h | 31
-rw-r--r--  vp9/encoder/vp9_quantize.c | 57
-rw-r--r--  vp9/encoder/vp9_quantize.h | 5
-rw-r--r--  vp9/encoder/vp9_rdopt.c | 172
-rw-r--r--  vp9/encoder/vp9_rdopt.h | 18
-rw-r--r--  vp9/encoder/vp9_segmentation.c | 36
-rw-r--r--  vp9/encoder/vp9_subexp.c | 2
-rw-r--r--  vp9/encoder/vp9_temporal_filter.c | 25
-rw-r--r--  vp9/encoder/vp9_tokenize.c | 156
-rw-r--r--  vp9/encoder/vp9_tokenize.h | 10
-rw-r--r--  vp9/encoder/vp9_vaq.c | 4
-rw-r--r--  vp9/encoder/x86/vp9_dct32x32_sse2.c | 20
-rw-r--r--  vp9/encoder/x86/vp9_dct_sse2.c | 42
-rw-r--r--  vp9/vp9_common.mk | 6
-rw-r--r--  vp9/vp9_cx_iface.c | 34
-rw-r--r--  vp9/vp9_dx_iface.c | 6
-rw-r--r--  vp9/vp9cx.mk | 1
78 files changed, 7996 insertions, 1918 deletions
diff --git a/vp9/common/mips/dspr2/vp9_common_dspr2.h b/vp9/common/mips/dspr2/vp9_common_dspr2.h
index dc88f1603..644264f65 100644
--- a/vp9/common/mips/dspr2/vp9_common_dspr2.h
+++ b/vp9/common/mips/dspr2/vp9_common_dspr2.h
@@ -81,6 +81,9 @@ static INLINE void vp9_prefetch_store_streamed(unsigned char *dst) {
);
}
+void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride);
+
void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
new file mode 100644
index 000000000..1b2f5506a
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c
@@ -0,0 +1,1315 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
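+/* The cospi_*_64 constants used throughout this file come from
+   vp9/common/vp9_idct.h: Q14 fixed-point cosines,
+   cospi_n_64 ~= round(16384 * cos(n * pi / 64)). */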
+static void idct16_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_10, step1_11, step1_12, step1_13;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
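+  /* 8192 = 1 << (DCT_CONST_BITS - 1): the dct_const_round_shift()
+     rounding offset, preloaded into each accumulator so that the
+     following madd/msub + extp sequences produce rounded results. */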
+
+ for (i = no_rows; i--; ) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+ [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+ [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+ [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+ [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step2_12] \n\t"
+ "add %[load5], %[load5], %[step2_15] \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step2_13] \n\t"
+ "add %[load6], %[load6], %[step2_14] \n\t"
+ "sh %[load5], 0(%[output]) \n\t"
+ "sh %[load6], 32(%[output]) \n\t"
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "add %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+ "add %[load6], %[load6], %[step2_11] \n\t"
+ "sh %[load5], 192(%[output]) \n\t"
+ "sh %[load6], 224(%[output]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "sub %[load5], %[load5], %[step2_11] \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step2_9] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "sh %[load5], 256(%[output]) \n\t"
+ "sh %[load6], 288(%[output]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_14] \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_15] \n\t"
+ "sh %[load5], 448(%[output]) \n\t"
+ "sh %[load6], 480(%[output]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6)
+ : [output] "r" (output),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+ [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+ [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+ [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
+ );
+
+ __asm__ __volatile__ (
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sh %[load5], 64(%[output]) \n\t"
+ "sh %[load6], 96(%[output]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sh %[load5], 128(%[output]) \n\t"
+ "sh %[load6], 160(%[output]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sh %[load5], 320(%[output]) \n\t"
+ "sh %[load6], 352(%[output]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sh %[load5], 384(%[output]) \n\t"
+ "sh %[load6], 416(%[output]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6)
+ : [output] "r" (output),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
+ );
+
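+    /* Advance to the next input row. The output pointer moves by only
+       one int16_t because the results above were stored at a 16-element
+       (32-byte) stride, leaving the 16x16 intermediate buffer transposed
+       and ready for the column pass. */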
+ input += 16;
+ output += 1;
+ }
+}
+
+static void idct16_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int i;
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int step1_8, step1_9, step1_10, step1_11;
+ int step1_12, step1_13, step1_14, step1_15;
+ int step2_0, step2_1, step2_2, step2_3;
+ int step2_8, step2_9, step2_10, step2_11;
+ int step2_12, step2_13, step2_14, step2_15;
+ int load1, load2, load3, load4, load5, load6, load7, load8;
+ int result1, result2, result3, result4;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 16; ++i) {
+ dest_pix = (dest + i);
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 16(%[input]) \n\t"
+ "lh %[load3], 8(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[step2_0], $ac1, 31 \n\t"
+ "extp %[step2_1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[step2_2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[step2_3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[step2_0], %[step2_3] \n\t"
+ "add %[step1_1], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
+ "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
+ [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 2(%[input]) \n\t"
+ "lh %[load6], 30(%[input]) \n\t"
+ "lh %[load7], 18(%[input]) \n\t"
+ "lh %[load8], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_2_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_14_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_18_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_14_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_2_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_30_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "sub %[load5], %[result1], %[result2] \n\t"
+ "sub %[load6], %[result4], %[result3] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load6], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load5], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load5], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[result1], %[result2] \n\t"
+ "add %[step2_15], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
+ [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 22(%[input]) \n\t"
+ "lh %[load3], 26(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_26_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_22_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac2, %[load4], %[cospi_6_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[result2], %[result1] \n\t"
+ "sub %[load2], %[result4], %[result3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[result1], %[result2] \n\t"
+ "add %[step2_12], %[result4], %[result3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load5], 4(%[input]) \n\t"
+ "lh %[load6], 28(%[input]) \n\t"
+ "lh %[load7], 20(%[input]) \n\t"
+ "lh %[load8], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load5], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load6], %[cospi_4_64] \n\t"
+ "extp %[result1], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load7], %[cospi_12_64] \n\t"
+ "msub $ac3, %[load8], %[cospi_20_64] \n\t"
+ "extp %[result2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac1, %[load7], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load8], %[cospi_12_64] \n\t"
+ "extp %[result3], $ac1, 31 \n\t"
+
+ "madd $ac2, %[load5], %[cospi_4_64] \n\t"
+ "madd $ac2, %[load6], %[cospi_28_64] \n\t"
+ "extp %[result4], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[result4], %[result3] \n\t"
+ "sub %[load5], %[load5], %[result1] \n\t"
+ "add %[load5], %[load5], %[result2] \n\t"
+
+ "sub %[load6], %[result1], %[result2] \n\t"
+ "sub %[load6], %[load6], %[result3] \n\t"
+ "add %[load6], %[load6], %[result4] \n\t"
+
+ "madd $ac1, %[load5], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+
+ "add %[step1_4], %[result1], %[result2] \n\t"
+ "add %[step1_7], %[result4], %[result3] \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [load7] "=&r" (load7), [load8] "=&r" (load8),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [result3] "=&r" (result3), [result4] "=&r" (result4),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "sub %[load5], %[step2_14], %[step2_13] \n\t"
+ "sub %[load5], %[load5], %[step2_9] \n\t"
+ "add %[load5], %[load5], %[step2_10] \n\t"
+
+ "madd $ac0, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_14], %[step2_13] \n\t"
+ "sub %[load6], %[load6], %[step2_10] \n\t"
+ "add %[load6], %[load6], %[step2_9] \n\t"
+
+ "madd $ac1, %[load6], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load5], %[step2_15], %[step2_12] \n\t"
+ "sub %[load5], %[load5], %[step2_8] \n\t"
+ "add %[load5], %[load5], %[step2_11] \n\t"
+
+ "madd $ac2, %[load5], %[cospi_16_64] \n\t"
+
+ "sub %[load6], %[step2_15], %[step2_12] \n\t"
+ "sub %[load6], %[load6], %[step2_11] \n\t"
+ "add %[load6], %[load6], %[step2_8] \n\t"
+
+ "madd $ac3, %[load6], %[cospi_16_64] \n\t"
+
+ "extp %[step1_10], $ac0, 31 \n\t"
+ "extp %[step1_13], $ac1, 31 \n\t"
+ "extp %[step1_11], $ac2, 31 \n\t"
+ "extp %[step1_12], $ac3, 31 \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6),
+ [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
+ [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
+ [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step1_8 = step2_8 + step2_11;
+ step1_9 = step2_9 + step2_10;
+ step1_14 = step2_13 + step2_14;
+ step1_15 = step2_12 + step2_15;
+
+ __asm__ __volatile__ (
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_0], %[step1_7] \n\t"
+ "add %[load5], %[load5], %[step1_15] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_1], %[step1_6] \n\t"
+ "add %[load6], %[load6], %[step1_14] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_2], %[step1_5] \n\t"
+ "add %[load5], %[load5], %[step1_13] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_3], %[step1_4] \n\t"
+ "add %[load6], %[load6], %[step1_12] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_3], %[step1_4] \n\t"
+ "add %[load5], %[load5], %[step1_11] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_2], %[step1_5] \n\t"
+ "add %[load6], %[load6], %[step1_10] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "sub %[load5], %[step1_1], %[step1_6] \n\t"
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[load5], %[step1_9] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_0], %[step1_7] \n\t"
+ "add %[load6], %[load6], %[step1_8] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_0], %[step1_7] \n\t"
+ "sub %[load5], %[load5], %[step1_8] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_1], %[step1_6] \n\t"
+ "sub %[load6], %[load6], %[step1_9] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "sub %[load5], %[step1_2], %[step1_5] \n\t"
+ "sub %[load5], %[load5], %[step1_10] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "sub %[load6], %[step1_3], %[step1_4] \n\t"
+ "sub %[load6], %[load6], %[step1_11] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_3], %[step1_4] \n\t"
+ "sub %[load5], %[load5], %[step1_12] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_2], %[step1_5] \n\t"
+ "sub %[load6], %[load6], %[step1_13] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[load7], 0(%[dest_pix]) \n\t"
+ "add %[load5], %[step1_1], %[step1_6] \n\t"
+ "sub %[load5], %[load5], %[step1_14] \n\t"
+ "addi %[load5], %[load5], 32 \n\t"
+ "sra %[load5], %[load5], 6 \n\t"
+ "add %[load7], %[load7], %[load5] \n\t"
+ "lbux %[load5], %[load7](%[cm]) \n\t"
+ "add %[load6], %[step1_0], %[step1_7] \n\t"
+ "sub %[load6], %[load6], %[step1_15] \n\t"
+ "sb %[load5], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[load8], 0(%[dest_pix]) \n\t"
+ "addi %[load6], %[load6], 32 \n\t"
+ "sra %[load6], %[load6], 6 \n\t"
+ "add %[load8], %[load8], %[load6] \n\t"
+ "lbux %[load6], %[load8](%[cm]) \n\t"
+ "sb %[load6], 0(%[dest_pix]) \n\t"
+
+ : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
+ [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+ [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
+ );
+
+ input += 16;
+ }
+}
+
+void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ uint32_t pos = 45;
+
+  /* bit position for extraction from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
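+  /* pos = 45 configures the DSPControl pos field so that extp in the
+     helpers below extracts bits 45..14 of the 64-bit accumulator, i.e.
+     a 32-bit result already shifted right by DCT_CONST_BITS (14). */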
+
+ // First transform rows
+ idct16_1d_rows_dspr2(input, out, 16);
+
+ // Then transform columns and add to dest
+ idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+static void iadst16_1d(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
+
+ int x0 = input[15];
+ int x1 = input[0];
+ int x2 = input[13];
+ int x3 = input[2];
+ int x4 = input[11];
+ int x5 = input[4];
+ int x6 = input[9];
+ int x7 = input[6];
+ int x8 = input[7];
+ int x9 = input[8];
+ int x10 = input[5];
+ int x11 = input[10];
+ int x12 = input[3];
+ int x13 = input[12];
+ int x14 = input[1];
+ int x15 = input[14];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+ | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = output[8]
+ = output[9] = output[10] = output[11] = output[12]
+ = output[13] = output[14] = output[15] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
+
+ x0 = dct_const_round_shift(s0 + s8);
+ x1 = dct_const_round_shift(s1 + s9);
+ x2 = dct_const_round_shift(s2 + s10);
+ x3 = dct_const_round_shift(s3 + s11);
+ x4 = dct_const_round_shift(s4 + s12);
+ x5 = dct_const_round_shift(s5 + s13);
+ x6 = dct_const_round_shift(s6 + s14);
+ x7 = dct_const_round_shift(s7 + s15);
+ x8 = dct_const_round_shift(s0 - s8);
+ x9 = dct_const_round_shift(s1 - s9);
+ x10 = dct_const_round_shift(s2 - s10);
+ x11 = dct_const_round_shift(s3 - s11);
+ x12 = dct_const_round_shift(s4 - s12);
+ x13 = dct_const_round_shift(s5 - s13);
+ x14 = dct_const_round_shift(s6 - s14);
+ x15 = dct_const_round_shift(s7 - s15);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = s0 + s4;
+ x1 = s1 + s5;
+ x2 = s2 + s6;
+ x3 = s3 + s7;
+ x4 = s0 - s4;
+ x5 = s1 - s5;
+ x6 = s2 - s6;
+ x7 = s3 - s7;
+ x8 = dct_const_round_shift(s8 + s12);
+ x9 = dct_const_round_shift(s9 + s13);
+ x10 = dct_const_round_shift(s10 + s14);
+ x11 = dct_const_round_shift(s11 + s15);
+ x12 = dct_const_round_shift(s8 - s12);
+ x13 = dct_const_round_shift(s9 - s13);
+ x14 = dct_const_round_shift(s10 - s14);
+ x15 = dct_const_round_shift(s11 - s15);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = dct_const_round_shift(s4 + s6);
+ x5 = dct_const_round_shift(s5 + s7);
+ x6 = dct_const_round_shift(s4 - s6);
+ x7 = dct_const_round_shift(s5 - s7);
+ x8 = s8 + s10;
+ x9 = s9 + s11;
+ x10 = s8 - s10;
+ x11 = s9 - s11;
+ x12 = dct_const_round_shift(s12 + s14);
+ x13 = dct_const_round_shift(s13 + s15);
+ x14 = dct_const_round_shift(s12 - s14);
+ x15 = dct_const_round_shift(s13 - s15);
+
+ // stage 4
+ s2 = (- cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (- x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (- x10 + x11);
+ s14 = (- cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
+
+ x2 = dct_const_round_shift(s2);
+ x3 = dct_const_round_shift(s3);
+ x6 = dct_const_round_shift(s6);
+ x7 = dct_const_round_shift(s7);
+ x10 = dct_const_round_shift(s10);
+ x11 = dct_const_round_shift(s11);
+ x14 = dct_const_round_shift(s14);
+ x15 = dct_const_round_shift(s15);
+
+ output[0] = x0;
+ output[1] = -x8;
+ output[2] = x12;
+ output[3] = -x4;
+ output[4] = x6;
+ output[5] = x14;
+ output[6] = x10;
+ output[7] = x2;
+ output[8] = x3;
+ output[9] = x11;
+ output[10] = x15;
+ output[11] = x7;
+ output[12] = x5;
+ output[13] = -x13;
+ output[14] = x9;
+ output[15] = -x1;
+}
+
+void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
+ int pitch, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ int16_t temp_out[16];
+ uint32_t pos = 45;
+
+  /* bit position for extraction from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct16_1d_rows_dspr2(input, outptr, 16);
+ idct16_1d_cols_add_blk_dspr2(out, dest, pitch);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct16_1d_rows_dspr2(input, outptr, 16);
+
+ outptr = out;
+
+ for (i = 0; i < 16; ++i) {
+ iadst16_1d(outptr, temp_out);
+
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * pitch + i]);
+ outptr += 16;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ {
+ int16_t temp_in[16 * 16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ for (i = 0; i < 16; ++i)
+ for (j = 0; j < 16; ++j)
+ temp_in[j * 16 + i] = out[i * 16 + j];
+
+ idct16_1d_cols_add_blk_dspr2(temp_in, dest, pitch);
+ }
+ break;
+ case ADST_ADST: // ADST in both directions
+ {
+ int16_t temp_in[16];
+
+ for (i = 0; i < 16; ++i) {
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 16));
+
+ iadst16_1d(input, outptr);
+ input += 16;
+ outptr += 16;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j)
+ temp_in[j] = out[j * 16 + i];
+ iadst16_1d(temp_in, temp_out);
+ for (j = 0; j < 16; ++j)
+ dest[j * pitch + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * pitch + i]);
+ }
+ }
+ break;
+ default:
+ printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+
+void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+  /* bit position for extraction from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+  // First transform rows. Since all non-zero DCT coefficients are in the
+  // upper-left 4x4 area, only the first 4 rows need to be calculated here.
+ idct16_1d_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
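+  // Zero out the remaining columns 4..15 of the 16x16 intermediate
+  // array: each sw clears two int16_t values, each iteration covers two
+  // columns over all 16 rows, and six iterations cover twelve columns.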
+ for (i = 0; i < 6; ++i) {
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 128(%[outptr]) \n\t"
+ "sw $zero, 160(%[outptr]) \n\t"
+ "sw $zero, 192(%[outptr]) \n\t"
+ "sw $zero, 224(%[outptr]) \n\t"
+ "sw $zero, 256(%[outptr]) \n\t"
+ "sw $zero, 288(%[outptr]) \n\t"
+ "sw $zero, 320(%[outptr]) \n\t"
+ "sw $zero, 352(%[outptr]) \n\t"
+ "sw $zero, 384(%[outptr]) \n\t"
+ "sw $zero, 416(%[outptr]) \n\t"
+ "sw $zero, 448(%[outptr]) \n\t"
+ "sw $zero, 480(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ outptr += 2;
+ }
+
+ // Then transform columns
+ idct16_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+
+  /* bit position for extraction from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
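+  /* a1 is the single DC value applied to every pixel of the 16x16 block;
+     replv.qb replicates |a1| into all four byte lanes so that each
+     addu_s.qb/subu_s.qb below updates four pixels with per-byte
+     saturation. */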
+
+ if (a1 < 0) {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+    /* use quad-byte operations;
+     * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 16; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
new file mode 100644
index 000000000..5e92db3d2
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans32_cols_dspr2.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_idct32_1d_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19;
+ int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26;
+ int16_t step1_27, step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27;
+ int16_t step3_28, step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int i, temp21;
+ uint8_t *dest_pix, *dest_pix1;
+ const int const_2_power_13 = 8192;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 32; ++i) {
+ dest_pix = dest + i;
+ dest_pix1 = dest + i + 31 * dest_stride;
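+    /* dest_pix walks column i downward from the top row; dest_pix1 is
+       initialized to the bottom row (row 31) so the symmetric halves of
+       the 32-point output can be written from both ends of the column. */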
+
+ __asm__ __volatile__ (
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+ [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+ [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+ [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+ [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+ [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+ [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
+ __asm__ __volatile__ (
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+ [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+ [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
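+    // Inputs 2, 30, 18, 14 -> step2_8, step2_9, step2_14, step2_15.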
+ __asm__ __volatile__ (
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+ [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
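+    // Inputs 10, 22, 26, 6 -> step2_10, step2_11, step2_12, step2_13.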
+ __asm__ __volatile__ (
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2), [load3] "=&r" (load3),
+ [load4] "=&r" (load4), [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
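+    // Fold step2_8..step2_15 into step3_8..step3_15; the four cospi_16_64 products run through $ac0..$ac3.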
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+ [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+ [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+ [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_8] "r" (step2_8),
+ [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
+ [step2_11] "r" (step2_11), [step2_12] "r" (step2_12),
+ [step2_13] "r" (step2_13), [step2_14] "r" (step2_14),
+ [step2_15] "r" (step2_15), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r" (step3_18)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r" (step3_19)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r" (step3_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r" (step3_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
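+    // Even part: inputs 0, 16, 8, 24 -> step1_0..step1_3.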
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
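+    // Inputs 4, 28, 20, 12 -> step1_4..step1_7.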
+ __asm__ __volatile__ (
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
+ // stage 7
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_20] "r" (step2_20),
+ [step2_27] "r" (step2_27), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_26] "r" (step2_26),
+ [step2_21] "r" (step2_21), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_25] "r" (step2_25),
+ [step2_22] "r" (step2_22), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+ : [const_2_power_13] "r" (const_2_power_13), [step2_24] "r" (step2_24),
+ [step2_23] "r" (step2_23), [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
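+    // Reconstruction: round each sum (add 32, shift right by 6), add it to the predictor byte, clamp through cm[] and store, stepping down the column by dest_stride.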
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_0], %[step2_31] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_1], %[step2_30] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_2], %[step2_29] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_3], %[step2_28] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
+ [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
+ [step2_28] "r" (step2_28), [step2_29] "r" (step2_29),
+ [step2_30] "r" (step2_30), [step2_31] "r" (step2_31)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_4], %[step1_27] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_5], %[step1_26] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_6], %[step1_25] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_7], %[step1_24] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
+ [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
+ [step1_24] "r" (step1_24), [step1_25] "r" (step1_25),
+ [step1_26] "r" (step1_26), [step1_27] "r" (step1_27)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_8], %[step1_23] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_9], %[step1_22] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_10], %[step1_21] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_11], %[step1_20] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
+ [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
+ [step1_20] "r" (step1_20), [step1_21] "r" (step1_21),
+ [step1_22] "r" (step1_22), [step1_23] "r" (step1_23)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_12], %[step2_19] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_13], %[step2_18] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix]) \n\t"
+ "add %[temp0], %[step1_14], %[step2_17] \n\t"
+ "addi %[temp0], %[temp0], 32 \n\t"
+ "sra %[temp0], %[temp0], 6 \n\t"
+ "add %[temp2], %[temp2], %[temp0] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "add %[temp1], %[step1_15], %[step2_16] \n\t"
+ "sb %[temp0], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix]) \n\t"
+ "addi %[temp1], %[temp1], 32 \n\t"
+ "sra %[temp1], %[temp1], 6 \n\t"
+ "add %[temp3], %[temp3], %[temp1] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix]) \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix] "+r" (dest_pix)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
+ [step1_14] "r" (step1_14), [step1_15] "r" (step1_15),
+ [step2_16] "r" (step2_16), [step2_17] "r" (step2_17),
+ [step2_18] "r" (step2_18), [step2_19] "r" (step2_19)
+ );
+
+ step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
+ step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
+ step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
+ step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
+
+ __asm__ __volatile__ (
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_15] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_14] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+
+ "lbu %[temp2], 0(%[dest_pix1]) \n\t"
+ "add %[temp2], %[temp2], %[step3_13] \n\t"
+ "lbux %[temp0], %[temp2](%[cm]) \n\t"
+ "sb %[temp0], 0(%[dest_pix1]) \n\t"
+ "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t"
+ "lbu %[temp3], 0(%[dest_pix1]) \n\t"
+ "add %[temp3], %[temp3], %[step3_12] \n\t"
+ "lbux %[temp1], %[temp3](%[cm]) \n\t"
+ "sb %[temp1], 0(%[dest_pix1]) \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1), [temp2] "=&r" (temp2),
+ [temp3] "=&r" (temp3), [dest_pix1] "+r" (dest_pix1)
+ : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
+ [step3_12] "r" (step3_12), [step3_13] "r" (step3_13),
+ [step3_14] "r" (step3_14), [step3_15] "r" (step3_15)
+ );
+
+ input += 32;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
new file mode 100644
index 000000000..d3aee73cb
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -0,0 +1,1013 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+ int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
+ int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
+ int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
+ int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
+ int16_t step1_28, step1_29, step1_30, step1_31;
+ int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
+ int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
+ int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
+ int16_t step2_28, step2_29, step2_30, step2_31;
+ int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
+ int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
+ int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
+ int16_t step3_29, step3_30, step3_31;
+ int temp0, temp1, temp2, temp3;
+ int load1, load2, load3, load4;
+ int result1, result2;
+ int temp21;
+ int i;
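+  // 8192 = 2^13 = DCT_CONST_ROUNDING, pre-loaded into the accumulators as the rounding term.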
+ const int const_2_power_13 = 8192;
+ const int32_t *input_int;
+
+ for (i = 32; i--; ) {
+ input_int = (const int32_t *)input;
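+    // Fast path: if all 32 coefficients in this row are zero (checked as 16 32-bit words), zero the 32 transposed outputs and move to the next row.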
+
+ if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
+ input_int[4] | input_int[5] | input_int[6] | input_int[7] |
+ input_int[8] | input_int[9] | input_int[10] | input_int[11] |
+ input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
+ input += 32;
+
+ __asm__ __volatile__ (
+ "sh $zero, 0(%[output]) \n\t"
+ "sh $zero, 64(%[output]) \n\t"
+ "sh $zero, 128(%[output]) \n\t"
+ "sh $zero, 192(%[output]) \n\t"
+ "sh $zero, 256(%[output]) \n\t"
+ "sh $zero, 320(%[output]) \n\t"
+ "sh $zero, 384(%[output]) \n\t"
+ "sh $zero, 448(%[output]) \n\t"
+ "sh $zero, 512(%[output]) \n\t"
+ "sh $zero, 576(%[output]) \n\t"
+ "sh $zero, 640(%[output]) \n\t"
+ "sh $zero, 704(%[output]) \n\t"
+ "sh $zero, 768(%[output]) \n\t"
+ "sh $zero, 832(%[output]) \n\t"
+ "sh $zero, 896(%[output]) \n\t"
+ "sh $zero, 960(%[output]) \n\t"
+ "sh $zero, 1024(%[output]) \n\t"
+ "sh $zero, 1088(%[output]) \n\t"
+ "sh $zero, 1152(%[output]) \n\t"
+ "sh $zero, 1216(%[output]) \n\t"
+ "sh $zero, 1280(%[output]) \n\t"
+ "sh $zero, 1344(%[output]) \n\t"
+ "sh $zero, 1408(%[output]) \n\t"
+ "sh $zero, 1472(%[output]) \n\t"
+ "sh $zero, 1536(%[output]) \n\t"
+ "sh $zero, 1600(%[output]) \n\t"
+ "sh $zero, 1664(%[output]) \n\t"
+ "sh $zero, 1728(%[output]) \n\t"
+ "sh $zero, 1792(%[output]) \n\t"
+ "sh $zero, 1856(%[output]) \n\t"
+ "sh $zero, 1920(%[output]) \n\t"
+ "sh $zero, 1984(%[output]) \n\t"
+
+ :
+ : [output] "r" (output)
+ );
+
+ output += 1;
+
+ continue;
+ }
+
+ /* prefetch row */
+ vp9_prefetch_load((const uint8_t *)(input + 32));
+ vp9_prefetch_load((const uint8_t *)(input + 48));
+
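+    // Coefficients 1, 31, 17, 15 -> step1_16, step1_17, step1_30, step1_31.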
+ __asm__ __volatile__ (
+ "lh %[load1], 2(%[input]) \n\t"
+ "lh %[load2], 62(%[input]) \n\t"
+ "lh %[load3], 34(%[input]) \n\t"
+ "lh %[load4], 30(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_31_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_1_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_1_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_31_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_15_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_17_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_17_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_15_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_17], $ac1, 31 \n\t"
+ "extp %[step1_30], $ac3, 31 \n\t"
+ "add %[step1_16], %[temp0], %[temp1] \n\t"
+ "add %[step1_31], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_16] "=r" (step1_16), [step1_17] "=r" (step1_17),
+ [step1_30] "=r" (step1_30), [step1_31] "=r" (step1_31)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_31_64] "r" (cospi_31_64), [cospi_1_64] "r" (cospi_1_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_17_64] "r" (cospi_17_64),
+ [cospi_15_64] "r" (cospi_15_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
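+    // Coefficients 9, 23, 25, 7 -> step1_18, step1_19, step1_28, step1_29.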
+ __asm__ __volatile__ (
+ "lh %[load1], 18(%[input]) \n\t"
+ "lh %[load2], 46(%[input]) \n\t"
+ "lh %[load3], 50(%[input]) \n\t"
+ "lh %[load4], 14(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_23_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_9_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_9_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_23_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_7_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_25_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_25_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_7_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+
+ "extp %[step1_18], $ac1, 31 \n\t"
+ "extp %[step1_29], $ac3, 31 \n\t"
+ "add %[step1_19], %[temp0], %[temp1] \n\t"
+ "add %[step1_28], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_18] "=r" (step1_18), [step1_19] "=r" (step1_19),
+ [step1_28] "=r" (step1_28), [step1_29] "=r" (step1_29)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_23_64] "r" (cospi_23_64), [cospi_9_64] "r" (cospi_9_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_7_64] "r" (cospi_7_64),
+ [cospi_25_64] "r" (cospi_25_64), [cospi_28_64] "r" (cospi_28_64)
+ );
+
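+    // Coefficients 5, 27, 21, 11 -> step1_20, step1_21, step1_26, step1_27.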
+ __asm__ __volatile__ (
+ "lh %[load1], 10(%[input]) \n\t"
+ "lh %[load2], 54(%[input]) \n\t"
+ "lh %[load3], 42(%[input]) \n\t"
+ "lh %[load4], 22(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_27_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_5_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_5_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_27_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_11_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_21_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_21_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_11_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "madd $ac1, %[load2], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_12_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_20_64] \n\t"
+
+ "extp %[step1_21], $ac1, 31 \n\t"
+ "extp %[step1_26], $ac3, 31 \n\t"
+ "add %[step1_20], %[temp0], %[temp1] \n\t"
+ "add %[step1_27], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_20] "=r" (step1_20), [step1_21] "=r" (step1_21),
+ [step1_26] "=r" (step1_26), [step1_27] "=r" (step1_27)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_27_64] "r" (cospi_27_64), [cospi_5_64] "r" (cospi_5_64),
+ [cospi_11_64] "r" (cospi_11_64), [cospi_21_64] "r" (cospi_21_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
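+    // Coefficients 13, 19, 29, 3 -> step1_22, step1_23, step1_24, step1_25.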
+ __asm__ __volatile__ (
+ "lh %[load1], 26(%[input]) \n\t"
+ "lh %[load2], 38(%[input]) \n\t"
+ "lh %[load3], 58(%[input]) \n\t"
+ "lh %[load4], 6(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_19_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_13_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_13_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_19_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_3_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_29_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_29_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_3_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_12_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_20_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_20_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_12_64] \n\t"
+
+ "extp %[step1_22], $ac1, 31 \n\t"
+ "extp %[step1_25], $ac3, 31 \n\t"
+ "add %[step1_23], %[temp0], %[temp1] \n\t"
+ "add %[step1_24], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_22] "=r" (step1_22), [step1_23] "=r" (step1_23),
+ [step1_24] "=r" (step1_24), [step1_25] "=r" (step1_25)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_19_64] "r" (cospi_19_64), [cospi_13_64] "r" (cospi_13_64),
+ [cospi_3_64] "r" (cospi_3_64), [cospi_29_64] "r" (cospi_29_64),
+ [cospi_12_64] "r" (cospi_12_64), [cospi_20_64] "r" (cospi_20_64)
+ );
+
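+    // Coefficients 2, 30, 18, 14 -> step2_8, step2_9, step2_14, step2_15.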
+ __asm__ __volatile__ (
+ "lh %[load1], 4(%[input]) \n\t"
+ "lh %[load2], 60(%[input]) \n\t"
+ "lh %[load3], 36(%[input]) \n\t"
+ "lh %[load4], 28(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_30_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_2_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_2_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_30_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_14_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_18_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_18_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_14_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[temp3], %[temp2] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load2], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load1], %[cospi_24_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_8_64] \n\t"
+
+ "extp %[step2_9], $ac1, 31 \n\t"
+ "extp %[step2_14], $ac3, 31 \n\t"
+ "add %[step2_8], %[temp0], %[temp1] \n\t"
+ "add %[step2_15], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_8] "=r" (step2_8), [step2_9] "=r" (step2_9),
+ [step2_14] "=r" (step2_14), [step2_15] "=r" (step2_15)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
+ [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
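+    // Coefficients 10, 22, 26, 6 -> step2_10, step2_11, step2_12, step2_13.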
+ __asm__ __volatile__ (
+ "lh %[load1], 20(%[input]) \n\t"
+ "lh %[load2], 44(%[input]) \n\t"
+ "lh %[load3], 52(%[input]) \n\t"
+ "lh %[load4], 12(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_22_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_10_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_10_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_22_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_6_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_26_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_26_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_6_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp1], %[temp0] \n\t"
+ "sub %[load2], %[temp2], %[temp3] \n\t"
+
+ "msub $ac1, %[load1], %[cospi_24_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_8_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load1], %[cospi_8_64] \n\t"
+
+ "extp %[step2_10], $ac1, 31 \n\t"
+ "extp %[step2_13], $ac3, 31 \n\t"
+ "add %[step2_11], %[temp0], %[temp1] \n\t"
+ "add %[step2_12], %[temp2], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
+ [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
+ [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_24_64] "r" (cospi_24_64)
+ );
+
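+    // Fold step2_8..step2_15 into step3_8..step3_15; the four cospi_16_64 products run through $ac0..$ac3.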
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "sub %[temp0], %[step2_14], %[step2_13] \n\t"
+ "sub %[temp0], %[temp0], %[step2_9] \n\t"
+ "add %[temp0], %[temp0], %[step2_10] \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "sub %[temp1], %[step2_14], %[step2_13] \n\t"
+ "add %[temp1], %[temp1], %[step2_9] \n\t"
+ "sub %[temp1], %[temp1], %[step2_10] \n\t"
+ "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "sub %[temp0], %[step2_15], %[step2_12] \n\t"
+ "sub %[temp0], %[temp0], %[step2_8] \n\t"
+ "add %[temp0], %[temp0], %[step2_11] \n\t"
+ "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "sub %[temp1], %[step2_15], %[step2_12] \n\t"
+ "add %[temp1], %[temp1], %[step2_8] \n\t"
+ "sub %[temp1], %[temp1], %[step2_11] \n\t"
+ "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
+
+ "add %[step3_8], %[step2_8], %[step2_11] \n\t"
+ "add %[step3_9], %[step2_9], %[step2_10] \n\t"
+ "add %[step3_14], %[step2_13], %[step2_14] \n\t"
+ "add %[step3_15], %[step2_12], %[step2_15] \n\t"
+
+ "extp %[step3_10], $ac0, 31 \n\t"
+ "extp %[step3_13], $ac1, 31 \n\t"
+ "extp %[step3_11], $ac2, 31 \n\t"
+ "extp %[step3_12], $ac3, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [step3_8] "=r" (step3_8), [step3_9] "=r" (step3_9),
+ [step3_10] "=r" (step3_10), [step3_11] "=r" (step3_11),
+ [step3_12] "=r" (step3_12), [step3_13] "=r" (step3_13),
+ [step3_14] "=r" (step3_14), [step3_15] "=r" (step3_15)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
+ [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
+ [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
+ [step2_14] "r" (step2_14), [step2_15] "r" (step2_15),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_18 = step1_17 - step1_18;
+ step2_29 = step1_30 - step1_29;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_18], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_29], %[cospi_24_64] \n\t"
+ "extp %[step3_18], $ac0, 31 \n\t"
+
+ : [step3_18] "=r" (step3_18)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_18] "r" (step2_18), [step2_29] "r" (step2_29),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64;
+ step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_19 = step1_16 - step1_19;
+ step2_28 = step1_31 - step1_28;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_19], %[cospi_8_64] \n\t"
+ "madd $ac0, %[step2_28], %[cospi_24_64] \n\t"
+ "extp %[step3_19], $ac0, 31 \n\t"
+
+ : [step3_19] "=r" (step3_19)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_19] "r" (step2_19), [step2_28] "r" (step2_28),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64;
+ step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_16 = step1_16 + step1_19;
+ step3_17 = step1_17 + step1_18;
+ step3_30 = step1_29 + step1_30;
+ step3_31 = step1_28 + step1_31;
+
+ step2_20 = step1_23 - step1_20;
+ step2_27 = step1_24 - step1_27;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "msub $ac0, %[step2_20], %[cospi_24_64] \n\t"
+ "msub $ac0, %[step2_27], %[cospi_8_64] \n\t"
+ "extp %[step3_20], $ac0, 31 \n\t"
+
+ : [step3_20] "=r" (step3_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64;
+ step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step2_21 = step1_22 - step1_21;
+ step2_26 = step1_25 - step1_26;
+
+ __asm__ __volatile__ (
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "msub $ac1, %[step2_21], %[cospi_24_64] \n\t"
+ "msub $ac1, %[step2_26], %[cospi_8_64] \n\t"
+ "extp %[step3_21], $ac1, 31 \n\t"
+
+ : [step3_21] "=r" (step3_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_21] "r" (step2_21), [step2_26] "r" (step2_26),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+ );
+
+ temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64;
+ step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ step3_22 = step1_21 + step1_22;
+ step3_23 = step1_20 + step1_23;
+ step3_24 = step1_24 + step1_27;
+ step3_25 = step1_25 + step1_26;
+
+ step2_16 = step3_16 + step3_23;
+ step2_17 = step3_17 + step3_22;
+ step2_18 = step3_18 + step3_21;
+ step2_19 = step3_19 + step3_20;
+ step2_20 = step3_19 - step3_20;
+ step2_21 = step3_18 - step3_21;
+ step2_22 = step3_17 - step3_22;
+ step2_23 = step3_16 - step3_23;
+
+ step2_24 = step3_31 - step3_24;
+ step2_25 = step3_30 - step3_25;
+ step2_26 = step3_29 - step3_26;
+ step2_27 = step3_28 - step3_27;
+ step2_28 = step3_28 + step3_27;
+ step2_29 = step3_29 + step3_26;
+ step2_30 = step3_30 + step3_25;
+ step2_31 = step3_31 + step3_24;
+
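+    // Even part: coefficients 0, 16, 8, 24 -> step1_0..step1_3.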
+ __asm__ __volatile__ (
+ "lh %[load1], 0(%[input]) \n\t"
+ "lh %[load2], 32(%[input]) \n\t"
+ "lh %[load3], 16(%[input]) \n\t"
+ "lh %[load4], 48(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+ "add %[result1], %[load1], %[load2] \n\t"
+ "sub %[result2], %[load1], %[load2] \n\t"
+ "madd $ac1, %[result1], %[cospi_16_64] \n\t"
+ "madd $ac2, %[result2], %[cospi_16_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+ "madd $ac3, %[load3], %[cospi_24_64] \n\t"
+ "msub $ac3, %[load4], %[cospi_8_64] \n\t"
+ "extp %[temp2], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "madd $ac1, %[load3], %[cospi_8_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_24_64] \n\t"
+ "extp %[temp3], $ac1, 31 \n\t"
+
+ "add %[step1_0], %[temp0], %[temp3] \n\t"
+ "add %[step1_1], %[temp1], %[temp2] \n\t"
+ "sub %[step1_2], %[temp1], %[temp2] \n\t"
+ "sub %[step1_3], %[temp0], %[temp3] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [result1] "=&r" (result1), [result2] "=&r" (result2),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
+ [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
+
+ );
+
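+    // Coefficients 4, 28, 20, 12 -> step1_4..step1_7.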
+ __asm__ __volatile__ (
+ "lh %[load1], 8(%[input]) \n\t"
+ "lh %[load2], 56(%[input]) \n\t"
+ "lh %[load3], 40(%[input]) \n\t"
+ "lh %[load4], 24(%[input]) \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "madd $ac1, %[load1], %[cospi_28_64] \n\t"
+ "msub $ac1, %[load2], %[cospi_4_64] \n\t"
+ "extp %[temp0], $ac1, 31 \n\t"
+
+ "madd $ac3, %[load1], %[cospi_4_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_28_64] \n\t"
+ "extp %[temp3], $ac3, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac2 \n\t"
+ "mthi $zero, $ac2 \n\t"
+
+ "madd $ac2, %[load3], %[cospi_12_64] \n\t"
+ "msub $ac2, %[load4], %[cospi_20_64] \n\t"
+ "extp %[temp1], $ac2, 31 \n\t"
+
+ "madd $ac1, %[load3], %[cospi_20_64] \n\t"
+ "madd $ac1, %[load4], %[cospi_12_64] \n\t"
+ "extp %[temp2], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "mtlo %[const_2_power_13], $ac3 \n\t"
+ "mthi $zero, $ac3 \n\t"
+
+ "sub %[load1], %[temp3], %[temp2] \n\t"
+ "sub %[load1], %[load1], %[temp0] \n\t"
+ "add %[load1], %[load1], %[temp1] \n\t"
+
+ "sub %[load2], %[temp0], %[temp1] \n\t"
+ "sub %[load2], %[load2], %[temp2] \n\t"
+ "add %[load2], %[load2], %[temp3] \n\t"
+
+ "madd $ac1, %[load1], %[cospi_16_64] \n\t"
+ "madd $ac3, %[load2], %[cospi_16_64] \n\t"
+
+ "extp %[step1_5], $ac1, 31 \n\t"
+ "extp %[step1_6], $ac3, 31 \n\t"
+ "add %[step1_4], %[temp0], %[temp1] \n\t"
+ "add %[step1_7], %[temp3], %[temp2] \n\t"
+
+ : [load1] "=&r" (load1), [load2] "=&r" (load2),
+ [load3] "=&r" (load3), [load4] "=&r" (load4),
+ [temp0] "=&r" (temp0), [temp1] "=&r" (temp1),
+ [temp2] "=&r" (temp2), [temp3] "=&r" (temp3),
+ [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
+ [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
+ : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ step2_0 = step1_0 + step1_7;
+ step2_1 = step1_1 + step1_6;
+ step2_2 = step1_2 + step1_5;
+ step2_3 = step1_3 + step1_4;
+ step2_4 = step1_3 - step1_4;
+ step2_5 = step1_2 - step1_5;
+ step2_6 = step1_1 - step1_6;
+ step2_7 = step1_0 - step1_7;
+
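+    // stage 7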
+ step1_0 = step2_0 + step3_15;
+ step1_1 = step2_1 + step3_14;
+ step1_2 = step2_2 + step3_13;
+ step1_3 = step2_3 + step3_12;
+ step1_4 = step2_4 + step3_11;
+ step1_5 = step2_5 + step3_10;
+ step1_6 = step2_6 + step3_9;
+ step1_7 = step2_7 + step3_8;
+ step1_8 = step2_7 - step3_8;
+ step1_9 = step2_6 - step3_9;
+ step1_10 = step2_5 - step3_10;
+ step1_11 = step2_4 - step3_11;
+ step1_12 = step2_3 - step3_12;
+ step1_13 = step2_2 - step3_13;
+ step1_14 = step2_1 - step3_14;
+ step1_15 = step2_0 - step3_15;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_27], %[step2_20] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_20], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_20] "=r" (step1_20)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_20] "r" (step2_20), [step2_27] "r" (step2_27),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_20 + step2_27) * cospi_16_64;
+ step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_26], %[step2_21] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_21], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_21] "=r" (step1_21)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_26] "r" (step2_26), [step2_21] "r" (step2_21),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_21 + step2_26) * cospi_16_64;
+ step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_25], %[step2_22] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_22], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_22] "=r" (step1_22)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_25] "r" (step2_25), [step2_22] "r" (step2_22),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_22 + step2_25) * cospi_16_64;
+ step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ __asm__ __volatile__ (
+ "sub %[temp0], %[step2_24], %[step2_23] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
+ "extp %[step1_23], $ac0, 31 \n\t"
+
+ : [temp0] "=&r" (temp0), [step1_23] "=r" (step1_23)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [step2_24] "r" (step2_24), [step2_23] "r" (step2_23),
+ [cospi_16_64] "r" (cospi_16_64)
+ );
+
+ temp21 = (step2_23 + step2_24) * cospi_16_64;
+ step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+
+ // final stage
+ output[0 * 32] = step1_0 + step2_31;
+ output[1 * 32] = step1_1 + step2_30;
+ output[2 * 32] = step1_2 + step2_29;
+ output[3 * 32] = step1_3 + step2_28;
+ output[4 * 32] = step1_4 + step1_27;
+ output[5 * 32] = step1_5 + step1_26;
+ output[6 * 32] = step1_6 + step1_25;
+ output[7 * 32] = step1_7 + step1_24;
+ output[8 * 32] = step1_8 + step1_23;
+ output[9 * 32] = step1_9 + step1_22;
+ output[10 * 32] = step1_10 + step1_21;
+ output[11 * 32] = step1_11 + step1_20;
+ output[12 * 32] = step1_12 + step2_19;
+ output[13 * 32] = step1_13 + step2_18;
+ output[14 * 32] = step1_14 + step2_17;
+ output[15 * 32] = step1_15 + step2_16;
+ output[16 * 32] = step1_15 - step2_16;
+ output[17 * 32] = step1_14 - step2_17;
+ output[18 * 32] = step1_13 - step2_18;
+ output[19 * 32] = step1_12 - step2_19;
+ output[20 * 32] = step1_11 - step1_20;
+ output[21 * 32] = step1_10 - step1_21;
+ output[22 * 32] = step1_9 - step1_22;
+ output[23 * 32] = step1_8 - step1_23;
+ output[24 * 32] = step1_7 - step1_24;
+ output[25 * 32] = step1_6 - step1_25;
+ output[26 * 32] = step1_5 - step1_26;
+ output[27 * 32] = step1_4 - step1_27;
+ output[28 * 32] = step1_3 - step2_28;
+ output[29 * 32] = step1_2 - step2_29;
+ output[30 * 32] = step1_1 - step2_30;
+ output[31 * 32] = step1_0 - step2_31;
+
+ input += 32;
+ output += 1;
+ }
+}
+
+void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
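+ /* With the DSPControl pos field set to 45, each "extp ..., 31" in the
+  * transform helpers is expected to extract accumulator bits 45..14 (a
+  * right shift by DCT_CONST_BITS == 14); combined with the 2^13 rounding
+  * value preloaded via mtlo, this implements dct_const_round_shift(). */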
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ idct32_1d_rows_dspr2(input, outptr);
+
+ // Columns
+ vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
+}
+
+void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ int r, out;
+ int32_t a1, absa1;
+ int32_t vector_a1;
+ int32_t t1, t2, t3, t4;
+ int32_t vector_1, vector_2, vector_3, vector_4;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
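+ /* DC-only path: per its name, the macro below applies
+  * dct_const_round_shift(x * cospi_16_64) twice to input[0]; the asm that
+  * follows then computes a1 = (out + 32) >> 6, the 32x32 output rounding. */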
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 32 \n\t"
+ "sra %[a1], %[out], 6 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
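+ /* replv.qb broadcasts |a1| into all four byte lanes of vector_a1, so each
+  * subu_s.qb in the loop below subtracts it from four pixels at a time with
+  * per-byte saturation. */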
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 32; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "lw %[t3], 8(%[dest]) \n\t"
+ "lw %[t4], 12(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "sw %[vector_3], 8(%[dest]) \n\t"
+ "sw %[vector_4], 12(%[dest]) \n\t"
+
+ "lw %[t1], 16(%[dest]) \n\t"
+ "lw %[t2], 20(%[dest]) \n\t"
+ "lw %[t3], 24(%[dest]) \n\t"
+ "lw %[t4], 28(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_3], %[t3], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_4], %[t4], %[vector_a1] \n\t"
+ "sw %[vector_1], 16(%[dest]) \n\t"
+ "sw %[vector_2], 20(%[dest]) \n\t"
+ "sw %[vector_3], 24(%[dest]) \n\t"
+ "sw %[vector_4], 28(%[dest]) \n\t"
+
+ "add %[dest], %[dest], %[stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
+ [dest] "+&r" (dest)
+ : [stride] "r" (stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
new file mode 100644
index 000000000..5b7aa5e71
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void vp9_idct4_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+
+ for (i = 4; i--; ) {
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+
+ "add %[Temp1], %[step_1], %[step_2] \n\t"
+ "sh %[Temp1], 8(%[output]) \n\t"
+
+ "sub %[Temp2], %[step_1], %[step_2] \n\t"
+ "sh %[Temp2], 16(%[output]) \n\t"
+
+ "sub %[Temp3], %[step_0], %[step_3] \n\t"
+ "sh %[Temp3], 24(%[output]) \n\t"
+
+ : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+ [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+ [output] "+r" (output)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input)
+ );
+
+ input += 4;
+ output += 1;
+ }
+}
+
+static void vp9_idct4_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int16_t step_0, step_1, step_2, step_3;
+ int Temp0, Temp1, Temp2, Temp3;
+ const int const_2_power_13 = 8192;
+ int i;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 4; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[2]) * cospi_16_64;
+ step_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[2]) * cospi_16_64;
+ step_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 4(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "extp %[step_0], $ac0, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "extp %[step_1], $ac1, 31 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ /*
+ temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+ step_2 = dct_const_round_shift(temp1);
+ */
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "extp %[step_2], $ac0, 31 \n\t"
+
+ /*
+ temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+ step_3 = dct_const_round_shift(temp2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[step_3], $ac1, 31 \n\t"
+
+ /*
+ output[0] = step_0 + step_3;
+ output[4] = step_1 + step_2;
+ output[8] = step_1 - step_2;
+ output[12] = step_0 - step_3;
+ */
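+ /* Reconstruction: each value is rounded with (x + 8) >> 4 (the 4x4 output
+  * shift), added to the destination pixel, and clipped to [0, 255] via the
+  * vp9_ff_cropTbl lookup (lbux) before being stored. */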
+ "add %[Temp0], %[step_0], %[step_3] \n\t"
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_1], %[step_2] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step_0], %[step_3] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "addi %[Temp0], %[Temp0], 8 \n\t"
+ "sra %[Temp0], %[Temp0], 4 \n\t"
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [step_0] "=&r" (step_0), [step_1] "=&r" (step_1),
+ [step_2] "=&r" (step_2), [step_3] "=&r" (step_3),
+ [dest_pix] "+r" (dest_pix)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_8_64] "r" (cospi_8_64), [cospi_16_64] "r" (cospi_16_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+ );
+
+ input += 4;
+ }
+}
+
+void vp9_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+
+ // Columns
+ vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vp9_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int a1, absa1;
+ int r;
+ int32_t out;
+ int t2, vector_a1, vector_a;
+ uint32_t pos = 45;
+ int16_t input_dc = input[0];
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 8 \n\t"
+ "sra %[a1], %[out], 4 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[dest]) \n\t"
+ "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 4; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t2], 0(%[dest]) \n\t"
+ "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_a], 0(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+
+static void iadst4_1d_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3;
+
+ x0 = input[0];
+ x1 = input[1];
+ x2 = input[2];
+ x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = x0 - x2 + x3;
+
+ x0 = s0 + s3 + s5;
+ x1 = s1 - s4 - s6;
+ x2 = sinpi_3_9 * s7;
+ x3 = s2;
+
+ s0 = x0 + x3;
+ s1 = x1 + x3;
+ s2 = x2;
+ s3 = x0 + x1 - x3;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
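+ // With DCT_CONST_BITS == 14, dct_const_round_shift(x) below is
+ // ROUND_POWER_OF_TWO(x, 14) = (x + (1 << 13)) >> 14, which brings the 29b
+ // intermediates down to the 15b outputs noted above.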
+ output[0] = dct_const_round_shift(s0);
+ output[1] = dct_const_round_shift(s1);
+ output[2] = dct_const_round_shift(s2);
+ output[3] = dct_const_round_shift(s3);
+}
+
+void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
+ int16_t *outptr = out;
+ int16_t temp_in[4 * 4], temp_out[4];
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+ vp9_idct4_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ vp9_idct4_1d_rows_dspr2(input, outptr);
+
+ outptr = out;
+
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(outptr, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * dest_stride + i]);
+
+ outptr += 4;
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j) {
+ temp_in[i * 4 + j] = out[j * 4 + i];
+ }
+ }
+ vp9_idct4_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 4; ++i) {
+ iadst4_1d_dspr2(input, outptr);
+ input += 4;
+ outptr += 4;
+ }
+
+ for (i = 0; i < 4; ++i) {
+ for (j = 0; j < 4; ++j)
+ temp_in[j] = out[j * 4 + i];
+ iadst4_1d_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 4; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ default:
+ printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
new file mode 100644
index 000000000..93a08401d
--- /dev/null
+++ b/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c
@@ -0,0 +1,745 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+static void idct8_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ const int const_2_power_13 = 8192;
+ int Temp0, Temp1, Temp2, Temp3, Temp4;
+ int i;
+
+ for (i = no_rows; i--; ) {
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[Temp4], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[Temp4], %[Temp1] \n\t"
+ "sub %[step1_3], %[Temp4], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp0], 0(%[output]) \n\t"
+ "add %[Temp1], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp1], 16(%[output]) \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp0], 32(%[output]) \n\t"
+ "add %[Temp1], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp1], 48(%[output]) \n\t"
+
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "sh %[Temp0], 64(%[output]) \n\t"
+ "sub %[Temp1], %[step1_2], %[step1_5] \n\t"
+ "sh %[Temp1], 80(%[output]) \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "sh %[Temp0], 96(%[output]) \n\t"
+ "sub %[Temp1], %[step1_0], %[step1_7] \n\t"
+ "sh %[Temp1], 112(%[output]) \n\t"
+
+ : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+ [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+ [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+ [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+ [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [Temp4] "=&r" (Temp4)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [output] "r" (output), [input] "r" (input)
+ );
+
+ input += 8;
+ output += 1;
+ }
+}
+
+static void idct8_1d_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
+ int Temp0, Temp1, Temp2, Temp3;
+ int i;
+ const int const_2_power_13 = 8192;
+ uint8_t *dest_pix;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ /* prefetch vp9_ff_cropTbl */
+ vp9_prefetch_load(vp9_ff_cropTbl);
+ vp9_prefetch_load(vp9_ff_cropTbl + 32);
+ vp9_prefetch_load(vp9_ff_cropTbl + 64);
+ vp9_prefetch_load(vp9_ff_cropTbl + 96);
+ vp9_prefetch_load(vp9_ff_cropTbl + 128);
+ vp9_prefetch_load(vp9_ff_cropTbl + 160);
+ vp9_prefetch_load(vp9_ff_cropTbl + 192);
+ vp9_prefetch_load(vp9_ff_cropTbl + 224);
+
+ for (i = 0; i < 8; ++i) {
+ dest_pix = (dest + i);
+
+ __asm__ __volatile__ (
+ /*
+ temp_1 = (input[0] + input[4]) * cospi_16_64;
+ step2_0 = dct_const_round_shift(temp_1);
+
+ temp_2 = (input[0] - input[4]) * cospi_16_64;
+ step2_1 = dct_const_round_shift(temp_2);
+ */
+ "lh %[Temp0], 0(%[input]) \n\t"
+ "lh %[Temp1], 8(%[input]) \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "add %[Temp2], %[Temp0], %[Temp1] \n\t"
+ "madd $ac0, %[Temp2], %[cospi_16_64] \n\t"
+ "extp %[step1_6], $ac0, 31 \n\t"
+
+ "sub %[Temp3], %[Temp0], %[Temp1] \n\t"
+ "madd $ac1, %[Temp3], %[cospi_16_64] \n\t"
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "extp %[Temp2], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
+ step2_2 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 4(%[input]) \n\t"
+ "lh %[Temp1], 12(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_24_64] \n\t"
+ "msub $ac0, %[Temp1], %[cospi_8_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "extp %[Temp3], $ac0, 31 \n\t"
+
+ /*
+ step1_1 = step2_1 + step2_2;
+ step1_2 = step2_1 - step2_2;
+ */
+ "add %[step1_1], %[Temp2], %[Temp3] \n\t"
+ "sub %[step1_2], %[Temp2], %[Temp3] \n\t"
+
+ /*
+ temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
+ step2_3 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_8_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_24_64] \n\t"
+ "extp %[Temp1], $ac1, 31 \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+
+ /*
+ step1_0 = step2_0 + step2_3;
+ step1_3 = step2_0 - step2_3;
+ */
+ "add %[step1_0], %[step1_6], %[Temp1] \n\t"
+ "sub %[step1_3], %[step1_6], %[Temp1] \n\t"
+
+ /*
+ temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+ step1_4 = dct_const_round_shift(temp_1);
+ */
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_28_64] \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp1], 14(%[input]) \n\t"
+ "lh %[Temp0], 2(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_4_64] \n\t"
+ "extp %[step1_4], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+ step1_7 = dct_const_round_shift(temp_2);
+ */
+ "madd $ac1, %[Temp0], %[cospi_4_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_28_64] \n\t"
+ "extp %[step1_7], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+ step1_5 = dct_const_round_shift(temp_1);
+ */
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac0, %[Temp0], %[cospi_12_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "msub $ac0, %[Temp1], %[cospi_20_64] \n\t"
+ "extp %[step1_5], $ac0, 31 \n\t"
+
+ /*
+ temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+ step1_6 = dct_const_round_shift(temp_2);
+ */
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+ "lh %[Temp0], 10(%[input]) \n\t"
+ "madd $ac1, %[Temp0], %[cospi_20_64] \n\t"
+ "lh %[Temp1], 6(%[input]) \n\t"
+ "madd $ac1, %[Temp1], %[cospi_12_64] \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /*
+ temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
+ temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
+ */
+ "sub %[Temp0], %[step1_7], %[step1_6] \n\t"
+ "sub %[Temp0], %[Temp0], %[step1_4] \n\t"
+ "add %[Temp0], %[Temp0], %[step1_5] \n\t"
+ "sub %[Temp1], %[step1_4], %[step1_5] \n\t"
+ "sub %[Temp1], %[Temp1], %[step1_6] \n\t"
+ "add %[Temp1], %[Temp1], %[step1_7] \n\t"
+
+ "mtlo %[const_2_power_13], $ac0 \n\t"
+ "mthi $zero, $ac0 \n\t"
+ "mtlo %[const_2_power_13], $ac1 \n\t"
+ "mthi $zero, $ac1 \n\t"
+
+ "madd $ac0, %[Temp0], %[cospi_16_64] \n\t"
+ "madd $ac1, %[Temp1], %[cospi_16_64] \n\t"
+
+ /*
+ step1_4 = step1_4 + step1_5;
+ step1_7 = step1_6 + step1_7;
+ */
+ "add %[step1_4], %[step1_4], %[step1_5] \n\t"
+ "add %[step1_7], %[step1_7], %[step1_6] \n\t"
+
+ "extp %[step1_5], $ac0, 31 \n\t"
+ "extp %[step1_6], $ac1, 31 \n\t"
+
+ /* add block */
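+ /* Each output value is rounded with (x + 16) >> 5 (the 8x8 output shift),
+  * added to the destination pixel, and clipped via the vp9_ff_cropTbl
+  * lookup before being stored back. */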
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "add %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "add %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_3], %[step1_4] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_2], %[step1_5] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_1], %[step1_6] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "sub %[Temp0], %[step1_0], %[step1_7] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+ "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
+
+ "lbu %[Temp1], 0(%[dest_pix]) \n\t"
+ "addi %[Temp0], %[Temp0], 16 \n\t"
+ "sra %[Temp0], %[Temp0], 5 \n\t"
+ "add %[Temp1], %[Temp1], %[Temp0] \n\t"
+ "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
+ "sb %[Temp2], 0(%[dest_pix]) \n\t"
+
+ : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
+ [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
+ [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
+ [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
+ [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
+ [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
+ [dest_pix] "+r" (dest_pix)
+ : [const_2_power_13] "r" (const_2_power_13),
+ [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
+ [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
+ [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
+ [cospi_24_64] "r" (cospi_24_64),
+ [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
+ );
+
+ input += 8;
+ }
+}
+
+void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct8_1d_rows_dspr2(input, outptr, 8);
+
+ // Then transform columns and add to dest
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+static void iadst8_1d_dspr2(const int16_t *input, int16_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;
+ int x0, x1, x2, x3, x4, x5, x6, x7;
+
+ x0 = input[7];
+ x1 = input[0];
+ x2 = input[5];
+ x3 = input[2];
+ x4 = input[3];
+ x5 = input[4];
+ x6 = input[1];
+ x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+ output[0] = output[1] = output[2] = output[3] = output[4]
+ = output[5] = output[6] = output[7] = 0;
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
+ x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
+ x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
+ x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
+
+ // stage 2
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = s0 + s2;
+ x1 = s1 + s3;
+ x2 = s0 - s2;
+ x3 = s1 - s3;
+ x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
+ x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
+
+ // stage 3
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
+ x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
+ x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
+ x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
+
+ output[0] = x0;
+ output[1] = -x4;
+ output[2] = x6;
+ output[3] = -x2;
+ output[4] = x3;
+ output[5] = -x7;
+ output[6] = x5;
+ output[7] = -x1;
+}
+
+void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride, int tx_type) {
+ int i, j;
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ int16_t temp_in[8 * 8], temp_out[8];
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ switch (tx_type) {
+ case DCT_DCT: // DCT in both horizontal and vertical
+ idct8_1d_rows_dspr2(input, outptr, 8);
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+ break;
+ case ADST_DCT: // ADST in vertical, DCT in horizontal
+ idct8_1d_rows_dspr2(input, outptr, 8);
+
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(&out[i * 8], temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ case DCT_ADST: // DCT in vertical, ADST in horizontal
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) {
+ temp_in[i * 8 + j] = out[j * 8 + i];
+ }
+ }
+ idct8_1d_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
+ break;
+ case ADST_ADST: // ADST in both directions
+ for (i = 0; i < 8; ++i) {
+ iadst8_1d_dspr2(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j)
+ temp_in[j] = out[j * 8 + i];
+
+ iadst8_1d_dspr2(temp_in, temp_out);
+
+ for (j = 0; j < 8; ++j)
+ dest[j * dest_stride + i] =
+ clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
+ }
+ break;
+ default:
+ printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");
+ break;
+ }
+}
+
+void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
+ int16_t *outptr = out;
+ uint32_t pos = 45;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // First transform rows
+ idct8_1d_rows_dspr2(input, outptr, 4);
+
+ outptr += 4;
+
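+ /* Only the first four rows were transformed above: the *_10 variant has
+  * non-zero coefficients only in the top-left 4x4 of the block, so the
+  * stores below zero the half of the intermediate buffer the row pass did
+  * not write. */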
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 48(%[outptr]) \n\t"
+ "sw $zero, 52(%[outptr]) \n\t"
+ "sw $zero, 64(%[outptr]) \n\t"
+ "sw $zero, 68(%[outptr]) \n\t"
+ "sw $zero, 80(%[outptr]) \n\t"
+ "sw $zero, 84(%[outptr]) \n\t"
+ "sw $zero, 96(%[outptr]) \n\t"
+ "sw $zero, 100(%[outptr]) \n\t"
+ "sw $zero, 112(%[outptr]) \n\t"
+ "sw $zero, 116(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ // Then transform columns and add to dest
+ idct8_1d_columns_add_blk_dspr2(&out[0], dest, dest_stride);
+}
+
+void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ uint32_t pos = 45;
+ int32_t out;
+ int32_t r;
+ int32_t a1, absa1;
+ int32_t t1, t2, vector_a1, vector_1, vector_2;
+
+ /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+
+ :
+ : [pos] "r" (pos)
+ );
+
+ out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
+ __asm__ __volatile__ (
+ "addi %[out], %[out], 16 \n\t"
+ "sra %[a1], %[out], 5 \n\t"
+
+ : [out] "+r" (out), [a1] "=r" (a1)
+ :
+ );
+
+ if (a1 < 0) {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "abs %[absa1], %[a1] \n\t"
+ "replv.qb %[vector_a1], %[absa1] \n\t"
+
+ : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "subu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [dest] "+&r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ } else {
+ /* use quad-byte operations;
+ * input and output memory are four-byte aligned */
+ __asm__ __volatile__ (
+ "replv.qb %[vector_a1], %[a1] \n\t"
+
+ : [vector_a1] "=r" (vector_a1)
+ : [a1] "r" (a1)
+ );
+
+ for (r = 8; r--;) {
+ __asm__ __volatile__ (
+ "lw %[t1], 0(%[dest]) \n\t"
+ "lw %[t2], 4(%[dest]) \n\t"
+ "addu_s.qb %[vector_1], %[t1], %[vector_a1] \n\t"
+ "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t"
+ "sw %[vector_1], 0(%[dest]) \n\t"
+ "sw %[vector_2], 4(%[dest]) \n\t"
+ "add %[dest], %[dest], %[dest_stride] \n\t"
+
+ : [t1] "=&r" (t1), [t2] "=&r" (t2),
+ [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
+ [dest] "+r" (dest)
+ : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
+ );
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 0f50f374d..0d65651f0 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -41,17 +41,12 @@ void vp9_free_frame_buffers(VP9_COMMON *cm) {
vpx_free(cm->mip);
vpx_free(cm->prev_mip);
- vpx_free(cm->above_seg_context);
vpx_free(cm->last_frame_seg_map);
vpx_free(cm->mi_grid_base);
vpx_free(cm->prev_mi_grid_base);
- vpx_free(cm->above_context[0]);
- for (i = 0; i < MAX_MB_PLANE; i++)
- cm->above_context[i] = 0;
cm->mip = NULL;
cm->prev_mip = NULL;
- cm->above_seg_context = NULL;
cm->last_frame_seg_map = NULL;
cm->mi_grid_base = NULL;
cm->prev_mi_grid_base = NULL;
@@ -85,7 +80,7 @@ static void setup_mi(VP9_COMMON *cm) {
}
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
- int i, mi_cols;
+ int i;
const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
@@ -140,21 +135,6 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
setup_mi(cm);
- // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling
- // information is exposed at this level
- mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-
- // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
- // block where mi unit size is 8x8.
- cm->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE *
- (2 * mi_cols), 1);
- if (!cm->above_context[0])
- goto fail;
-
- cm->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1);
- if (!cm->above_seg_context)
- goto fail;
-
// Create the segmentation map structure and set to 0.
cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
if (!cm->last_frame_seg_map)
@@ -186,18 +166,12 @@ void vp9_initialize_common() {
}
void vp9_update_frame_size(VP9_COMMON *cm) {
- int i, mi_cols;
const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, MI_SIZE_LOG2);
const int aligned_height = ALIGN_POWER_OF_TWO(cm->height, MI_SIZE_LOG2);
set_mb_mi(cm, aligned_width, aligned_height);
setup_mi(cm);
- mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
- for (i = 1; i < MAX_MB_PLANE; i++)
- cm->above_context[i] =
- cm->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols;
-
// Initialize the previous frame segment map to 0.
if (cm->last_frame_seg_map)
vpx_memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols);
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 1a03269fb..d0d485272 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -74,10 +74,6 @@ typedef enum {
MB_MODE_COUNT
} MB_PREDICTION_MODE;
-static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) {
- return mode <= TM_PRED;
-}
-
static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
@@ -140,7 +136,7 @@ typedef struct {
// Flags used for prediction status of various bit-stream signals
unsigned char seg_id_predicted;
- INTERPOLATIONFILTERTYPE interp_filter;
+ INTERPOLATION_TYPE interp_filter;
BLOCK_SIZE sb_type;
} MB_MODE_INFO;
@@ -226,6 +222,13 @@ typedef struct macroblockd {
unsigned char ab_index; // index of 4x4 block inside the 8x8 block
int q_index;
+
+ /* Y,U,V,(A) */
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
} MACROBLOCKD;
@@ -414,7 +417,7 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
*y = (raster_mb >> tx_cols_log2) << tx_size;
}
-static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
+static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
int plane, int block, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
uint8_t *const buf = pd->dst.buf;
@@ -439,19 +442,22 @@ static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
}
if (xd->mb_to_bottom_edge < 0) {
- const int bh = 4 << b_height_log2(plane_bsize);
- const int umv_border_start = bh + (xd->mb_to_bottom_edge >>
- (3 + pd->subsampling_y));
- int i;
- const uint8_t c = buf[(umv_border_start - 1) * stride + x];
- uint8_t *d = &buf[umv_border_start * stride + x];
-
- if (y + bh > umv_border_start)
- for (i = 0; i < bh; ++i, d += stride)
- *d = c;
+ if (xd->left_available || x >= 0) {
+ const int bh = 4 << b_height_log2(plane_bsize);
+ const int umv_border_start =
+ bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y));
+
+ if (y + bh > umv_border_start) {
+ const uint8_t c = buf[(umv_border_start - 1) * stride + x];
+ uint8_t *d = &buf[umv_border_start * stride + x];
+ int i;
+ for (i = 0; i < bh; ++i, d += stride)
+ *d = c;
+ }
+ }
}
}
-static void set_contexts_on_border(MACROBLOCKD *xd,
+static void set_contexts_on_border(const MACROBLOCKD *xd,
struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize,
int tx_size_in_blocks, int has_eob,
@@ -489,7 +495,7 @@ static void set_contexts_on_border(MACROBLOCKD *xd,
L[pt] = 0;
}
-static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
+static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
ENTROPY_CONTEXT *const A = pd->above_context + aoff;
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 3ac192b4a..36d1cdf14 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -84,9 +84,11 @@ static int get_unsigned_bits(unsigned int num_values) {
} while (0)
#endif
-#define SYNC_CODE_0 0x49
-#define SYNC_CODE_1 0x83
-#define SYNC_CODE_2 0x42
+#define VP9_SYNC_CODE_0 0x49
+#define VP9_SYNC_CODE_1 0x83
+#define VP9_SYNC_CODE_2 0x42
+
+#define VP9_FRAME_MARKER 0x2
#endif // VP9_COMMON_VP9_COMMON_H_
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 2640ac72b..d3a867c3f 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -322,9 +322,8 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cm->counts.eob_branch[tx_size];
- int t, i, j, k, l;
+ int i, j, k, l, m;
unsigned int branch_ct[UNCONSTRAINED_NODES][2];
- vp9_prob coef_probs[UNCONSTRAINED_NODES];
for (i = 0; i < BLOCK_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
@@ -332,15 +331,14 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
if (l >= 3 && k == 0)
continue;
- vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs,
- branch_ct, coef_counts[i][j][k][l],
- 0);
+ vp9_tree_probs_from_distribution(vp9_coefmodel_tree, branch_ct,
+ coef_counts[i][j][k][l], 0);
branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
- coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- for (t = 0; t < UNCONSTRAINED_NODES; ++t)
- dst_coef_probs[i][j][k][l][t] = merge_probs(
- pre_coef_probs[i][j][k][l][t], coef_probs[t],
- branch_ct[t], count_sat, update_factor);
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ dst_coef_probs[i][j][k][l][m] = merge_probs(
+ pre_coef_probs[i][j][k][l][m],
+ branch_ct[m],
+ count_sat, update_factor);
}
}
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index ec7d09a00..c58e852fe 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -153,8 +153,8 @@ typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
-static int get_entropy_context(TX_SIZE tx_size,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l) {
ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
switch (tx_size) {
@@ -163,16 +163,16 @@ static int get_entropy_context(TX_SIZE tx_size,
left_ec = l[0] != 0;
break;
case TX_8X8:
- above_ec = !!*(uint16_t *)a;
- left_ec = !!*(uint16_t *)l;
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint16_t *)l;
break;
case TX_16X16:
- above_ec = !!*(uint32_t *)a;
- left_ec = !!*(uint32_t *)l;
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint32_t *)l;
break;
case TX_32X32:
- above_ec = !!*(uint64_t *)a;
- left_ec = !!*(uint64_t *)l;
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint64_t *)l;
break;
default:
assert(!"Invalid transform size.");
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 3347b35de..a963d55e6 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -161,51 +161,52 @@ static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
{ 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm
};
-static const vp9_prob default_partition_probs[FRAME_TYPES][PARTITION_CONTEXTS]
+const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 158, 97, 94 }, // a/l both not split
+ { 93, 24, 99 }, // a split, l not split
+ { 85, 119, 44 }, // l split, a not split
+ { 62, 59, 67 }, // a/l both split
+ // 16x16 -> 8x8
+ { 149, 53, 53 }, // a/l both not split
+ { 94, 20, 48 }, // a split, l not split
+ { 83, 53, 24 }, // l split, a not split
+ { 52, 18, 18 }, // a/l both split
+ // 32x32 -> 16x16
+ { 150, 40, 39 }, // a/l both not split
+ { 78, 12, 26 }, // a split, l not split
+ { 67, 33, 11 }, // l split, a not split
+ { 24, 7, 5 }, // a/l both split
+ // 64x64 -> 32x32
+ { 174, 35, 49 }, // a/l both not split
+ { 68, 11, 27 }, // a split, l not split
+ { 57, 15, 9 }, // l split, a not split
+ { 12, 3, 3 }, // a/l both split
+};
+
+static const vp9_prob default_partition_probs[PARTITION_CONTEXTS]
[PARTITION_TYPES - 1] = {
- { // frame_type = keyframe
- // 8x8 -> 4x4
- { 158, 97, 94 }, // a/l both not split
- { 93, 24, 99 }, // a split, l not split
- { 85, 119, 44 }, // l split, a not split
- { 62, 59, 67 }, // a/l both split
- // 16x16 -> 8x8
- { 149, 53, 53 }, // a/l both not split
- { 94, 20, 48 }, // a split, l not split
- { 83, 53, 24 }, // l split, a not split
- { 52, 18, 18 }, // a/l both split
- // 32x32 -> 16x16
- { 150, 40, 39 }, // a/l both not split
- { 78, 12, 26 }, // a split, l not split
- { 67, 33, 11 }, // l split, a not split
- { 24, 7, 5 }, // a/l both split
- // 64x64 -> 32x32
- { 174, 35, 49 }, // a/l both not split
- { 68, 11, 27 }, // a split, l not split
- { 57, 15, 9 }, // l split, a not split
- { 12, 3, 3 }, // a/l both split
- }, { // frame_type = interframe
- // 8x8 -> 4x4
- { 199, 122, 141 }, // a/l both not split
- { 147, 63, 159 }, // a split, l not split
- { 148, 133, 118 }, // l split, a not split
- { 121, 104, 114 }, // a/l both split
- // 16x16 -> 8x8
- { 174, 73, 87 }, // a/l both not split
- { 92, 41, 83 }, // a split, l not split
- { 82, 99, 50 }, // l split, a not split
- { 53, 39, 39 }, // a/l both split
- // 32x32 -> 16x16
- { 177, 58, 59 }, // a/l both not split
- { 68, 26, 63 }, // a split, l not split
- { 52, 79, 25 }, // l split, a not split
- { 17, 14, 12 }, // a/l both split
- // 64x64 -> 32x32
- { 222, 34, 30 }, // a/l both not split
- { 72, 16, 44 }, // a split, l not split
- { 58, 32, 12 }, // l split, a not split
- { 10, 7, 6 }, // a/l both split
- }
+ // 8x8 -> 4x4
+ { 199, 122, 141 }, // a/l both not split
+ { 147, 63, 159 }, // a split, l not split
+ { 148, 133, 118 }, // l split, a not split
+ { 121, 104, 114 }, // a/l both split
+ // 16x16 -> 8x8
+ { 174, 73, 87 }, // a/l both not split
+ { 92, 41, 83 }, // a split, l not split
+ { 82, 99, 50 }, // l split, a not split
+ { 53, 39, 39 }, // a/l both split
+ // 32x32 -> 16x16
+ { 177, 58, 59 }, // a/l both not split
+ { 68, 26, 63 }, // a split, l not split
+ { 52, 79, 25 }, // l split, a not split
+ { 17, 14, 12 }, // a/l both split
+ // 64x64 -> 32x32
+ { 222, 34, 30 }, // a/l both not split
+ { 72, 16, 44 }, // a split, l not split
+ { 58, 32, 12 }, // l split, a not split
+ { 10, 7, 6 }, // a/l both split
};
static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
@@ -309,8 +310,8 @@ static const vp9_prob default_mbskip_probs[MBSKIP_CONTEXTS] = {
192, 128, 64
};
-static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTERS+1]
- [SWITCHABLE_FILTERS-1] = {
+static const vp9_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS - 1] = {
{ 235, 162, },
{ 36, 255, },
{ 34, 3, },
@@ -349,29 +350,15 @@ void vp9_entropy_mode_init() {
#define COUNT_SAT 20
#define MAX_UPDATE_FACTOR 128
-static int update_ct(vp9_prob pre_prob, vp9_prob prob,
- const unsigned int ct[2]) {
- return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static int update_ct2(vp9_prob pre_prob, const unsigned int ct[2]) {
- return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
+static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
+ return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
}
-static void update_mode_probs(int n_modes,
- const vp9_tree_index *tree,
- const unsigned int *cnt,
- const vp9_prob *pre_probs, vp9_prob *dst_probs,
- unsigned int tok0_offset) {
-#define MAX_PROBS 32
- vp9_prob probs[MAX_PROBS];
- unsigned int branch_ct[MAX_PROBS][2];
- int t;
-
- assert(n_modes - 1 < MAX_PROBS);
- vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);
- for (t = 0; t < n_modes - 1; ++t)
- dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]);
+static void adapt_probs(const vp9_tree_index *tree,
+ const vp9_prob *pre_probs, const unsigned int *counts,
+ unsigned int offset, vp9_prob *probs) {
+ tree_merge_probs(tree, pre_probs, counts, offset,
+ COUNT_SAT, MAX_UPDATE_FACTOR, probs);
}
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
@@ -381,46 +368,40 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
const FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i],
+ fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i],
counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i],
+ fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i],
counts->comp_inter[i]);
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i],
+ fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i],
counts->comp_ref[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
- fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j],
+ fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j],
counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
- update_mode_probs(INTER_MODES, vp9_inter_mode_tree,
- counts->inter_mode[i], pre_fc->inter_mode_probs[i],
- fc->inter_mode_probs[i], NEARESTMV);
+ adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+ counts->inter_mode[i], NEARESTMV, fc->inter_mode_probs[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
- update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
- counts->y_mode[i], pre_fc->y_mode_prob[i],
- fc->y_mode_prob[i], 0);
+ adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+ counts->y_mode[i], 0, fc->y_mode_prob[i]);
for (i = 0; i < INTRA_MODES; ++i)
- update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
- counts->uv_mode[i], pre_fc->uv_mode_prob[i],
- fc->uv_mode_prob[i], 0);
+ adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+ counts->uv_mode[i], 0, fc->uv_mode_prob[i]);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
- counts->partition[i],
- pre_fc->partition_prob[INTER_FRAME][i],
- fc->partition_prob[INTER_FRAME][i], 0);
+ adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], 0, fc->partition_prob[i]);
if (cm->mcomp_filter_type == SWITCHABLE) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
- update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
- counts->switchable_interp[i],
- pre_fc->switchable_interp_prob[i],
- fc->switchable_interp_prob[i], 0);
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
+ adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
+ counts->switchable_interp[i], 0,
+ fc->switchable_interp_prob[i]);
}
if (cm->tx_mode == TX_MODE_SELECT) {
@@ -432,23 +413,23 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
- fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
+ fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
- fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
+ fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
- fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
+ fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
branch_ct_32x32p[j]);
}
}
for (i = 0; i < MBSKIP_CONTEXTS; ++i)
- fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i],
+ fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i],
counts->mbskip[i]);
}
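Note: the adapt_prob()/adapt_probs() helpers introduced above defer to merge_probs()/tree_merge_probs() from vp9_treecoder.h. A minimal, self-contained sketch of the per-probability merge, assuming the usual weighted-blend form (the real helpers may differ in clipping details), with adapt_prob(pre, ct) corresponding to merge_probs(pre, ct, COUNT_SAT, MAX_UPDATE_FACTOR):

#include <stdint.h>

typedef uint8_t vp9_prob;

/* Map a pair of branch counts to a probability in [1, 255]. */
static vp9_prob binary_prob_sketch(unsigned int n0, unsigned int n1) {
  const unsigned int den = n0 + n1;
  unsigned int p;
  if (den == 0)
    return 128;                        /* no observations: stay neutral */
  p = (255 * n0 + den / 2) / den;
  return (vp9_prob)(p > 0 ? p : 1);
}

/* Blend the previous-frame probability with the measured one; the more
 * counts were seen (capped at count_sat), the larger the update factor. */
static vp9_prob merge_probs_sketch(vp9_prob pre_prob, const unsigned int ct[2],
                                   unsigned int count_sat,
                                   unsigned int max_update_factor) {
  const unsigned int total = ct[0] + ct[1];
  const unsigned int count = total < count_sat ? total : count_sat;
  const unsigned int factor = max_update_factor * count / count_sat;
  const vp9_prob prob = binary_prob_sketch(ct[0], ct[1]);
  return (vp9_prob)((pre_prob * (256 - factor) + prob * factor + 128) >> 8);
}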
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index ab37b75c6..38b419948 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -16,6 +16,7 @@
#define TX_SIZE_CONTEXTS 2
#define SWITCHABLE_FILTERS 3 // number of switchable filters
+#define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
// #define MODE_STATS
@@ -37,6 +38,9 @@ extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
+extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1];
+
extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index f70b571ef..b061cdb38 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -191,60 +191,47 @@ void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
}
static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
- return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
+ return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
}
-static unsigned int adapt_probs(unsigned int i,
- vp9_tree tree,
- vp9_prob this_probs[],
- const vp9_prob last_probs[],
- const unsigned int num_events[]) {
- const unsigned int left = tree[i] <= 0
- ? num_events[-tree[i]]
- : adapt_probs(tree[i], tree, this_probs, last_probs, num_events);
-
- const unsigned int right = tree[i + 1] <= 0
- ? num_events[-tree[i + 1]]
- : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events);
- const unsigned int ct[2] = { left, right };
- this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct);
- return left + right;
+static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
+ const unsigned int *counts, vp9_prob *probs) {
+ tree_merge_probs(tree, pre_probs, counts, 0,
+ MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR, probs);
}
-
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
int i, j;
- const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
-
- nmv_context *ctx = &cm->fc.nmvc;
- const nmv_context *pre_ctx = &pre_fc->nmvc;
- const nmv_context_counts *cts = &cm->counts.mv;
+ nmv_context *fc = &cm->fc.nmvc;
+ const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
+ const nmv_context_counts *counts = &cm->counts.mv;
- adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
+ adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints,
+ fc->joints);
for (i = 0; i < 2; ++i) {
- ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign);
- adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes,
- pre_ctx->comps[i].classes, cts->comps[i].classes);
- adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0,
- pre_ctx->comps[i].class0, cts->comps[i].class0);
+ nmv_component *comp = &fc->comps[i];
+ const nmv_component *pre_comp = &pre_fc->comps[i];
+ const nmv_component_counts *c = &counts->comps[i];
+
+ comp->sign = adapt_prob(pre_comp->sign, c->sign);
+ adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+ comp->classes);
+ adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j],
- cts->comps[i].bits[j]);
+ comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
- adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j],
- pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]);
+ adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j],
+ comp->class0_fp[j]);
- adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp,
- cts->comps[i].fp);
+ adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
if (allow_hp) {
- ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp,
- cts->comps[i].class0_hp);
- ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp);
+ comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = adapt_prob(pre_comp->hp, c->hp);
}
}
}
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 768ff2c94..1651b9050 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -76,4 +76,15 @@ typedef enum {
ADST_ADST = 3 // ADST in both directions
} TX_TYPE;
+typedef enum {
+ UNKNOWN = 0,
+ BT_601 = 1, // YUV
+ BT_709 = 2, // YUV
+ SMPTE_170 = 3, // YUV
+ SMPTE_240 = 4, // YUV
+ RESERVED_1 = 5,
+ RESERVED_2 = 6,
+ SRGB = 7 // RGB
+} COLOR_SPACE;
+
#endif // VP9_COMMON_VP9_ENUMS_H_
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index cedd44cad..79ace147c 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -97,19 +97,15 @@ DECLARE_ALIGNED(256, const subpel_kernel,
{ 0, -3, 1, 38, 64, 32, -1, -3}
};
-const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type) {
- switch (type) {
- case EIGHTTAP:
- return vp9_sub_pel_filters_8;
- case EIGHTTAP_SMOOTH:
- return vp9_sub_pel_filters_8lp;
- case EIGHTTAP_SHARP:
- return vp9_sub_pel_filters_8s;
- case BILINEAR:
- return vp9_bilinear_filters;
- default:
- assert(!"Invalid filter type.");
- return NULL;
- }
+
+static const subpel_kernel* vp9_filter_kernels[4] = {
+ vp9_sub_pel_filters_8,
+ vp9_sub_pel_filters_8lp,
+ vp9_sub_pel_filters_8s,
+ vp9_bilinear_filters
+};
+
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) {
+ return vp9_filter_kernels[type];
}
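Note: with the switch replaced by a table, vp9_get_filter_kernel() indexes vp9_filter_kernels[] directly, so callers must never pass SWITCHABLE (value 4, one past the table). An illustrative caller-side guard only; real callers resolve SWITCHABLE earlier, e.g. when the per-block interp_filter is read:

const subpel_kernel *kernel =
    vp9_get_filter_kernel(type != SWITCHABLE ? type : EIGHTTAP);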
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 302945374..b1e7e6499 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -27,7 +27,7 @@ typedef enum {
EIGHTTAP_SHARP = 2,
BILINEAR = 3,
SWITCHABLE = 4 /* should be the last one */
-} INTERPOLATIONFILTERTYPE;
+} INTERPOLATION_TYPE;
typedef int16_t subpel_kernel[SUBPEL_TAPS];
@@ -36,10 +36,9 @@ struct subpix_fn_table {
const subpel_kernel *filter_y;
};
-const subpel_kernel *vp9_get_filter_kernel(INTERPOLATIONFILTERTYPE type);
+const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type);
extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
-extern const subpel_kernel vp9_sub_pel_filters_6[SUBPEL_SHIFTS];
extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 592ef6afa..b91c50143 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -35,6 +35,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
}
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *const tile,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
@@ -46,7 +47,7 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
assert(ref_idx == 0 || ref_idx == 1);
assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier
- vp9_find_mv_refs_idx(cm, xd, mi, xd->last_mi,
+ vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi,
mi->mbmi.ref_frame[ref_idx],
mv_list, block_idx, mi_row, mi_col);
diff --git a/vp9/common/vp9_findnearmv.h b/vp9/common/vp9_findnearmv.h
index d161d1b3e..2362caa41 100644
--- a/vp9/common/vp9_findnearmv.h
+++ b/vp9/common/vp9_findnearmv.h
@@ -34,8 +34,8 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
-void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm,
- MACROBLOCKD *xd,
+void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *const tile,
int_mv *dst_nearest,
int_mv *dst_near,
int block_idx, int ref_idx,
@@ -43,42 +43,30 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm,
static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi,
const MODE_INFO *left_mi, int b) {
- // FIXME(rbultje, jingning): temporary hack because jenkins doesn't
- // understand this condition. This will go away soon.
-
if (b == 0 || b == 2) {
- /* On L edge, get from MB to left of us */
- if (!left_mi)
+ if (!left_mi || is_inter_block(&left_mi->mbmi))
return DC_PRED;
- if (is_inter_block(&left_mi->mbmi))
- return DC_PRED;
- else
- return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
- : left_mi->mbmi.mode;
+ return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
+ : left_mi->mbmi.mode;
+ } else {
+ assert(b == 1 || b == 3);
+ return cur_mi->bmi[b - 1].as_mode;
}
- assert(b == 1 || b == 3);
- return cur_mi->bmi[b - 1].as_mode;
}
-static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
- const MODE_INFO *above_mb, int b) {
- const MODE_INFO *mi = cur_mb;
-
- if (!(b >> 1)) {
- /* On top edge, get from MB above us */
- mi = above_mb;
- if (!mi)
+static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *above_mi, int b) {
+ if (b == 0 || b == 1) {
+ if (!above_mi || is_inter_block(&above_mi->mbmi))
return DC_PRED;
- if (is_inter_block(&mi->mbmi))
- return DC_PRED;
- else
- return mi->mbmi.sb_type < BLOCK_8X8 ? (mi->bmi + 2 + b)->as_mode
- : mi->mbmi.mode;
+ return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
+ : above_mi->mbmi.mode;
+ } else {
+ assert(b == 2 || b == 3);
+ return cur_mi->bmi[b - 2].as_mode;
}
-
- return (mi->bmi + b - 2)->as_mode;
}
#endif // VP9_COMMON_VP9_FINDNEARMV_H_
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 52b039d99..ea8683ea1 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -1280,6 +1280,31 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
}
}
+void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
+ int16_t out[32 * 32] = {0};
+ int16_t *outptr = out;
+ int i, j;
+ int16_t temp_in[32], temp_out[32];
+
+ // Rows
+ // only upper-left 8x8 has non-zero coeff
+ for (i = 0; i < 8; ++i) {
+ idct32_1d(input, outptr);
+ input += 32;
+ outptr += 32;
+ }
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = out[j * 32 + i];
+ idct32_1d(temp_in, temp_out);
+ for (j = 0; j < 32; ++j)
+ dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+ + dest[j * stride + i]);
+ }
+}
+
void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
int i, j;
int a1;
@@ -1350,6 +1375,9 @@ void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
if (eob) {
if (eob == 1)
vp9_idct32x32_1_add(input, dest, stride);
+ else if (eob <= 34)
+ // non-zero coeff only in upper-left 8x8
+ vp9_idct32x32_34_add(input, dest, stride);
else
vp9_idct32x32_1024_add(input, dest, stride);
}
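Note: the eob <= 34 shortcut relies on the default 32x32 scan order, whose first 34 positions all fall inside the top-left 8x8 corner, so only the first 8 row transforms carry non-zero input. A small standalone check expressing that assumption (the raster indexing here is illustrative, not taken from vp9_scan.c):

/* Returns 1 if every coefficient up to 'eob' in 'scan' order lies in the
 * top-left 8x8 of a 32x32 transform block. */
static int coeffs_confined_to_8x8(const int16_t *scan, int eob) {
  int i;
  for (i = 0; i < eob; ++i) {
    const int row = scan[i] >> 5;   /* position / 32 */
    const int col = scan[i] & 31;   /* position % 32 */
    if (row >= 8 || col >= 8)
      return 0;
  }
  return 1;
}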
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index 85ac6d2bf..218e12e62 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -16,12 +16,6 @@
#include "vp9/common/vp9_seg_common.h"
-struct loop_filter_info {
- const uint8_t *mblim;
- const uint8_t *lim;
- const uint8_t *hev_thr;
-};
-
// This structure holds bit masks for all 8x8 blocks in a 64x64 region.
// Each 1 bit represents a position in which we want to apply the loop filter.
// Left_ entries refer to whether we apply a filter on the border to the
@@ -259,8 +253,8 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
if (block_inside_limit < 1)
block_inside_limit = 1;
- vpx_memset(lfi->lim[lvl], block_inside_limit, SIMD_WIDTH);
- vpx_memset(lfi->mblim[lvl], (2 * (lvl + 2) + block_inside_limit),
+ vpx_memset(lfi->lfthr[lvl].lim, block_inside_limit, SIMD_WIDTH);
+ vpx_memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + block_inside_limit),
SIMD_WIDTH);
}
}
@@ -268,7 +262,7 @@ static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
void vp9_loop_filter_init(VP9_COMMON *cm) {
loop_filter_info_n *lfi = &cm->lf_info;
struct loopfilter *lf = &cm->lf;
- int i;
+ int lvl;
// init limits for given sharpness
update_sharpness(lfi, lf->sharpness_level);
@@ -278,8 +272,8 @@ void vp9_loop_filter_init(VP9_COMMON *cm) {
lf_init_lut(lfi);
// init hev threshold const vectors
- for (i = 0; i < 4; i++)
- vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+ for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
+ vpx_memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
}
void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
@@ -330,16 +324,14 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
static int build_lfi(const loop_filter_info_n *lfi_n,
const MB_MODE_INFO *mbmi,
- struct loop_filter_info *lfi) {
+ const loop_filter_thresh **lfi) {
const int seg = mbmi->segment_id;
const int ref = mbmi->ref_frame[0];
const int mode = lfi_n->mode_lf_lut[mbmi->mode];
const int filter_level = lfi_n->lvl[seg][ref][mode];
if (filter_level > 0) {
- lfi->mblim = lfi_n->mblim[filter_level];
- lfi->lim = lfi_n->lim[filter_level];
- lfi->hev_thr = lfi_n->hev_thr[filter_level >> 4];
+ *lfi = &lfi_n->lfthr[filter_level];
return 1;
} else {
return 0;
@@ -351,11 +343,13 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
unsigned int mask_8x8,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
- const struct loop_filter_info *lfi) {
+ const loop_filter_thresh **p_lfi) {
unsigned int mask;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= 1) {
+ const loop_filter_thresh *lfi = *p_lfi;
+
if (mask & 1) {
if (mask_16x16 & 1) {
vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
@@ -379,7 +373,7 @@ static void filter_selectively_vert(uint8_t *s, int pitch,
vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1);
s += 8;
- lfi++;
+ p_lfi++;
mask_16x16 >>= 1;
mask_8x8 >>= 1;
mask_4x4 >>= 1;
@@ -393,12 +387,14 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
int only_4x4_1,
- const struct loop_filter_info *lfi) {
+ const loop_filter_thresh **p_lfi) {
unsigned int mask;
int count;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= count) {
+ const loop_filter_thresh *lfi = *p_lfi;
+
count = 1;
if (mask & 1) {
if (!only_4x4_1) {
@@ -432,7 +428,7 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
lfi->lim, lfi->hev_thr, 1);
}
s += 8 * count;
- lfi += count;
+ p_lfi += count;
mask_16x16 >>= count;
mask_8x8 >>= count;
mask_4x4 >>= count;
@@ -805,7 +801,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
@@ -834,7 +830,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
+ if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]))
continue;
// Build masks based on the transform size of each block
@@ -925,7 +921,7 @@ static void filter_block_plane(VP9_COMMON *const cm,
struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf;
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- struct loop_filter_info lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
int row_shift = 3 - ss_x;
int row_mask = 0xff >> (ss_x << 2);
@@ -938,8 +934,8 @@ static void filter_block_plane(VP9_COMMON *const cm,
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const MODE_INFO *mi = mi_8x8[c];
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, lfi[r] + (c >> ss_x)))
- continue;
+
+ build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]);
}
if (!plane->plane_type) {
mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h
index c698090d8..62389ea5e 100644
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -46,12 +46,13 @@ struct loopfilter {
// Need to align this structure so when it is declared and
// passed it can be loaded into vector registers.
typedef struct {
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
- DECLARE_ALIGNED(SIMD_WIDTH, uint8_t,
- hev_thr[4][SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+typedef struct {
+ loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
uint8_t mode_lf_lut[MB_MODE_COUNT];
} loop_filter_info_n;
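Note: after this restructuring a single loop_filter_thresh carries all three limit vectors for one filter level, indexed directly by that level (hev_thr is filled from level >> 4 at init, as the vp9_loopfilter.c change above shows). A hedged sketch of a filtering call picking them up, with s, pitch and filter_level supplied by the caller:

const loop_filter_thresh *lft = &cm->lf_info.lfthr[filter_level];
vp9_mbloop_filter_horizontal_edge(s, pitch, lft->mblim, lft->lim,
                                  lft->hev_thr, 1);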
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 659079639..8df8aec84 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -170,17 +170,19 @@ static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
-static INLINE int is_inside(const VP9_COMMON *cm, int mi_col, int mi_row,
+static INLINE int is_inside(const TileInfo *const tile,
+ int mi_col, int mi_row, int mi_rows,
const MV *mv) {
return !(mi_row + mv->row < 0 ||
- mi_col + mv->col < cm->cur_tile_mi_col_start ||
- mi_row + mv->row >= cm->mi_rows ||
- mi_col + mv->col >= cm->cur_tile_mi_col_end);
+ mi_col + mv->col < tile->mi_col_start ||
+ mi_row + mv->row >= mi_rows ||
+ mi_col + mv->col >= tile->mi_col_end);
}
// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ const TileInfo *const tile,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
@@ -201,7 +203,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
// and we also need to keep a mode count.
for (i = 0; i < 2; ++i) {
const MV *const mv_ref = &mv_ref_search[i];
- if (is_inside(cm, mi_col, mi_row, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MODE_INFO *const candidate_mi = xd->mi_8x8[mv_ref->col + mv_ref->row
* xd->mode_info_stride];
const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
@@ -228,7 +230,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
// mode counts.
for (; i < MVREF_NEIGHBOURS; ++i) {
const MV *const mv_ref = &mv_ref_search[i];
- if (is_inside(cm, mi_col, mi_row, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
mv_ref->row
* xd->mode_info_stride]->mbmi;
@@ -258,7 +260,7 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
if (different_ref_found) {
for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
const MV *mv_ref = &mv_ref_search[i];
- if (is_inside(cm, mi_col, mi_row, mv_ref)) {
+ if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
const MB_MODE_INFO *const candidate = &xd->mi_8x8[mv_ref->col +
mv_ref->row
* xd->mode_info_stride]->mbmi;
diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h
index 39ebdb078..ce4c55983 100644
--- a/vp9/common/vp9_mvref_common.h
+++ b/vp9/common/vp9_mvref_common.h
@@ -15,6 +15,7 @@
#define VP9_COMMON_VP9_MVREF_COMMON_H_
void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ const TileInfo *const tile,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
@@ -22,11 +23,12 @@ void vp9_find_mv_refs_idx(const VP9_COMMON *cm, const MACROBLOCKD *xd,
int mi_row, int mi_col);
static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd,
+ const TileInfo *const tile,
MODE_INFO *mi, const MODE_INFO *prev_mi,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
int mi_row, int mi_col) {
- vp9_find_mv_refs_idx(cm, xd, mi, prev_mi, ref_frame,
+ vp9_find_mv_refs_idx(cm, xd, tile, mi, prev_mi, ref_frame,
mv_ref_list, -1, mi_row, mi_col);
}
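Note: call sites change accordingly; a hypothetical use with 'tile' pointing at the TileInfo for the tile currently being coded, so MV-reference clamping stays inside its columns:

vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref_frame,
                 mv_ref_list, mi_row, mi_col);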
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index f2244e555..a2af57acf 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -19,6 +19,7 @@
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_tile_common.h"
#if CONFIG_VP9_POSTPROC
#include "vp9/common/vp9_postproc.h"
@@ -40,9 +41,9 @@
typedef struct frame_contexts {
vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
- vp9_prob partition_prob[FRAME_TYPES][PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+ vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
- vp9_prob switchable_interp_prob[SWITCHABLE_FILTERS + 1]
+ vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS - 1];
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
@@ -61,7 +62,7 @@ typedef struct {
vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
[COEF_BANDS][PREV_COEF_CONTEXTS];
- unsigned int switchable_interp[SWITCHABLE_FILTERS + 1]
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
@@ -90,6 +91,8 @@ typedef struct VP9Common {
DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][8]);
#endif
+ COLOR_SPACE color_space;
+
int width;
int height;
int display_width;
@@ -115,6 +118,7 @@ typedef struct VP9Common {
// Each frame can reference ALLOWED_REFS_PER_FRAME buffers
int active_ref_idx[ALLOWED_REFS_PER_FRAME];
struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME];
+ struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME];
int new_fb_idx;
YV12_BUFFER_CONFIG post_proc_buffer;
@@ -171,7 +175,7 @@ typedef struct VP9Common {
// Persistent mb segment id map used in prediction.
unsigned char *last_frame_seg_map;
- INTERPOLATIONFILTERTYPE mcomp_filter_type;
+ INTERPOLATION_TYPE mcomp_filter_type;
loop_filter_info_n lf_info;
@@ -182,14 +186,6 @@ typedef struct VP9Common {
struct loopfilter lf;
struct segmentation seg;
- /* Y,U,V */
- ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
- ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
-
- // partition contexts
- PARTITION_CONTEXT *above_seg_context;
- PARTITION_CONTEXT left_seg_context[8];
-
// Context probabilities for reference frame prediction
int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
@@ -212,10 +208,19 @@ typedef struct VP9Common {
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
- int cur_tile_mi_col_start, cur_tile_mi_col_end;
- int cur_tile_mi_row_start, cur_tile_mi_row_end;
} VP9_COMMON;
+// ref == 0 => LAST_FRAME
+// ref == 1 => GOLDEN_FRAME
+// ref == 2 => ALTREF_FRAME
+static YV12_BUFFER_CONFIG *get_frame_ref_buffer(VP9_COMMON *cm, int ref) {
+ return &cm->yv12_fb[cm->active_ref_idx[ref]];
+}
+
+static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
+ return &cm->yv12_fb[cm->new_fb_idx];
+}
+
static int get_free_fb(VP9_COMMON *cm) {
int i;
for (i = 0; i < NUM_YV12_BUFFERS; i++)
@@ -240,47 +245,38 @@ static int mi_cols_aligned_to_sb(int n_mis) {
return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
}
-static INLINE void set_skip_context(VP9_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col) {
+static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) {
+ return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx]
+ : cm->fc.partition_prob[ctx];
+}
+
+static INLINE void set_skip_context(
+ MACROBLOCKD *xd,
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE],
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16],
+ int mi_row, int mi_col) {
const int above_idx = mi_col * 2;
const int left_idx = (mi_row * 2) & 15;
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
struct macroblockd_plane *const pd = &xd->plane[i];
- pd->above_context = cm->above_context[i] + (above_idx >> pd->subsampling_x);
- pd->left_context = cm->left_context[i] + (left_idx >> pd->subsampling_y);
+ pd->above_context = above_context[i] + (above_idx >> pd->subsampling_x);
+ pd->left_context = left_context[i] + (left_idx >> pd->subsampling_y);
}
}
-// return the node index in the prob tree for binary coding
-static int check_bsize_coverage(int bs, int mi_rows, int mi_cols,
- int mi_row, int mi_col) {
- const int r = (mi_row + bs < mi_rows);
- const int c = (mi_col + bs < mi_cols);
-
- if (r && c)
- return 0;
-
- if (c && !r)
- return 1; // only allow horizontal/split partition types
-
- if (r && !c)
- return 2; // only allow vertical/split partition types
-
- return -1;
-}
-
-static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int bh,
- int mi_col, int bw) {
+static void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+ int mi_row, int bh,
+ int mi_col, int bw,
+ int mi_rows, int mi_cols) {
xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
- xd->mb_to_bottom_edge = ((cm->mi_rows - bh - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
- xd->mb_to_right_edge = ((cm->mi_cols - bw - mi_col) * MI_SIZE) * 8;
+ xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
// Are edges available for intra prediction?
xd->up_available = (mi_row != 0);
- xd->left_available = (mi_col > cm->cur_tile_mi_col_start);
+ xd->left_available = (mi_col > tile->mi_col_start);
}
static void set_prev_mi(VP9_COMMON *cm) {
@@ -299,12 +295,14 @@ static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
return cm->frame_type == KEY_FRAME || cm->intra_only;
}
-static INLINE void update_partition_context(VP9_COMMON *cm,
- int mi_row, int mi_col,
- BLOCK_SIZE sb_type,
- BLOCK_SIZE sb_size) {
- PARTITION_CONTEXT *above_ctx = cm->above_seg_context + mi_col;
- PARTITION_CONTEXT *left_ctx = cm->left_seg_context + (mi_row & MI_MASK);
+static INLINE void update_partition_context(
+ PARTITION_CONTEXT *above_seg_context,
+ PARTITION_CONTEXT left_seg_context[8],
+ int mi_row, int mi_col,
+ BLOCK_SIZE sb_type,
+ BLOCK_SIZE sb_size) {
+ PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
+ PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
const int bwl = b_width_log2(sb_type);
@@ -323,11 +321,13 @@ static INLINE void update_partition_context(VP9_COMMON *cm,
vpx_memset(left_ctx, pcvalue[bhl == bsl], bs);
}
-static INLINE int partition_plane_context(const VP9_COMMON *cm,
- int mi_row, int mi_col,
- BLOCK_SIZE sb_type) {
- const PARTITION_CONTEXT *above_ctx = cm->above_seg_context + mi_col;
- const PARTITION_CONTEXT *left_ctx = cm->left_seg_context + (mi_row & MI_MASK);
+static INLINE int partition_plane_context(
+ const PARTITION_CONTEXT *above_seg_context,
+ const PARTITION_CONTEXT left_seg_context[8],
+ int mi_row, int mi_col,
+ BLOCK_SIZE sb_type) {
+ const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
+ const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
int above = 0, left = 0, i;
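Note: with the segment contexts passed explicitly, a plausible call pattern (the exact encoder/decoder code is not part of this hunk; subsize is the coded block size, bsize its parent) becomes:

const int ctx = partition_plane_context(above_seg_context, left_seg_context,
                                        mi_row, mi_col, bsize);
const vp9_prob *probs = get_partition_probs(cm, ctx);
/* ... read or write the partition symbol, then ... */
update_partition_context(above_seg_context, left_seg_context,
                         mi_row, mi_col, subsize, bsize);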
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index be42c56b5..6018e1775 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -35,14 +35,14 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
// left of the entries corresponding to real macroblocks.
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
// left
- const int left_mv_pred = left_in_image ? is_inter_mode(left_mi->mbmi.mode)
+ const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi)
: 0;
const int left_interp = left_in_image && left_mv_pred
? left_mi->mbmi.interp_filter
: SWITCHABLE_FILTERS;
// above
- const int above_mv_pred = above_in_image ? is_inter_mode(above_mi->mbmi.mode)
+ const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi)
: 0;
const int above_interp = above_in_image && above_mv_pred
? above_mi->mbmi.interp_filter
@@ -403,8 +403,8 @@ void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) {
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y, segment_id = INT_MAX;
diff --git a/vp9/common/vp9_pred_common.h b/vp9/common/vp9_pred_common.h
index a869dc0a6..19032bf62 100644
--- a/vp9/common/vp9_pred_common.h
+++ b/vp9/common/vp9_pred_common.h
@@ -127,14 +127,14 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
return get_tx_probs(bsize, context, tx_probs);
}
-static void update_tx_counts(BLOCK_SIZE bsize, uint8_t context,
- TX_SIZE tx_size, struct tx_counts *tx_counts) {
- if (bsize >= BLOCK_32X32)
- tx_counts->p32x32[context][tx_size]++;
- else if (bsize >= BLOCK_16X16)
- tx_counts->p16x16[context][tx_size]++;
+static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context,
+ struct tx_counts *tx_counts) {
+ if (bsize < BLOCK_16X16)
+ return tx_counts->p8x8[context];
+ else if (bsize < BLOCK_32X32)
+ return tx_counts->p16x16[context];
else
- tx_counts->p8x8[context][tx_size]++;
+ return tx_counts->p32x32[context];
}
#endif // VP9_COMMON_VP9_PRED_COMMON_H_
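Note: callers that previously went through update_tx_counts() now increment through the getter; an assumed call site (cm->counts.tx follows the FRAME_COUNTS layout used earlier in this patch):

++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];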
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 6f16ac70a..1c96788db 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -21,7 +21,7 @@
#include "vp9/common/vp9_reconintra.h"
void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATIONFILTERTYPE mcomp_filter_type,
+ INTERPOLATION_TYPE mcomp_filter_type,
VP9_COMMON *cm) {
if (xd->mi_8x8 && xd->mi_8x8[0]) {
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
@@ -40,6 +40,24 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
}
+static void inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const MV32 *mv,
+ const struct scale_factors *scale,
+ int w, int h, int ref,
+ const struct subpix_fn_table *subpix,
+ int xs, int ys) {
+ const int subpel_x = mv->col & SUBPEL_MASK;
+ const int subpel_y = mv->row & SUBPEL_MASK;
+
+ src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS);
+ scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref](
+ src, src_stride, dst, dst_stride,
+ subpix->filter_x[subpel_x], xs,
+ subpix->filter_y[subpel_y], ys,
+ w, h);
+}
+
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const MV *src_mv,
@@ -50,16 +68,11 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
const int is_q4 = precision == MV_PRECISION_Q4;
const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
is_q4 ? src_mv->col : src_mv->col * 2 };
- const MV32 mv = scale->scale_mv(&mv_q4, scale);
- const int subpel_x = mv.col & SUBPEL_MASK;
- const int subpel_y = mv.row & SUBPEL_MASK;
+ const struct scale_factors_common *sfc = scale->sfc;
+ const MV32 mv = sfc->scale_mv(&mv_q4, scale);
- src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
- scale->predict[subpel_x != 0][subpel_y != 0][ref](
- src, src_stride, dst, dst_stride,
- subpix->filter_x[subpel_x], scale->x_step_q4,
- subpix->filter_y[subpel_y], scale->y_step_q4,
- w, h);
+ inter_predictor(src, src_stride, dst, dst_stride, &mv, scale,
+ w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4);
}
static INLINE int round_mv_comp_q4(int value) {
@@ -133,10 +146,6 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
struct scale_factors *const scale = &xd->scale_factor[ref];
struct buf_2d *const pre_buf = &pd->pre[ref];
struct buf_2d *const dst_buf = &pd->dst;
-
- const uint8_t *const pre = pre_buf->buf + scaled_buffer_offset(x, y,
- pre_buf->stride, scale);
-
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
// TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
@@ -152,15 +161,32 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
// scaling case. It needs to be done on the scaled MV, not the pre-scaling
// MV. Note however that it performs the subsampling aware scaling so
// that the result is always q4.
- const MV res_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
- pd->subsampling_x,
- pd->subsampling_y);
-
- scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
- vp9_build_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
- &res_mv, scale,
- 4 << pred_w, 4 << pred_h, ref,
- &xd->subpix, MV_PRECISION_Q4);
+      // The MV here is in MV_PRECISION_Q4 precision.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
+
+ uint8_t *pre;
+ MV32 scaled_mv;
+ int xs, ys;
+
+ if (vp9_is_scaled(scale->sfc)) {
+ pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale);
+ scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x);
+ scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
+ xs = scale->sfc->x_step_q4;
+ ys = scale->sfc->y_step_q4;
+ } else {
+ pre = pre_buf->buf + (y * pre_buf->stride + x);
+ scaled_mv.row = mv_q4.row;
+ scaled_mv.col = mv_q4.col;
+ xs = ys = 16;
+ }
+
+ inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+ &scaled_mv, scale,
+ 4 << pred_w, 4 << pred_h, ref,
+ &xd->subpix, xs, ys);
}
}
@@ -220,15 +246,17 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
const int ref = cm->active_ref_idx[i];
struct scale_factors *const sf = &cm->active_ref_scale[i];
+ struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i];
if (ref >= NUM_YV12_BUFFERS) {
vp9_zero(*sf);
+ vp9_zero(*sfc);
} else {
YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
- vp9_setup_scale_factors_for_frame(sf,
+ vp9_setup_scale_factors_for_frame(sf, sfc,
fb->y_crop_width, fb->y_crop_height,
cm->width, cm->height);
- if (vp9_is_scaled(sf))
+ if (vp9_is_scaled(sfc))
vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
}
}
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 504b79356..2c8a6e4d9 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -25,7 +25,7 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATIONFILTERTYPE filter,
+ INTERPOLATION_TYPE filter,
VP9_COMMON *cm);
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -38,8 +38,10 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
static int scaled_buffer_offset(int x_offset, int y_offset, int stride,
const struct scale_factors *scale) {
- const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset;
- const int y = scale ? scale->scale_value_y(y_offset, scale) : y_offset;
+ const int x = scale ? scale->sfc->scale_value_x(x_offset, scale->sfc) :
+ x_offset;
+ const int y = scale ? scale->sfc->scale_value_y(y_offset, scale->sfc) :
+ y_offset;
return y * stride + x;
}
diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c
index bd609dcf0..eb643b090 100644
--- a/vp9/common/vp9_reconintra.c
+++ b/vp9/common/vp9_reconintra.c
@@ -369,7 +369,7 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride,
}
}
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, int mode,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride) {
diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h
index e9d0dbf04..6e3f55c4d 100644
--- a/vp9/common/vp9_reconintra.h
+++ b/vp9/common/vp9_reconintra.h
@@ -14,8 +14,8 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
- TX_SIZE tx_size, int mode,
- const uint8_t *ref, int ref_stride,
- uint8_t *dst, int dst_stride);
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+ TX_SIZE tx_size, int mode,
+ const uint8_t *ref, int ref_stride,
+ uint8_t *dst, int dst_stride);
#endif // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index df92b5882..debec6154 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -22,10 +22,11 @@ forward_decls vp9_common_forward_decls
# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse_x86inc=sse &&
- sse2_x86inc=sse2 && ssse3_x86inc=ssse3
+ sse2_x86inc=sse2 && ssse3_x86inc=ssse3 && avx_x86inc=avx && avx2_x86inc=avx2
# this variable is for functions that are 64 bit only.
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 &&
+ ssse3_x86_64=ssse3 && avx_x86_64=avx && avx2_x86_64=avx2
#
# RECON
@@ -157,7 +158,7 @@ prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_d63_predictor_32x32 $ssse3_x86inc
prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_32x32 $ssse3 x86inc
+specialize vp9_h_predictor_32x32 $ssse3_x86inc
prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_32x32
@@ -199,7 +200,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8
specialize vp9_loop_filter_vertical_edge mmx neon
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2 neon
+specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_horizontal_edge sse2 neon
@@ -268,43 +269,46 @@ specialize vp9_convolve8_avg_vert sse2 ssse3 neon dspr2
# dct
#
prototype void vp9_idct4x4_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct4x4_1_add sse2 neon
+specialize vp9_idct4x4_1_add sse2 neon dspr2
prototype void vp9_idct4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct4x4_16_add sse2 neon
+specialize vp9_idct4x4_16_add sse2 neon dspr2
prototype void vp9_idct8x8_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_1_add sse2 neon
+specialize vp9_idct8x8_1_add sse2 neon dspr2
prototype void vp9_idct8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_64_add sse2 neon
+specialize vp9_idct8x8_64_add sse2 neon dspr2
prototype void vp9_idct8x8_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct8x8_10_add sse2 neon
+specialize vp9_idct8x8_10_add sse2 neon dspr2
prototype void vp9_idct16x16_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_1_add sse2 neon
+specialize vp9_idct16x16_1_add sse2 neon dspr2
prototype void vp9_idct16x16_256_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_256_add sse2 neon
+specialize vp9_idct16x16_256_add sse2 neon dspr2
prototype void vp9_idct16x16_10_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct16x16_10_add sse2 neon
+specialize vp9_idct16x16_10_add sse2 neon dspr2
prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1024_add sse2 neon
+specialize vp9_idct32x32_1024_add sse2 neon dspr2
+
+prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_idct32x32_34_add sse2
prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1_add sse2
+specialize vp9_idct32x32_1_add sse2 dspr2
prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_iht4x4_16_add sse2 neon
+specialize vp9_iht4x4_16_add sse2 neon dspr2
prototype void vp9_iht8x8_64_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_iht8x8_64_add sse2 neon
+specialize vp9_iht8x8_64_add sse2 neon dspr2
prototype void vp9_iht16x16_256_add "const int16_t *input, uint8_t *output, int pitch, int tx_type"
-specialize vp9_iht16x16_256_add sse2
+specialize vp9_iht16x16_256_add sse2 dspr2
# dct and add
@@ -668,10 +672,10 @@ specialize vp9_block_error $sse2_x86inc
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
specialize vp9_subtract_block $sse2_x86inc
-prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b $ssse3_x86_64
-prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+prototype void vp9_quantize_b_32x32 "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b_32x32 $ssse3_x86_64
#
@@ -686,32 +690,32 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
fi
# fdct functions
-prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht4x4 sse2
-prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht8x8 sse2
-prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type"
+prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht16x16 sse2
-prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int stride"
-specialize vp9_short_fdct8x8 sse2
+prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fwht4x4
-prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int stride"
-specialize vp9_short_fdct4x4 sse2
+prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct4x4 sse2
-prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int stride"
-specialize vp9_short_fdct32x32 sse2
+prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct8x8 sse2
-prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int stride"
-specialize vp9_short_fdct32x32_rd sse2
+prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct16x16 sse2
-prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int stride"
-specialize vp9_short_fdct16x16 sse2
+prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct32x32 sse2
-prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh4x4
+prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
+specialize vp9_fdct32x32_rd sse2
#
# Motion search
diff --git a/vp9/common/vp9_scale.c b/vp9/common/vp9_scale.c
index 989206c60..3f0994f80 100644
--- a/vp9/common/vp9_scale.c
+++ b/vp9/common/vp9_scale.c
@@ -12,23 +12,23 @@
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_scale.h"
-static INLINE int scaled_x(int val, const struct scale_factors *scale) {
- return val * scale->x_scale_fp >> REF_SCALE_SHIFT;
+static INLINE int scaled_x(int val, const struct scale_factors_common *sfc) {
+ return val * sfc->x_scale_fp >> REF_SCALE_SHIFT;
}
-static INLINE int scaled_y(int val, const struct scale_factors *scale) {
- return val * scale->y_scale_fp >> REF_SCALE_SHIFT;
+static INLINE int scaled_y(int val, const struct scale_factors_common *sfc) {
+ return val * sfc->y_scale_fp >> REF_SCALE_SHIFT;
}
-static int unscaled_value(int val, const struct scale_factors *scale) {
- (void) scale;
+static int unscaled_value(int val, const struct scale_factors_common *sfc) {
+ (void) sfc;
return val;
}
static MV32 scaled_mv(const MV *mv, const struct scale_factors *scale) {
const MV32 res = {
- scaled_y(mv->row, scale) + scale->y_offset_q4,
- scaled_x(mv->col, scale) + scale->x_offset_q4
+ scaled_y(mv->row, scale->sfc) + scale->y_offset_q4,
+ scaled_x(mv->col, scale->sfc) + scale->x_offset_q4
};
return res;
}
@@ -43,8 +43,8 @@ static MV32 unscaled_mv(const MV *mv, const struct scale_factors *scale) {
static void set_offsets_with_scaling(struct scale_factors *scale,
int row, int col) {
- scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale) & SUBPEL_MASK;
- scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale) & SUBPEL_MASK;
+ scale->x_offset_q4 = scaled_x(col << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK;
+ scale->y_offset_q4 = scaled_y(row << SUBPEL_BITS, scale->sfc) & SUBPEL_MASK;
}
static void set_offsets_without_scaling(struct scale_factors *scale,
@@ -70,31 +70,30 @@ static int check_scale_factors(int other_w, int other_h,
}
void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+ struct scale_factors_common *scale_comm,
int other_w, int other_h,
int this_w, int this_h) {
if (!check_scale_factors(other_w, other_h, this_w, this_h)) {
- scale->x_scale_fp = REF_INVALID_SCALE;
- scale->y_scale_fp = REF_INVALID_SCALE;
+ scale_comm->x_scale_fp = REF_INVALID_SCALE;
+ scale_comm->y_scale_fp = REF_INVALID_SCALE;
return;
}
- scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
- scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
- scale->x_step_q4 = scaled_x(16, scale);
- scale->y_step_q4 = scaled_y(16, scale);
- scale->x_offset_q4 = 0; // calculated per block
- scale->y_offset_q4 = 0; // calculated per block
+ scale_comm->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+ scale_comm->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+ scale_comm->x_step_q4 = scaled_x(16, scale_comm);
+ scale_comm->y_step_q4 = scaled_y(16, scale_comm);
- if (vp9_is_scaled(scale)) {
- scale->scale_value_x = scaled_x;
- scale->scale_value_y = scaled_y;
- scale->set_scaled_offsets = set_offsets_with_scaling;
- scale->scale_mv = scaled_mv;
+ if (vp9_is_scaled(scale_comm)) {
+ scale_comm->scale_value_x = scaled_x;
+ scale_comm->scale_value_y = scaled_y;
+ scale_comm->set_scaled_offsets = set_offsets_with_scaling;
+ scale_comm->scale_mv = scaled_mv;
} else {
- scale->scale_value_x = unscaled_value;
- scale->scale_value_y = unscaled_value;
- scale->set_scaled_offsets = set_offsets_without_scaling;
- scale->scale_mv = unscaled_mv;
+ scale_comm->scale_value_x = unscaled_value;
+ scale_comm->scale_value_y = unscaled_value;
+ scale_comm->set_scaled_offsets = set_offsets_without_scaling;
+ scale_comm->scale_mv = unscaled_mv;
}
// TODO(agrange): Investigate the best choice of functions to use here
@@ -103,44 +102,48 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
// applied in one direction only, and not at all for 0,0, seems to give the
// best quality, but it may be worth trying an additional mode that does
// do the filtering on full-pel.
- if (scale->x_step_q4 == 16) {
- if (scale->y_step_q4 == 16) {
+ if (scale_comm->x_step_q4 == 16) {
+ if (scale_comm->y_step_q4 == 16) {
// No scaling in either direction.
- scale->predict[0][0][0] = vp9_convolve_copy;
- scale->predict[0][0][1] = vp9_convolve_avg;
- scale->predict[0][1][0] = vp9_convolve8_vert;
- scale->predict[0][1][1] = vp9_convolve8_avg_vert;
- scale->predict[1][0][0] = vp9_convolve8_horiz;
- scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ scale_comm->predict[0][0][0] = vp9_convolve_copy;
+ scale_comm->predict[0][0][1] = vp9_convolve_avg;
+ scale_comm->predict[0][1][0] = vp9_convolve8_vert;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert;
+ scale_comm->predict[1][0][0] = vp9_convolve8_horiz;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz;
} else {
// No scaling in x direction. Must always scale in the y direction.
- scale->predict[0][0][0] = vp9_convolve8_vert;
- scale->predict[0][0][1] = vp9_convolve8_avg_vert;
- scale->predict[0][1][0] = vp9_convolve8_vert;
- scale->predict[0][1][1] = vp9_convolve8_avg_vert;
- scale->predict[1][0][0] = vp9_convolve8;
- scale->predict[1][0][1] = vp9_convolve8_avg;
+ scale_comm->predict[0][0][0] = vp9_convolve8_vert;
+ scale_comm->predict[0][0][1] = vp9_convolve8_avg_vert;
+ scale_comm->predict[0][1][0] = vp9_convolve8_vert;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg_vert;
+ scale_comm->predict[1][0][0] = vp9_convolve8;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg;
}
} else {
- if (scale->y_step_q4 == 16) {
+ if (scale_comm->y_step_q4 == 16) {
// No scaling in the y direction. Must always scale in the x direction.
- scale->predict[0][0][0] = vp9_convolve8_horiz;
- scale->predict[0][0][1] = vp9_convolve8_avg_horiz;
- scale->predict[0][1][0] = vp9_convolve8;
- scale->predict[0][1][1] = vp9_convolve8_avg;
- scale->predict[1][0][0] = vp9_convolve8_horiz;
- scale->predict[1][0][1] = vp9_convolve8_avg_horiz;
+ scale_comm->predict[0][0][0] = vp9_convolve8_horiz;
+ scale_comm->predict[0][0][1] = vp9_convolve8_avg_horiz;
+ scale_comm->predict[0][1][0] = vp9_convolve8;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg;
+ scale_comm->predict[1][0][0] = vp9_convolve8_horiz;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg_horiz;
} else {
// Must always scale in both directions.
- scale->predict[0][0][0] = vp9_convolve8;
- scale->predict[0][0][1] = vp9_convolve8_avg;
- scale->predict[0][1][0] = vp9_convolve8;
- scale->predict[0][1][1] = vp9_convolve8_avg;
- scale->predict[1][0][0] = vp9_convolve8;
- scale->predict[1][0][1] = vp9_convolve8_avg;
+ scale_comm->predict[0][0][0] = vp9_convolve8;
+ scale_comm->predict[0][0][1] = vp9_convolve8_avg;
+ scale_comm->predict[0][1][0] = vp9_convolve8;
+ scale_comm->predict[0][1][1] = vp9_convolve8_avg;
+ scale_comm->predict[1][0][0] = vp9_convolve8;
+ scale_comm->predict[1][0][1] = vp9_convolve8_avg;
}
}
// 2D subpel motion always gets filtered in both directions
- scale->predict[1][1][0] = vp9_convolve8;
- scale->predict[1][1][1] = vp9_convolve8_avg;
+ scale_comm->predict[1][1][0] = vp9_convolve8;
+ scale_comm->predict[1][1][1] = vp9_convolve8_avg;
+
+ scale->sfc = scale_comm;
+ scale->x_offset_q4 = 0; // calculated per block
+ scale->y_offset_q4 = 0; // calculated per block
}
diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h
index ece011477..1437fcd9c 100644
--- a/vp9/common/vp9_scale.h
+++ b/vp9/common/vp9_scale.h
@@ -18,34 +18,40 @@
#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
#define REF_INVALID_SCALE -1
-struct scale_factors {
+struct scale_factors;
+struct scale_factors_common {
int x_scale_fp; // horizontal fixed point scale factor
int y_scale_fp; // vertical fixed point scale factor
- int x_offset_q4;
int x_step_q4;
- int y_offset_q4;
int y_step_q4;
- int (*scale_value_x)(int val, const struct scale_factors *scale);
- int (*scale_value_y)(int val, const struct scale_factors *scale);
+ int (*scale_value_x)(int val, const struct scale_factors_common *sfc);
+ int (*scale_value_y)(int val, const struct scale_factors_common *sfc);
void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col);
MV32 (*scale_mv)(const MV *mv, const struct scale_factors *scale);
convolve_fn_t predict[2][2][2]; // horiz, vert, avg
};
+struct scale_factors {
+ int x_offset_q4;
+ int y_offset_q4;
+ const struct scale_factors_common *sfc;
+};
+
void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
+ struct scale_factors_common *scale_comm,
int other_w, int other_h,
int this_w, int this_h);
-static int vp9_is_valid_scale(const struct scale_factors *sf) {
- return sf->x_scale_fp != REF_INVALID_SCALE &&
- sf->y_scale_fp != REF_INVALID_SCALE;
+static int vp9_is_valid_scale(const struct scale_factors_common *sfc) {
+ return sfc->x_scale_fp != REF_INVALID_SCALE &&
+ sfc->y_scale_fp != REF_INVALID_SCALE;
}
-static int vp9_is_scaled(const struct scale_factors *sf) {
- return sf->x_scale_fp != REF_NO_SCALE ||
- sf->y_scale_fp != REF_NO_SCALE;
+static int vp9_is_scaled(const struct scale_factors_common *sfc) {
+ return sfc->x_scale_fp != REF_NO_SCALE ||
+ sfc->y_scale_fp != REF_NO_SCALE;
}
#endif // VP9_COMMON_VP9_SCALE_H_
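Note: one scale_factors_common is now set up per reference frame and shared read-only, while each scale_factors instance keeps only its per-block offsets. A sketch of the setup under those assumptions (ref_width/ref_height/cur_width/cur_height are illustrative variables):

struct scale_factors_common sfc;
struct scale_factors sf;

vp9_setup_scale_factors_for_frame(&sf, &sfc,
                                  ref_width, ref_height,    /* reference */
                                  cur_width, cur_height);   /* current   */
if (vp9_is_valid_scale(&sfc) && vp9_is_scaled(&sfc)) {
  /* sub-pel prediction will step by sfc.x_step_q4 / sfc.y_step_q4 */
}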
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
index a5c8463d5..14a1a7eb0 100644
--- a/vp9/common/vp9_scan.h
+++ b/vp9/common/vp9_scan.h
@@ -191,8 +191,7 @@ static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
}
static INLINE int get_coef_context(const int16_t *neighbors,
- uint8_t *token_cache,
- int c) {
+ const uint8_t *token_cache, int c) {
return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}
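The only change to get_coef_context is const-qualifying token_cache; the value it returns is simply the rounded mean of the cached token values of the two scan-order neighbours of position c. A tiny worked example, with illustrative neighbour values:

  /* Illustrative only: the neighbours of c carry cached tokens 2 and 1,
   * so the derived context is (1 + 2 + 1) >> 1 == 2. */
  static int coef_context_example(void) {
    const int n0_tok = 2, n1_tok = 1;
    return (1 + n0_tok + n1_tok) >> 1;
  }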
diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
index 1791c1a8f..e3035d076 100644
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c
@@ -10,6 +10,8 @@
#include "vp9/common/vp9_tile_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
#define MIN_TILE_WIDTH_B64 4
#define MAX_TILE_WIDTH_B64 64
@@ -17,8 +19,8 @@ static int to_sbs(n_mis) {
return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2;
}
-static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off,
- int tile_idx, int log2_n_tiles, int n_mis) {
+static void get_tile_offsets(int *min_tile_off, int *max_tile_off,
+ int tile_idx, int log2_n_tiles, int n_mis) {
const int n_sbs = to_sbs(n_mis);
const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles;
const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
@@ -27,17 +29,14 @@ static void vp9_get_tile_offsets(int *min_tile_off, int *max_tile_off,
*max_tile_off = MIN(sb_off2 << 3, n_mis);
}
-void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) {
- vp9_get_tile_offsets(&cm->cur_tile_mi_col_start, &cm->cur_tile_mi_col_end,
- tile_col_idx, cm->log2_tile_cols, cm->mi_cols);
-}
-
-void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) {
- vp9_get_tile_offsets(&cm->cur_tile_mi_row_start, &cm->cur_tile_mi_row_end,
- tile_row_idx, cm->log2_tile_rows, cm->mi_rows);
+void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm,
+ int row_idx, int col_idx) {
+ get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end,
+ row_idx, cm->log2_tile_rows, cm->mi_rows);
+ get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end,
+ col_idx, cm->log2_tile_cols, cm->mi_cols);
}
-
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols) {
const int sb_cols = to_sbs(mi_cols);
diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h
index 6d14560b9..a110abbdb 100644
--- a/vp9/common/vp9_tile_common.h
+++ b/vp9/common/vp9_tile_common.h
@@ -11,11 +11,17 @@
#ifndef VP9_COMMON_VP9_TILE_COMMON_H_
#define VP9_COMMON_VP9_TILE_COMMON_H_
-#include "vp9/common/vp9_onyxc_int.h"
+struct VP9Common;
-void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx);
+typedef struct TileInfo {
+ int mi_row_start, mi_row_end;
+ int mi_col_start, mi_col_end;
+} TileInfo;
-void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx);
+// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on
+// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)'
+void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
+ int row_idx, int col_idx);
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols);
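The tile offsets formerly written into VP9_COMMON now live in a self-contained TileInfo, so iterating tiles no longer mutates shared common state. A minimal sketch of how a caller might walk every tile with the new API (the loop body is illustrative):

  int tile_row, tile_col;
  for (tile_row = 0; tile_row < (1 << cm->log2_tile_rows); ++tile_row) {
    for (tile_col = 0; tile_col < (1 << cm->log2_tile_cols); ++tile_col) {
      TileInfo tile;
      vp9_tile_init(&tile, cm, tile_row, tile_col);
      /* process superblocks in [tile.mi_row_start, tile.mi_row_end) x
       * [tile.mi_col_start, tile.mi_col_end) */
    }
  }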
diff --git a/vp9/common/vp9_treecoder.c b/vp9/common/vp9_treecoder.c
index da1213d71..1805fb4d8 100644
--- a/vp9/common/vp9_treecoder.c
+++ b/vp9/common/vp9_treecoder.c
@@ -40,9 +40,7 @@ void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t,
tree2tok(p - offset, t, 0, 0, 0);
}
-static unsigned int convert_distribution(unsigned int i,
- vp9_tree tree,
- vp9_prob probs[],
+static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
unsigned int branch_ct[][2],
const unsigned int num_events[],
unsigned int tok0_offset) {
@@ -51,24 +49,25 @@ static unsigned int convert_distribution(unsigned int i,
if (tree[i] <= 0) {
left = num_events[-tree[i] - tok0_offset];
} else {
- left = convert_distribution(tree[i], tree, probs, branch_ct,
- num_events, tok0_offset);
+ left = convert_distribution(tree[i], tree, branch_ct, num_events,
+ tok0_offset);
}
if (tree[i + 1] <= 0)
right = num_events[-tree[i + 1] - tok0_offset];
else
- right = convert_distribution(tree[i + 1], tree, probs, branch_ct,
- num_events, tok0_offset);
+ right = convert_distribution(tree[i + 1], tree, branch_ct, num_events,
+ tok0_offset);
- probs[i>>1] = get_binary_prob(left, right);
- branch_ct[i>>1][0] = left;
- branch_ct[i>>1][1] = right;
+ branch_ct[i >> 1][0] = left;
+ branch_ct[i >> 1][1] = right;
return left + right;
}
-void vp9_tree_probs_from_distribution(vp9_tree tree, vp9_prob probs[/* n-1 */],
+void vp9_tree_probs_from_distribution(vp9_tree tree,
unsigned int branch_ct[/* n-1 */][2],
const unsigned int num_events[/* n */],
unsigned int tok0_offset) {
- convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset);
+ convert_distribution(0, tree, branch_ct, num_events, tok0_offset);
}
+
+
diff --git a/vp9/common/vp9_treecoder.h b/vp9/common/vp9_treecoder.h
index 4ba171f46..9c776d61c 100644
--- a/vp9/common/vp9_treecoder.h
+++ b/vp9/common/vp9_treecoder.h
@@ -50,11 +50,11 @@ void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset);
probability updates. */
void vp9_tree_probs_from_distribution(vp9_tree tree,
- vp9_prob probs[ /* n - 1 */ ],
unsigned int branch_ct[ /* n - 1 */ ][2],
const unsigned int num_events[ /* n */ ],
unsigned int tok0_offset);
+
static INLINE vp9_prob clip_prob(int p) {
return (p > 255) ? 255u : (p < 1) ? 1u : p;
}
@@ -81,21 +81,46 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}
-static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob,
+static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
const unsigned int ct[2],
unsigned int count_sat,
unsigned int max_update_factor) {
+ const vp9_prob prob = get_binary_prob(ct[0], ct[1]);
const unsigned int count = MIN(ct[0] + ct[1], count_sat);
const unsigned int factor = max_update_factor * count / count_sat;
return weighted_prob(pre_prob, prob, factor);
}
-static INLINE vp9_prob merge_probs2(vp9_prob pre_prob,
- const unsigned int ct[2],
- unsigned int count_sat,
- unsigned int max_update_factor) {
- return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat,
- max_update_factor);
+static unsigned int tree_merge_probs_impl(unsigned int i,
+ const vp9_tree_index *tree,
+ const vp9_prob *pre_probs,
+ const unsigned int *counts,
+ unsigned int count_sat,
+ unsigned int max_update_factor,
+ vp9_prob *probs) {
+ const int l = tree[i];
+ const unsigned int left_count = (l <= 0)
+ ? counts[-l]
+ : tree_merge_probs_impl(l, tree, pre_probs, counts,
+ count_sat, max_update_factor, probs);
+ const int r = tree[i + 1];
+ const unsigned int right_count = (r <= 0)
+ ? counts[-r]
+ : tree_merge_probs_impl(r, tree, pre_probs, counts,
+ count_sat, max_update_factor, probs);
+ const unsigned int ct[2] = { left_count, right_count };
+ probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
+ count_sat, max_update_factor);
+ return left_count + right_count;
+}
+
+static void tree_merge_probs(const vp9_tree_index *tree,
+ const vp9_prob *pre_probs,
+ const unsigned int *counts, int offset,
+ unsigned int count_sat,
+ unsigned int max_update_factor, vp9_prob *probs) {
+ tree_merge_probs_impl(0, tree, pre_probs, &counts[-offset],
+ count_sat, max_update_factor, probs);
}
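merge_probs now derives the update target from the counts itself (the old merge_probs2 wrapper goes away), and tree_merge_probs applies the same adaptation over a whole token tree. A worked scalar example of one adaptation step, with illustrative numbers:

  /* pre_prob = 128, ct = {30, 10}, count_sat = 20, max_update_factor = 128 */
  const unsigned int ct[2] = { 30, 10 };
  const vp9_prob prob = get_binary_prob(ct[0], ct[1]);  /* (30*256 + 20)/40 = 192 */
  const unsigned int count = MIN(ct[0] + ct[1], 20);    /* saturates at count_sat */
  const unsigned int factor = 128 * count / 20;         /* = 128 */
  const vp9_prob merged = weighted_prob(128, prob, factor);
  /* ROUND_POWER_OF_TWO(128 * (256 - 128) + 192 * 128, 8) = 160 */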
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index cfec36b42..ccf5aac17 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -415,7 +415,7 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
res3 = _mm_packs_epi32(tmp6, tmp7); \
}
-#define IDCT8x8_1D \
+#define IDCT8_1D \
/* Stage1 */ \
{ \
const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
@@ -525,12 +525,12 @@ void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// 2-D
for (i = 0; i < 2; i++) {
- // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
+ // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
}
// Final rounding and shift
@@ -638,12 +638,12 @@ static void idct8_1d_sse2(__m128i *in) {
in6 = in[6];
in7 = in[7];
- // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
+ // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
in4, in5, in6, in7);
// 4-stage 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
in[0] = in0;
in[1] = in1;
in[2] = in2;
@@ -1068,7 +1068,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
in4, in5, in6, in7)
// 1D idct8x8
- IDCT8x8_1D
+ IDCT8_1D
// Final rounding and shift
in0 = _mm_adds_epi16(in0, final_rounding);
@@ -1099,7 +1099,7 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest, in7);
}
-#define IDCT16x16_1D \
+#define IDCT16_1D \
/* Stage2 */ \
{ \
const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
@@ -1321,7 +1321,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
in12, in13, in14, in15);
}
- IDCT16x16_1D
+ IDCT16_1D
// Stage7
if (i == 0) {
@@ -2703,7 +2703,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
- IDCT16x16_1D
+ IDCT16_1D
// Stage7
in0 = _mm_add_epi16(stp2_0, stp1_15);
@@ -2785,6 +2785,698 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
input += 8; \
} \
+#define IDCT32_1D \
+/* Stage1 */ \
+{ \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
+ const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
+ stp1_17, stp1_30) \
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
+ stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
+ stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
+ \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
+ stp2_14) \
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
+ stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
+ stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
+ \
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
+ \
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ \
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
+ stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
+ stp1_6) \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
+ stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
+ stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
+
+// Only the upper-left 8x8 block has non-zero coefficients.
+void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1<<5);
+
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+ in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
+ in24, in25, in26, in27, in28, in29, in30, in31;
+ __m128i col[128];
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+ stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+ stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+ stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+ stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i, j, i32;
+
+ // We work on an 8x32 block each time and loop 8 times for the 2-D 32x32 idct.
+ for (i = 0; i < 8; i++) {
+ i32 = (i << 5);
+ if (i == 0) {
+ // First 1-D idct: first 8 rows
+ // Load input data.
+ LOAD_DQCOEFF(in0, input);
+ LOAD_DQCOEFF(in8, input);
+ LOAD_DQCOEFF(in16, input);
+ LOAD_DQCOEFF(in24, input);
+ LOAD_DQCOEFF(in1, input);
+ LOAD_DQCOEFF(in9, input);
+ LOAD_DQCOEFF(in17, input);
+ LOAD_DQCOEFF(in25, input);
+ LOAD_DQCOEFF(in2, input);
+ LOAD_DQCOEFF(in10, input);
+ LOAD_DQCOEFF(in18, input);
+ LOAD_DQCOEFF(in26, input);
+ LOAD_DQCOEFF(in3, input);
+ LOAD_DQCOEFF(in11, input);
+ LOAD_DQCOEFF(in19, input);
+ LOAD_DQCOEFF(in27, input);
+
+ LOAD_DQCOEFF(in4, input);
+ LOAD_DQCOEFF(in12, input);
+ LOAD_DQCOEFF(in20, input);
+ LOAD_DQCOEFF(in28, input);
+ LOAD_DQCOEFF(in5, input);
+ LOAD_DQCOEFF(in13, input);
+ LOAD_DQCOEFF(in21, input);
+ LOAD_DQCOEFF(in29, input);
+ LOAD_DQCOEFF(in6, input);
+ LOAD_DQCOEFF(in14, input);
+ LOAD_DQCOEFF(in22, input);
+ LOAD_DQCOEFF(in30, input);
+ LOAD_DQCOEFF(in7, input);
+ LOAD_DQCOEFF(in15, input);
+ LOAD_DQCOEFF(in23, input);
+ LOAD_DQCOEFF(in31, input);
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
+ in18, in19, in20, in21, in22, in23);
+ TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
+ in26, in27, in28, in29, in30, in31);
+ } else if (i < 4) {
+ // First 1-D idct: next 24 zero-coeff rows
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
+ } else {
+ // Second 1-D idct
+ j = i - 4;
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
+ in11, in12, in13, in14, in15);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
+ in19, in20, in21, in22, in23);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
+ in28, in29, in30, in31);
+ }
+
+ IDCT32_1D
+
+ // final stage
+ if (i < 4) {
+ // 1_D: Store 32 intermediate results for each 8x32 block.
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ } else {
+ const __m128i zero = _mm_setzero_si128();
+
+ // 2_D: Calculate the results and store them to destination.
+ in0 = _mm_add_epi16(stp1_0, stp1_31);
+ in1 = _mm_add_epi16(stp1_1, stp1_30);
+ in2 = _mm_add_epi16(stp1_2, stp1_29);
+ in3 = _mm_add_epi16(stp1_3, stp1_28);
+ in4 = _mm_add_epi16(stp1_4, stp1_27);
+ in5 = _mm_add_epi16(stp1_5, stp1_26);
+ in6 = _mm_add_epi16(stp1_6, stp1_25);
+ in7 = _mm_add_epi16(stp1_7, stp1_24);
+ in8 = _mm_add_epi16(stp1_8, stp1_23);
+ in9 = _mm_add_epi16(stp1_9, stp1_22);
+ in10 = _mm_add_epi16(stp1_10, stp1_21);
+ in11 = _mm_add_epi16(stp1_11, stp1_20);
+ in12 = _mm_add_epi16(stp1_12, stp1_19);
+ in13 = _mm_add_epi16(stp1_13, stp1_18);
+ in14 = _mm_add_epi16(stp1_14, stp1_17);
+ in15 = _mm_add_epi16(stp1_15, stp1_16);
+ in16 = _mm_sub_epi16(stp1_15, stp1_16);
+ in17 = _mm_sub_epi16(stp1_14, stp1_17);
+ in18 = _mm_sub_epi16(stp1_13, stp1_18);
+ in19 = _mm_sub_epi16(stp1_12, stp1_19);
+ in20 = _mm_sub_epi16(stp1_11, stp1_20);
+ in21 = _mm_sub_epi16(stp1_10, stp1_21);
+ in22 = _mm_sub_epi16(stp1_9, stp1_22);
+ in23 = _mm_sub_epi16(stp1_8, stp1_23);
+ in24 = _mm_sub_epi16(stp1_7, stp1_24);
+ in25 = _mm_sub_epi16(stp1_6, stp1_25);
+ in26 = _mm_sub_epi16(stp1_5, stp1_26);
+ in27 = _mm_sub_epi16(stp1_4, stp1_27);
+ in28 = _mm_sub_epi16(stp1_3, stp1_28);
+ in29 = _mm_sub_epi16(stp1_2, stp1_29);
+ in30 = _mm_sub_epi16(stp1_1, stp1_30);
+ in31 = _mm_sub_epi16(stp1_0, stp1_31);
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+ in8 = _mm_adds_epi16(in8, final_rounding);
+ in9 = _mm_adds_epi16(in9, final_rounding);
+ in10 = _mm_adds_epi16(in10, final_rounding);
+ in11 = _mm_adds_epi16(in11, final_rounding);
+ in12 = _mm_adds_epi16(in12, final_rounding);
+ in13 = _mm_adds_epi16(in13, final_rounding);
+ in14 = _mm_adds_epi16(in14, final_rounding);
+ in15 = _mm_adds_epi16(in15, final_rounding);
+ in16 = _mm_adds_epi16(in16, final_rounding);
+ in17 = _mm_adds_epi16(in17, final_rounding);
+ in18 = _mm_adds_epi16(in18, final_rounding);
+ in19 = _mm_adds_epi16(in19, final_rounding);
+ in20 = _mm_adds_epi16(in20, final_rounding);
+ in21 = _mm_adds_epi16(in21, final_rounding);
+ in22 = _mm_adds_epi16(in22, final_rounding);
+ in23 = _mm_adds_epi16(in23, final_rounding);
+ in24 = _mm_adds_epi16(in24, final_rounding);
+ in25 = _mm_adds_epi16(in25, final_rounding);
+ in26 = _mm_adds_epi16(in26, final_rounding);
+ in27 = _mm_adds_epi16(in27, final_rounding);
+ in28 = _mm_adds_epi16(in28, final_rounding);
+ in29 = _mm_adds_epi16(in29, final_rounding);
+ in30 = _mm_adds_epi16(in30, final_rounding);
+ in31 = _mm_adds_epi16(in31, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 6);
+ in1 = _mm_srai_epi16(in1, 6);
+ in2 = _mm_srai_epi16(in2, 6);
+ in3 = _mm_srai_epi16(in3, 6);
+ in4 = _mm_srai_epi16(in4, 6);
+ in5 = _mm_srai_epi16(in5, 6);
+ in6 = _mm_srai_epi16(in6, 6);
+ in7 = _mm_srai_epi16(in7, 6);
+ in8 = _mm_srai_epi16(in8, 6);
+ in9 = _mm_srai_epi16(in9, 6);
+ in10 = _mm_srai_epi16(in10, 6);
+ in11 = _mm_srai_epi16(in11, 6);
+ in12 = _mm_srai_epi16(in12, 6);
+ in13 = _mm_srai_epi16(in13, 6);
+ in14 = _mm_srai_epi16(in14, 6);
+ in15 = _mm_srai_epi16(in15, 6);
+ in16 = _mm_srai_epi16(in16, 6);
+ in17 = _mm_srai_epi16(in17, 6);
+ in18 = _mm_srai_epi16(in18, 6);
+ in19 = _mm_srai_epi16(in19, 6);
+ in20 = _mm_srai_epi16(in20, 6);
+ in21 = _mm_srai_epi16(in21, 6);
+ in22 = _mm_srai_epi16(in22, 6);
+ in23 = _mm_srai_epi16(in23, 6);
+ in24 = _mm_srai_epi16(in24, 6);
+ in25 = _mm_srai_epi16(in25, 6);
+ in26 = _mm_srai_epi16(in26, 6);
+ in27 = _mm_srai_epi16(in27, 6);
+ in28 = _mm_srai_epi16(in28, 6);
+ in29 = _mm_srai_epi16(in29, 6);
+ in30 = _mm_srai_epi16(in30, 6);
+ in31 = _mm_srai_epi16(in31, 6);
+
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
+ RECON_AND_STORE(dest, in8);
+ RECON_AND_STORE(dest, in9);
+ RECON_AND_STORE(dest, in10);
+ RECON_AND_STORE(dest, in11);
+ RECON_AND_STORE(dest, in12);
+ RECON_AND_STORE(dest, in13);
+ RECON_AND_STORE(dest, in14);
+ RECON_AND_STORE(dest, in15);
+ RECON_AND_STORE(dest, in16);
+ RECON_AND_STORE(dest, in17);
+ RECON_AND_STORE(dest, in18);
+ RECON_AND_STORE(dest, in19);
+ RECON_AND_STORE(dest, in20);
+ RECON_AND_STORE(dest, in21);
+ RECON_AND_STORE(dest, in22);
+ RECON_AND_STORE(dest, in23);
+ RECON_AND_STORE(dest, in24);
+ RECON_AND_STORE(dest, in25);
+ RECON_AND_STORE(dest, in26);
+ RECON_AND_STORE(dest, in27);
+ RECON_AND_STORE(dest, in28);
+ RECON_AND_STORE(dest, in29);
+ RECON_AND_STORE(dest, in30);
+ RECON_AND_STORE(dest, in31);
+
+ dest += 8 - (stride * 32);
+ }
+ }
+}
+
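The epilogue above (final_rounding, arithmetic shift by 6, RECON_AND_STORE) is the vector form of the usual scalar reconstruction step for the 32x32 inverse transform: round, shift the residual down by 6 bits, add it to the prediction and clamp to 8 bits. A scalar restatement, illustrative only (clip_pixel stands for the usual 0..255 clamp):

  static void recon_row_scalar(uint8_t *dest, const int16_t *residual, int n) {
    int x;
    for (x = 0; x < n; ++x) {
      const int v = (residual[x] + 32) >> 6;   /* add 1 << 5, shift by 6 */
      dest[x] = clip_pixel(dest[x] + v);       /* add to prediction, saturate */
    }
  }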
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
@@ -3009,336 +3701,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
in28, in29, in30, in31);
}
- // Stage1
- {
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
-
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
- const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7);
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
-
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
-
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
-
- MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
- stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
- stp1_17, stp1_30)
- MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
- stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
- stp1_19, stp1_28)
- MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
- stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
- stp1_21, stp1_26)
- MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
- stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
- stp1_23, stp1_24)
- }
-
- // Stage2
- {
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
-
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
-
- MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
- stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
- stp2_14)
- MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
- stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
- stp2_11, stp2_12)
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
-
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
-
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
-
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
- }
-
- // Stage3
- {
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
-
- const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
- const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
-
- MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
- stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
- stp1_6)
-
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
-
- MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
- stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
- stp1_18, stp1_29)
- MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
- stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
- stp1_22, stp1_25)
-
- stp1_16 = stp2_16;
- stp1_31 = stp2_31;
- stp1_19 = stp2_19;
- stp1_20 = stp2_20;
- stp1_23 = stp2_23;
- stp1_24 = stp2_24;
- stp1_27 = stp2_27;
- stp1_28 = stp2_28;
- }
-
- // Stage4
- {
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);
-
- const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
- const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
-
- MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
- stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,
- stp2_2, stp2_3)
-
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
- MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
- stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
- stp2_10, stp2_13)
-
- stp2_8 = stp1_8;
- stp2_15 = stp1_15;
- stp2_11 = stp1_11;
- stp2_12 = stp1_12;
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
-
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
- }
-
- // Stage5
- {
- const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
- const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
- const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
- const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
-
- const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
- const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
-
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp0, tmp1);
- stp1_6 = _mm_packs_epi32(tmp2, tmp3);
-
- stp1_4 = stp2_4;
- stp1_7 = stp2_7;
-
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
-
- stp1_16 = stp2_16;
- stp1_17 = stp2_17;
-
- MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
- stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
- stp1_19, stp1_28)
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
- stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
- stp1_21, stp1_26)
-
- stp1_22 = stp2_22;
- stp1_23 = stp2_23;
- stp1_24 = stp2_24;
- stp1_25 = stp2_25;
- stp1_30 = stp2_30;
- stp1_31 = stp2_31;
- }
-
- // Stage6
- {
- const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
- const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
- const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
- const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
-
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
-
- stp2_8 = stp1_8;
- stp2_9 = stp1_9;
- stp2_14 = stp1_14;
- stp2_15 = stp1_15;
-
- MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,
- stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,
- stp2_13, stp2_11, stp2_12)
-
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
-
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
- }
-
- // Stage7
- {
- const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
- const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
- const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
- const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
-
- const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
- const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
- const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
- const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
-
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
-
- stp1_16 = stp2_16;
- stp1_17 = stp2_17;
- stp1_18 = stp2_18;
- stp1_19 = stp2_19;
-
- MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
- stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
- stp1_21, stp1_26)
- MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
- stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
- stp1_23, stp1_24)
-
- stp1_28 = stp2_28;
- stp1_29 = stp2_29;
- stp1_30 = stp2_30;
- stp1_31 = stp2_31;
- }
+ IDCT32_1D
// final stage
if (i < 4) {
diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm
index 568e2080e..88df9b2d1 100644
--- a/vp9/common/x86/vp9_intrapred_ssse3.asm
+++ b/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -991,7 +991,7 @@ cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset
lea dst8q, [dst8q+strideq*4]
; output 2nd half of 3rd 8 lines and half of 4th 8 lines
- mova m0, [sh_b23456789abcdefff]
+ mova m0, [GLOBAL(sh_b23456789abcdefff)]
mova [dstq +16], m7
mova [dst8q ], m7
pshufb m7, m0
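The new AVX2 translation unit added below vectorizes the 16-pixel-wide horizontal loop filter. The mask it builds (see the comments around abs_p0q0 / abs_p1q1 in the new file) follows the standard VP9 filter-mask test; a scalar restatement of that test, as an illustrative sketch:

  #include <stdlib.h>  /* abs() */

  /* Nonzero when the edge at p0|q0 may be filtered for this column. */
  static int filter_mask_scalar(int limit, int blimit,
                                int p3, int p2, int p1, int p0,
                                int q0, int q1, int q2, int q3) {
    int ok = 1;
    ok &= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
    ok &= abs(p1 - p0) <= limit && abs(q1 - q0) <= limit;
    ok &= abs(p2 - p1) <= limit && abs(q2 - q1) <= limit;
    ok &= abs(p3 - p2) <= limit && abs(q3 - q2) <= limit;
    return ok;
  }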
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
new file mode 100644
index 000000000..3c5cb8ffd
--- /dev/null
+++ b/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+ _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+ _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+ _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+ _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+ _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(
+ _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+ _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+ _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5),
+ (__m64 *) (s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6),
+ (__m64 *) (s + 6 * p)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+ _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+ _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7),
+ (__m64 *) (s + 7 * p)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+ _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+ _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
+ _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
+ _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p = _mm_add_epi16(eight,
+ _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(four,
+ _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
+ 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+ }
+}
+
+static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+
+ {
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+ _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+ _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+ _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+ _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+ flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
+ flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
+ q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
+ q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
+ q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+ q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+ p256_0, q256_0;
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+ res_q;
+
+ p256_7 = _mm256_cvtepu8_epi16(p7);
+ p256_6 = _mm256_cvtepu8_epi16(p6);
+ p256_5 = _mm256_cvtepu8_epi16(p5);
+ p256_4 = _mm256_cvtepu8_epi16(p4);
+ p256_3 = _mm256_cvtepu8_epi16(p3);
+ p256_2 = _mm256_cvtepu8_epi16(p2);
+ p256_1 = _mm256_cvtepu8_epi16(p1);
+ p256_0 = _mm256_cvtepu8_epi16(p0);
+ q256_0 = _mm256_cvtepu8_epi16(q0);
+ q256_1 = _mm256_cvtepu8_epi16(q1);
+ q256_2 = _mm256_cvtepu8_epi16(q2);
+ q256_3 = _mm256_cvtepu8_epi16(q3);
+ q256_4 = _mm256_cvtepu8_epi16(q4);
+ q256_5 = _mm256_cvtepu8_epi16(q5);
+ q256_6 = _mm256_cvtepu8_epi16(q6);
+ q256_7 = _mm256_cvtepu8_epi16(q7);
+
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+ _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+ _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(eight,
+ _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(four,
+ _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(p256_7, p256_0)), 4);
+
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(q256_7, q256_0)), 4);
+
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)), 3);
+
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)), 3);
+
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+ }
+}
+
+void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int count) {
+ if (count == 1)
+ mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+ else
+ mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+}
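The two kernels above implement the 16-sample-wide VP9 loopfilter: `flat` selects the 7-tap filter over p3..q3 and `flat2` selects the 15-tap wide filter over p7..q7, with `four` and `eight` supplying the rounding terms for the 3- and 4-bit shifts. A scalar sketch of the p0 outputs that the pixelFilter_*/pixetFilter_* accumulators build (round_shift is a helper introduced here for illustration, not part of the patch):

    /* Scalar view of the sums the SIMD accumulators above compute for p0. */
    static int round_shift(int sum, int bits) {
      return (sum + (1 << (bits - 1))) >> bits;   /* add half, then shift */
    }

    /* 7-tap "flat" result for p0 (selected where flat is set):
     *   op0 = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3              */
    static int flat_filter_p0(int p3, int p2, int p1, int p0,
                              int q0, int q1, int q2) {
      return round_shift(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
    }

    /* 15-tap "wide flat" result for p0 (selected where flat2 is set):
     *   op0 = (p7 + ... + p1 + 2*p0 + q0 + ... + q6 + 8) >> 4            */
    static int wide_filter_p0(const int p[8], const int q[8]) {
      const int sum = p[7] + p[6] + p[5] + p[4] + p[3] + p[2] + p[1] +
                      2 * p[0] + q[0] + q[1] + q[2] + q[3] + q[4] + q[5] + q[6];
      return round_shift(sum, 4);
    }

The remaining flat2_q1p1..flat2_q6p6 rows follow the same pattern, each step swapping one more far-side sample out of the running sum (the pixelFilter_p/q subtractions) and adding another copy of p7/q7 (the sum_p7/sum_q7 updates).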
diff --git a/vp9/decoder/vp9_dboolhuff.h b/vp9/decoder/vp9_dboolhuff.h
index c86451649..fd8e74ca4 100644
--- a/vp9/decoder/vp9_dboolhuff.h
+++ b/vp9/decoder/vp9_dboolhuff.h
@@ -44,7 +44,7 @@ static int vp9_read(vp9_reader *br, int probability) {
VP9_BD_VALUE bigsplit;
int count;
unsigned int range;
- unsigned int split = 1 + (((br->range - 1) * probability) >> 8);
+ unsigned int split = ((br->range * probability) + (256 - probability)) >> 8;
if (br->count < 0)
vp9_reader_fill(br);
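The rewritten `split` in vp9_read() is arithmetically identical to the expression it replaces: ((range * p) + (256 - p)) equals ((range - 1) * p) + 256, and adding exactly 256 before the >> 8 is the same as adding 1 after it. A small self-contained check of that equivalence over the decoder's working ranges (illustrative only, assuming range in 128..255 and probability in 1..255):

    #include <assert.h>

    /* Verify the old and new split computations agree for all assumed inputs. */
    static void check_split_equivalence(void) {
      unsigned int range, prob;
      for (range = 128; range <= 255; ++range) {
        for (prob = 1; prob <= 255; ++prob) {
          const unsigned int old_split = 1 + (((range - 1) * prob) >> 8);
          const unsigned int new_split = ((range * prob) + (256 - prob)) >> 8;
          assert(old_split == new_split);
        }
      }
    }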
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index 33793eee0..9792d2c6d 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -72,7 +72,7 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
}
if (!cm->frame_parallel_decoding_mode)
- update_tx_counts(bsize, context, tx_size, &cm->counts.tx);
+ ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];
return tx_size;
}
@@ -91,8 +91,8 @@ static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
int mi_row, int mi_col, int segment_id) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y;
@@ -149,16 +149,17 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return segment_id;
}
-static uint8_t read_skip_coeff(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- int segment_id, vp9_reader *r) {
- int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
- if (!skip_coeff) {
+static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ int segment_id, vp9_reader *r) {
+ if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
const int ctx = vp9_get_pred_context_mbskip(xd);
- skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd));
+ const int skip = vp9_read(r, cm->fc.mbskip_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.mbskip[ctx][skip_coeff];
+ ++cm->counts.mbskip[ctx][skip];
+ return skip;
}
- return skip_coeff;
}
static void read_intra_frame_mode_info(VP9_COMMON *const cm,
@@ -311,7 +312,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
}
-static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
+static INLINE INTERPOLATION_TYPE read_switchable_filter_type(
VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
const int type = treed_read(r, vp9_switchable_interp_tree,
@@ -414,6 +415,7 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static void read_inter_block_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
+ const TileInfo *const tile,
MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -430,7 +432,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
ref0 = mbmi->ref_frame[0];
is_compound = has_second_ref(mbmi);
- vp9_find_mv_refs(cm, xd, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0],
+ vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0],
mi_row, mi_col);
inter_mode_ctx = mbmi->mode_context[ref0];
@@ -456,7 +458,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
if (is_compound) {
const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
- vp9_find_mv_refs(cm, xd, mi, xd->last_mi,
+ vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi,
ref1, mbmi->ref_mvs[ref1], mi_row, mi_col);
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
@@ -482,12 +484,12 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
b_mode = read_inter_mode(cm, r, inter_mode_ctx);
if (b_mode == NEARESTMV || b_mode == NEARMV) {
- vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest[0],
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[0],
&nearmv[0], j, 0,
mi_row, mi_col);
if (is_compound)
- vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest[1],
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[1],
&nearmv[1], j, 1,
mi_row, mi_col);
}
@@ -523,6 +525,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
static void read_inter_frame_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
+ const TileInfo *const tile,
MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
MB_MODE_INFO *const mbmi = &mi->mbmi;
@@ -537,17 +540,18 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
!mbmi->skip_coeff || !inter_block, r);
if (inter_block)
- read_inter_block_mode_info(cm, xd, mi, mi_row, mi_col, r);
+ read_inter_block_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
else
read_intra_block_mode_info(cm, mi, r);
}
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r) {
MODE_INFO *const mi = xd->mi_8x8[0];
const BLOCK_SIZE bsize = mi->mbmi.sb_type;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int x_mis = MIN(bw, cm->mi_cols - mi_col);
int x, y, z;
@@ -555,7 +559,7 @@ void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
if (frame_is_intra_only(cm))
read_intra_frame_mode_info(cm, xd, mi, mi_row, mi_col, r);
else
- read_inter_frame_mode_info(cm, xd, mi, mi_row, mi_col, r);
+ read_inter_frame_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) {
for (x = !y; x < x_mis; x++) {
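set_segment_id() and vp9_read_mode_info() now take the block's width and height in mi (8x8) units from num_8x8_blocks_wide_lookup/num_8x8_blocks_high_lookup instead of shifting by mi_width_log2()/mi_height_log2(); for the change to be behavior-neutral the two forms must agree for every block size. A throwaway check along these lines would confirm it (BLOCK_SIZES is assumed to be the enum's entry count):

    #include <assert.h>

    /* Illustrative check: new lookup tables vs. the old shift expressions. */
    static void check_mi_dims(void) {
      int bs;
      for (bs = 0; bs < BLOCK_SIZES; ++bs) {
        assert(num_8x8_blocks_wide_lookup[bs] == (1 << mi_width_log2(bs)));
        assert(num_8x8_blocks_high_lookup[bs] == (1 << mi_height_log2(bs)));
      }
    }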
diff --git a/vp9/decoder/vp9_decodemv.h b/vp9/decoder/vp9_decodemv.h
index cec99f253..8e9ae4a54 100644
--- a/vp9/decoder/vp9_decodemv.h
+++ b/vp9/decoder/vp9_decodemv.h
@@ -14,7 +14,10 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_dboolhuff.h"
+struct TileInfo;
+
void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
+ const struct TileInfo *const tile,
int mi_row, int mi_col, vp9_reader *r);
#endif // VP9_DECODER_VP9_DECODEMV_H_
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 3ee8ba41d..4746a3abd 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -37,10 +37,44 @@
#include "vp9/decoder/vp9_thread.h"
#include "vp9/decoder/vp9_treereader.h"
+typedef struct TileWorkerData {
+ VP9_COMMON *cm;
+ vp9_reader bit_reader;
+ DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+ DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
+} TileWorkerData;
+
static int read_be32(const uint8_t *p) {
return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}
+static int is_compound_prediction_allowed(const VP9_COMMON *cm) {
+ int i;
+ for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
+ if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
+ return 1;
+
+ return 0;
+}
+
+static void setup_compound_prediction(VP9_COMMON *cm) {
+ if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+ cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
+ cm->comp_fixed_ref = ALTREF_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = GOLDEN_FRAME;
+ } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
+ cm->ref_frame_sign_bias[ALTREF_FRAME]) {
+ cm->comp_fixed_ref = GOLDEN_FRAME;
+ cm->comp_var_ref[0] = LAST_FRAME;
+ cm->comp_var_ref[1] = ALTREF_FRAME;
+ } else {
+ cm->comp_fixed_ref = LAST_FRAME;
+ cm->comp_var_ref[0] = GOLDEN_FRAME;
+ cm->comp_var_ref[1] = ALTREF_FRAME;
+ }
+}
+
// len == 0 is not allowed
static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
return start + len > start && start + len <= end;
@@ -76,7 +110,7 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
int i, j;
- for (j = 0; j < SWITCHABLE_FILTERS + 1; ++j)
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
vp9_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
}
@@ -98,8 +132,11 @@ static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
int i;
- cm->comp_pred_mode = cm->allow_comp_inter_inter ? read_comp_pred_mode(r)
- : SINGLE_PREDICTION_ONLY;
+ const int compound_allowed = is_compound_prediction_allowed(cm);
+ cm->comp_pred_mode = compound_allowed ? read_comp_pred_mode(r)
+ : SINGLE_PREDICTION_ONLY;
+ if (compound_allowed)
+ setup_compound_prediction(cm);
if (cm->comp_pred_mode == HYBRID_PREDICTION)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
@@ -169,11 +206,49 @@ static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
xd->plane[i].dequant = cm->uv_dequant[q_index];
}
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+// Allocate storage for each tile column.
+// TODO(jzern): when max_threads <= 1 the same storage could be used for each
+// tile.
+static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) {
+ VP9_COMMON *const cm = &pbi->common;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ int i, tile_col;
+
+ CHECK_MEM_ERROR(cm, pbi->mi_streams,
+ vpx_realloc(pbi->mi_streams, tile_cols *
+ sizeof(*pbi->mi_streams)));
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, 0, tile_col);
+ pbi->mi_streams[tile_col] =
+ &cm->mi[cm->mi_rows * tile.mi_col_start];
+ }
+
+ // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
+ // block where mi unit size is 8x8.
+ CHECK_MEM_ERROR(cm, pbi->above_context[0],
+ vpx_realloc(pbi->above_context[0],
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols));
+ for (i = 1; i < MAX_MB_PLANE; ++i) {
+ pbi->above_context[i] = pbi->above_context[0] +
+ i * sizeof(*pbi->above_context[0]) *
+ 2 * aligned_mi_cols;
+ }
+
+ // This is sized based on the entire frame. Each tile operates within its
+ // column bounds.
+ CHECK_MEM_ERROR(cm, pbi->above_seg_context,
+ vpx_realloc(pbi->above_seg_context,
+ sizeof(*pbi->above_seg_context) *
+ aligned_mi_cols));
+}
+
+static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+ int16_t* const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const int stride = pd->dst.stride;
const int eob = pd->eobs[block];
if (eob > 0) {
@@ -186,40 +261,53 @@ static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
case TX_4X4:
tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
if (tx_type == DCT_DCT)
- xd->itxm_add(qcoeff, dst, stride, eob);
+ xd->itxm_add(dqcoeff, dst, stride, eob);
else
- vp9_iht4x4_add(tx_type, qcoeff, dst, stride, eob);
+ vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type);
break;
case TX_8X8:
tx_type = get_tx_type_8x8(pd->plane_type, xd);
- vp9_iht8x8_add(tx_type, qcoeff, dst, stride, eob);
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_16X16:
tx_type = get_tx_type_16x16(pd->plane_type, xd);
- vp9_iht16x16_add(tx_type, qcoeff, dst, stride, eob);
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_32X32:
tx_type = DCT_DCT;
- vp9_idct32x32_add(qcoeff, dst, stride, eob);
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
break;
default:
assert(!"Invalid transform size");
}
if (eob == 1) {
- *((int32_t *)qcoeff) = 0;
+ vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
} else {
if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
- vpx_memset(qcoeff, 0, 4 * (4 << tx_size) * sizeof(qcoeff[0]));
+ vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+ else if (tx_size == TX_32X32 && eob <= 34)
+ vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
else
- vpx_memset(qcoeff, 0, (16 << (tx_size << 1)) * sizeof(qcoeff[0]));
+ vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
}
}
}
-static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
- MACROBLOCKD* const xd = arg;
+struct intra_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+ unsigned char* token_cache;
+};
+
+static void predict_and_reconstruct_intra_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct intra_args *const args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
+
struct macroblockd_plane *const pd = &xd->plane[plane];
MODE_INFO *const mi = xd->mi_8x8[0];
const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
@@ -238,31 +326,37 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
b_width_log2(plane_bsize), tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
- if (!mi->mbmi.skip_coeff)
- decode_block(plane, block, plane_bsize, tx_size, arg);
+ if (!mi->mbmi.skip_coeff) {
+ vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
+ args->r, args->token_cache);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+ }
}
-static int decode_tokens(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- BLOCK_SIZE bsize, vp9_reader *r) {
- MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-
- if (mbmi->skip_coeff) {
- reset_skip_context(xd, bsize);
- return -1;
- } else {
- if (cm->seg.enabled)
- setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
- cm->base_qindex));
-
- // TODO(dkovalev) if (!vp9_reader_has_error(r))
- return vp9_decode_tokens(cm, xd, &cm->seg, r, bsize);
- }
+struct inter_args {
+ VP9_COMMON *cm;
+ MACROBLOCKD *xd;
+ vp9_reader *r;
+ int *eobtotal;
+ unsigned char* token_cache;
+};
+
+static void reconstruct_inter_block(int plane, int block,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct inter_args *args = arg;
+ VP9_COMMON *const cm = args->cm;
+ MACROBLOCKD *const xd = args->xd;
+
+ *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
+ plane_bsize, tx_size,
+ args->r, args->token_cache);
+ inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
}
-static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize,
- int mi_row, int mi_col) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int bh = num_8x8_blocks_high_lookup[bsize];
const int bw = num_8x8_blocks_wide_lookup[bsize];
const int offset = mi_row * cm->mode_info_stride + mi_col;
@@ -281,143 +375,163 @@ static void set_offsets(VP9D_COMP *pbi, BLOCK_SIZE bsize,
// cannot be used.
xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
- set_skip_context(cm, xd, mi_row, mi_col);
+ set_skip_context(xd, xd->above_context, xd->left_context, mi_row, mi_col);
// Distance of Mb to the various image edges. These are specified to 8th pel
// as they are always compared to values that are in 1/8th pel units
- set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
- setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], mi_row, mi_col);
+ setup_dst_planes(xd, get_frame_new_buffer(cm), mi_row, mi_col);
}
static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int idx, int mi_row, int mi_col) {
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
const int ref = mbmi->ref_frame[idx] - LAST_FRAME;
- const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[ref]];
- const struct scale_factors *sf = &cm->active_ref_scale[ref];
- if (!vp9_is_valid_scale(sf))
+ const YV12_BUFFER_CONFIG *cfg = get_frame_ref_buffer(cm, ref);
+ const struct scale_factors_common *sfc = &cm->active_ref_scale_comm[ref];
+ if (!vp9_is_valid_scale(sfc))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid scale factors");
- xd->scale_factor[idx] = *sf;
- setup_pre_planes(xd, idx, cfg, mi_row, mi_col, sf);
+ xd->scale_factor[idx].sfc = sfc;
+ setup_pre_planes(xd, idx, cfg, mi_row, mi_col, &xd->scale_factor[idx]);
xd->corrupted |= cfg->corrupted;
}
-static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader *r, BLOCK_SIZE bsize, int index) {
- VP9_COMMON *const cm = &pbi->common;
- MACROBLOCKD *const xd = &pbi->mb;
+static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ int mi_row, int mi_col,
+ vp9_reader *r, BLOCK_SIZE bsize,
+ unsigned char *token_cache) {
const int less8x8 = bsize < BLOCK_8X8;
MB_MODE_INFO *mbmi;
- int eobtotal;
-
- if (less8x8)
- if (index > 0)
- return;
- set_offsets(pbi, bsize, mi_row, mi_col);
- vp9_read_mode_info(cm, xd, mi_row, mi_col, r);
+ set_offsets(cm, xd, tile, bsize, mi_row, mi_col);
+ vp9_read_mode_info(cm, xd, tile, mi_row, mi_col, r);
if (less8x8)
bsize = BLOCK_8X8;
// Has to be called after set_offsets
mbmi = &xd->mi_8x8[0]->mbmi;
- eobtotal = decode_tokens(cm, xd, bsize, r);
- if (!is_inter_block(mbmi)) {
- // Intra reconstruction
- foreach_transformed_block(xd, bsize, decode_block_intra, xd);
+ if (mbmi->skip_coeff) {
+ reset_skip_context(xd, bsize);
} else {
- // Inter reconstruction
- const int decode_blocks = (eobtotal > 0);
-
- if (!less8x8) {
- assert(mbmi->sb_type == bsize);
- if (eobtotal == 0)
- mbmi->skip_coeff = 1; // skip loopfilter
- }
+ if (cm->seg.enabled)
+ setup_plane_dequants(cm, xd, vp9_get_qindex(&cm->seg, mbmi->segment_id,
+ cm->base_qindex));
+ }
+ if (!is_inter_block(mbmi)) {
+ struct intra_args arg = { cm, xd, r, token_cache };
+ foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
+ &arg);
+ } else {
+ // Setup
set_ref(cm, xd, 0, mi_row, mi_col);
if (has_second_ref(mbmi))
set_ref(cm, xd, 1, mi_row, mi_col);
xd->subpix.filter_x = xd->subpix.filter_y =
vp9_get_filter_kernel(mbmi->interp_filter);
+
+ // Prediction
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
- if (decode_blocks)
- foreach_transformed_block(xd, bsize, decode_block, xd);
+ // Reconstruction
+ if (!mbmi->skip_coeff) {
+ int eobtotal = 0;
+ struct inter_args arg = { cm, xd, r, &eobtotal, token_cache };
+ foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
+ if (!less8x8 && eobtotal == 0)
+ mbmi->skip_coeff = 1; // skip loopfilter
+ }
}
+
xd->corrupted |= vp9_reader_has_error(r);
}
-static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col,
- vp9_reader* r, BLOCK_SIZE bsize, int index) {
- VP9_COMMON *const cm = &pbi->common;
+static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ vp9_reader *r) {
+ const int ctx = partition_plane_context(xd->above_seg_context,
+ xd->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ PARTITION_TYPE p;
+
+ if (has_rows && has_cols)
+ p = treed_read(r, vp9_partition_tree, probs);
+ else if (!has_rows && has_cols)
+ p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+ else if (has_rows && !has_cols)
+ p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+ else
+ p = PARTITION_SPLIT;
+
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.partition[ctx][p];
+
+ return p;
+}
+
+static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ const TileInfo *const tile,
+ int mi_row, int mi_col,
+ vp9_reader* r, BLOCK_SIZE bsize,
+ unsigned char *token_cache) {
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
- PARTITION_TYPE partition = PARTITION_NONE;
+ PARTITION_TYPE partition;
BLOCK_SIZE subsize;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- if (bsize < BLOCK_8X8) {
- if (index > 0)
- return;
- } else {
- int pl;
- const int idx = check_bsize_coverage(hbs, cm->mi_rows, cm->mi_cols,
- mi_row, mi_col);
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
-
- if (idx == 0)
- partition = treed_read(r, vp9_partition_tree,
- cm->fc.partition_prob[cm->frame_type][pl]);
- else if (idx > 0 &&
- !vp9_read(r, cm->fc.partition_prob[cm->frame_type][pl][idx]))
- partition = (idx == 1) ? PARTITION_HORZ : PARTITION_VERT;
- else
- partition = PARTITION_SPLIT;
-
- if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.partition[pl][partition];
- }
-
+ partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
-
- switch (partition) {
- case PARTITION_NONE:
- decode_modes_b(pbi, mi_row, mi_col, r, subsize, 0);
- break;
- case PARTITION_HORZ:
- decode_modes_b(pbi, mi_row, mi_col, r, subsize, 0);
- if (mi_row + hbs < cm->mi_rows)
- decode_modes_b(pbi, mi_row + hbs, mi_col, r, subsize, 1);
- break;
- case PARTITION_VERT:
- decode_modes_b(pbi, mi_row, mi_col, r, subsize, 0);
- if (mi_col + hbs < cm->mi_cols)
- decode_modes_b(pbi, mi_row, mi_col + hbs, r, subsize, 1);
- break;
- case PARTITION_SPLIT: {
- int n;
- for (n = 0; n < 4; n++) {
- const int j = n >> 1, i = n & 1;
- decode_modes_sb(pbi, mi_row + j * hbs, mi_col + i * hbs,
- r, subsize, n);
- }
- } break;
- default:
- assert(!"Invalid partition type");
+ if (subsize < BLOCK_8X8) {
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ break;
+ case PARTITION_HORZ:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ if (mi_row + hbs < cm->mi_rows)
+ decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
+ token_cache);
+ break;
+ case PARTITION_VERT:
+ decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
+ if (mi_col + hbs < cm->mi_cols)
+ decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
+ token_cache);
+ break;
+ case PARTITION_SPLIT:
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
+ token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
+ token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
+ token_cache);
+ decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
+ token_cache);
+ break;
+ default:
+ assert(!"Invalid partition type");
+ }
}
// update partition context
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
- update_partition_context(cm, mi_row, mi_col, subsize, bsize);
+ update_partition_context(xd->above_seg_context, xd->left_seg_context,
+ mi_row, mi_col, subsize, bsize);
}
static void setup_token_decoder(const uint8_t *data,
@@ -453,16 +567,10 @@ static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
vp9_reader *r) {
- read_coef_probs_common(fc->coef_probs[TX_4X4], r);
-
- if (tx_mode > ONLY_4X4)
- read_coef_probs_common(fc->coef_probs[TX_8X8], r);
-
- if (tx_mode > ALLOW_8X8)
- read_coef_probs_common(fc->coef_probs[TX_16X16], r);
-
- if (tx_mode > ALLOW_16X16)
- read_coef_probs_common(fc->coef_probs[TX_32X32], r);
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ read_coef_probs_common(fc->coef_probs[tx_size], r);
}
static void setup_segmentation(struct segmentation *seg,
@@ -549,9 +657,8 @@ static int read_delta_q(struct vp9_read_bit_buffer *rb, int *delta_q) {
return old != *delta_q;
}
-static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
- MACROBLOCKD *const xd = &pbi->mb;
- VP9_COMMON *const cm = &pbi->common;
+static void setup_quantization(VP9_COMMON *const cm, MACROBLOCKD *const xd,
+ struct vp9_read_bit_buffer *rb) {
int update = 0;
cm->base_qindex = vp9_rb_read_literal(rb, QINDEX_BITS);
@@ -569,12 +676,12 @@ static void setup_quantization(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) {
xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
}
-static INTERPOLATIONFILTERTYPE read_interp_filter_type(
- struct vp9_read_bit_buffer *rb) {
- const INTERPOLATIONFILTERTYPE literal_to_type[] = { EIGHTTAP_SMOOTH,
- EIGHTTAP,
- EIGHTTAP_SHARP,
- BILINEAR };
+static INTERPOLATION_TYPE read_interp_filter_type(
+ struct vp9_read_bit_buffer *rb) {
+ const INTERPOLATION_TYPE literal_to_type[] = { EIGHTTAP_SMOOTH,
+ EIGHTTAP,
+ EIGHTTAP_SHARP,
+ BILINEAR };
return vp9_rb_read_bit(rb) ? SWITCHABLE
: literal_to_type[vp9_rb_read_literal(rb, 2)];
}
@@ -620,7 +727,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
vp9_update_frame_size(cm);
}
- vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx], cm->width, cm->height,
+ vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9BORDERINPIXELS);
}
@@ -641,7 +748,7 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
int found = 0, i;
for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
if (vp9_rb_read_bit(rb)) {
- YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->active_ref_idx[i]];
+ YV12_BUFFER_CONFIG *const cfg = get_frame_ref_buffer(cm, i);
width = cfg->y_crop_width;
height = cfg->y_crop_height;
found = 1;
@@ -660,18 +767,28 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
setup_display_size(cm, rb);
}
-static void decode_tile(VP9D_COMP *pbi, vp9_reader *r, int tile_col) {
+static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd,
+ int tile_col) {
+ int i;
+ xd->mi_stream = pbi->mi_streams[tile_col];
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ xd->above_context[i] = pbi->above_context[i];
+ }
+ // see note in alloc_tile_storage().
+ xd->above_seg_context = pbi->above_seg_context;
+}
+
+static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile,
+ vp9_reader *r) {
const int num_threads = pbi->oxcf.max_threads;
VP9_COMMON *const cm = &pbi->common;
int mi_row, mi_col;
- YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[cm->new_fb_idx];
MACROBLOCKD *xd = &pbi->mb;
- xd->mi_stream = pbi->mi_streams[tile_col];
-
if (pbi->do_loopfilter_inline) {
LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
- lf_data->frame_buffer = fb;
+ lf_data->frame_buffer = get_frame_new_buffer(cm);
lf_data->cm = cm;
lf_data->xd = pbi->mb;
lf_data->stop = 0;
@@ -679,14 +796,15 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r, int tile_col) {
vp9_loop_filter_frame_init(cm, cm->lf.filter_level);
}
- for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += MI_BLOCK_SIZE) {
// For a SB there are 2 left contexts, each pertaining to a MB row within
- vp9_zero(cm->left_context);
- vp9_zero(cm->left_seg_context);
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ vp9_zero(xd->left_context);
+ vp9_zero(xd->left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE)
- decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64, 0);
+ decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
+ pbi->token_cache);
if (pbi->do_loopfilter_inline) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -696,7 +814,7 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r, int tile_col) {
if (lf_start < 0) continue;
// decoding has completed: finish up the loop filter in this thread.
- if (mi_row + MI_BLOCK_SIZE >= cm->cur_tile_mi_row_end) continue;
+ if (mi_row + MI_BLOCK_SIZE >= tile->mi_row_end) continue;
vp9_worker_sync(&pbi->lf_worker);
lf_data->start = lf_start;
@@ -735,10 +853,32 @@ static void setup_tile_info(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
cm->log2_tile_rows += vp9_rb_read_bit(rb);
}
+// Reads the next tile returning its size and adjusting '*data' accordingly
+// based on 'is_last'.
+static size_t get_tile(const uint8_t *const data_end,
+ int is_last,
+ struct vpx_internal_error_info *error_info,
+ const uint8_t **data) {
+ size_t size;
+
+ if (!is_last) {
+ if (!read_is_valid(*data, 4, data_end))
+ vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+ "Truncated packet or corrupt tile length");
+
+ size = read_be32(*data);
+ *data += 4;
+ } else {
+ size = data_end - *data;
+ }
+ return size;
+}
+
static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
vp9_reader residual_bc;
VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
const uint8_t *const data_end = pbi->source + pbi->source_sz;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
@@ -748,70 +888,57 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context[0], 0,
- sizeof(ENTROPY_CONTEXT) * MAX_MB_PLANE * (2 * aligned_mi_cols));
+ vpx_memset(pbi->above_context[0], 0,
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
+ vpx_memset(pbi->above_seg_context, 0,
+ sizeof(*pbi->above_seg_context) * aligned_mi_cols);
if (pbi->oxcf.inv_tile_order) {
const uint8_t *data_ptr2[4][1 << 6];
vp9_reader bc_bak = {0};
- // pre-initialize the offsets, we're going to read in inverse order
+ // pre-initialize the offsets, we're going to decode in inverse order
data_ptr2[0][0] = data;
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- if (tile_row) {
- const int size = read_be32(data_ptr2[tile_row - 1][tile_cols - 1]);
- data_ptr2[tile_row - 1][tile_cols - 1] += 4;
- data_ptr2[tile_row][0] = data_ptr2[tile_row - 1][tile_cols - 1] + size;
- }
-
- for (tile_col = 1; tile_col < tile_cols; tile_col++) {
- const int size = read_be32(data_ptr2[tile_row][tile_col - 1]);
- data_ptr2[tile_row][tile_col - 1] += 4;
- data_ptr2[tile_row][tile_col] =
- data_ptr2[tile_row][tile_col - 1] + size;
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int last_tile =
+ tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+ data_ptr2[tile_row][tile_col] = data;
+ data += size;
}
}
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) {
- vp9_get_tile_col_offsets(cm, tile_col);
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
setup_token_decoder(data_ptr2[tile_row][tile_col], data_end,
data_end - data_ptr2[tile_row][tile_col],
&cm->error, &residual_bc);
- decode_tile(pbi, &residual_bc, tile_col);
+ setup_tile_context(pbi, xd, tile_col);
+ decode_tile(pbi, &tile, &residual_bc);
if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1)
bc_bak = residual_bc;
}
}
residual_bc = bc_bak;
} else {
- int has_more;
-
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- size_t size;
+ const int last_tile =
+ tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+ TileInfo tile;
- vp9_get_tile_col_offsets(cm, tile_col);
-
- has_more = tile_col < tile_cols - 1 || tile_row < tile_rows - 1;
- if (has_more) {
- if (!read_is_valid(data, 4, data_end))
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Truncated packet or corrupt tile length");
-
- size = read_be32(data);
- data += 4;
- } else {
- size = data_end - data;
- }
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
setup_token_decoder(data, data_end, size, &cm->error, &residual_bc);
- decode_tile(pbi, &residual_bc, tile_col);
+ setup_tile_context(pbi, xd, tile_col);
+ decode_tile(pbi, &tile, &residual_bc);
data += size;
}
}
@@ -820,10 +947,113 @@ static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
return vp9_reader_find_end(&residual_bc);
}
+static int tile_worker_hook(void *arg1, void *arg2) {
+ TileWorkerData *tile_data = (TileWorkerData*)arg1;
+ const TileInfo *const tile = (TileInfo*)arg2;
+ int mi_row, mi_col;
+
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
+ mi_row += MI_BLOCK_SIZE) {
+ vp9_zero(tile_data->xd.left_context);
+ vp9_zero(tile_data->xd.left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
+ mi_col += MI_BLOCK_SIZE) {
+ decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
+ mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
+ tile_data->token_cache);
+ }
+ }
+ return !tile_data->xd.corrupted;
+}
+
+static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
+ VP9_COMMON *const cm = &pbi->common;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
+ const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int tile_cols = 1 << cm->log2_tile_cols;
+ const int tile_rows = 1 << cm->log2_tile_rows;
+ const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols);
+ int tile_col = 0;
+
+ assert(tile_rows == 1);
+ (void)tile_rows;
+
+ if (num_workers > pbi->num_tile_workers) {
+ int i;
+ CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ vpx_realloc(pbi->tile_workers,
+ num_workers * sizeof(*pbi->tile_workers)));
+ for (i = pbi->num_tile_workers; i < num_workers; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ ++pbi->num_tile_workers;
+
+ vp9_worker_init(worker);
+ worker->hook = (VP9WorkerHook)tile_worker_hook;
+ CHECK_MEM_ERROR(cm, worker->data1,
+ vpx_memalign(32, sizeof(TileWorkerData)));
+ CHECK_MEM_ERROR(cm, worker->data2, vpx_malloc(sizeof(TileInfo)));
+ if (i < num_workers - 1 && !vp9_worker_reset(worker)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Tile decoder thread creation failed");
+ }
+ }
+ }
+
+ // Note: this memset assumes above_context[0], [1] and [2]
+ // are allocated as part of the same buffer.
+ vpx_memset(pbi->above_context[0], 0,
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
+ 2 * aligned_mi_cols);
+ vpx_memset(pbi->above_seg_context, 0,
+ sizeof(*pbi->above_seg_context) * aligned_mi_cols);
+
+ while (tile_col < tile_cols) {
+ int i;
+ for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
+ TileInfo *const tile = (TileInfo*)worker->data2;
+ const size_t size =
+ get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data);
+
+ tile_data->cm = cm;
+ tile_data->xd = pbi->mb;
+ tile_data->xd.corrupted = 0;
+ vp9_tile_init(tile, tile_data->cm, 0, tile_col);
+
+ setup_token_decoder(data, data_end, size, &cm->error,
+ &tile_data->bit_reader);
+ setup_tile_context(pbi, &tile_data->xd, tile_col);
+
+ worker->had_error = 0;
+ if (i == num_workers - 1 || tile_col == tile_cols - 1) {
+ vp9_worker_execute(worker);
+ } else {
+ vp9_worker_launch(worker);
+ }
+
+ data += size;
+ ++tile_col;
+ }
+
+ for (; i > 0; --i) {
+ VP9Worker *const worker = &pbi->tile_workers[i - 1];
+ pbi->mb.corrupted |= !vp9_worker_sync(worker);
+ }
+ }
+
+ {
+ const int final_worker = (tile_cols + num_workers - 1) % num_workers;
+ TileWorkerData *const tile_data =
+ (TileWorkerData*)pbi->tile_workers[final_worker].data1;
+ return vp9_reader_find_end(&tile_data->bit_reader);
+ }
+}
+
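
tile_worker_hook() and decode_tiles_mt() above hand each worker a self-contained TileWorkerData so tiles can be decoded without sharing mutable state. A hedged reconstruction of that struct from its uses in this hunk (the real definition sits earlier in vp9_decodframe.c and may carry additional fields):

typedef struct TileWorkerData {
  VP9_COMMON *cm;                          // shared, read-mostly frame state
  vp9_reader bit_reader;                   // per-tile boolean decoder
  DECLARE_ALIGNED(16, MACROBLOCKD, xd);    // private block-decode context
  DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);  // per-tile cache
} TileWorkerData;

The hook returns nonzero on success (!xd.corrupted), which vp9_worker_sync() propagates into pbi->mb.corrupted. Note that num_workers is capped at MIN(max_threads & ~1, tile_cols); the & ~1 simply rounds an odd thread budget down to an even worker count.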
static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
- if (vp9_rb_read_literal(rb, 8) != SYNC_CODE_0 ||
- vp9_rb_read_literal(rb, 8) != SYNC_CODE_1 ||
- vp9_rb_read_literal(rb, 8) != SYNC_CODE_2) {
+ if (vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_0 ||
+ vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_1 ||
+ vp9_rb_read_literal(rb, 8) != VP9_SYNC_CODE_2) {
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid frame sync code");
}
@@ -834,34 +1064,6 @@ static void error_handler(void *data, size_t bit_offset) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet");
}
-static void setup_inter_inter(VP9_COMMON *cm) {
- int i;
-
- cm->allow_comp_inter_inter = 0;
- for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
- cm->allow_comp_inter_inter |=
- cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1];
-
- if (cm->allow_comp_inter_inter) {
- // which one is always-on in comp inter-inter?
- if (cm->ref_frame_sign_bias[LAST_FRAME] ==
- cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
- cm->comp_fixed_ref = ALTREF_FRAME;
- cm->comp_var_ref[0] = LAST_FRAME;
- cm->comp_var_ref[1] = GOLDEN_FRAME;
- } else if (cm->ref_frame_sign_bias[LAST_FRAME] ==
- cm->ref_frame_sign_bias[ALTREF_FRAME]) {
- cm->comp_fixed_ref = GOLDEN_FRAME;
- cm->comp_var_ref[0] = LAST_FRAME;
- cm->comp_var_ref[1] = ALTREF_FRAME;
- } else {
- cm->comp_fixed_ref = LAST_FRAME;
- cm->comp_var_ref[0] = GOLDEN_FRAME;
- cm->comp_var_ref[1] = ALTREF_FRAME;
- }
- }
-}
-
#define RESERVED \
if (vp9_rb_read_bit(rb)) \
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, \
@@ -875,7 +1077,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
cm->last_frame_type = cm->frame_type;
- if (vp9_rb_read_literal(rb, 2) != 0x2)
+ if (vp9_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid frame marker");
@@ -896,12 +1098,10 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
cm->error_resilient_mode = vp9_rb_read_bit(rb);
if (cm->frame_type == KEY_FRAME) {
- int csp;
-
check_sync_code(cm, rb);
- csp = vp9_rb_read_literal(rb, 3); // colorspace
- if (csp != 7) { // != sRGB
+ cm->color_space = vp9_rb_read_literal(rb, 3); // colorspace
+ if (cm->color_space != SRGB) {
vp9_rb_read_bit(rb); // [16,235] (including xvycc) vs [0,255] range
if (cm->version == 1) {
cm->subsampling_x = vp9_rb_read_bit(rb);
@@ -953,8 +1153,6 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
vp9_setup_scale_factors(cm, i);
-
- setup_inter_inter(cm);
}
}
@@ -974,13 +1172,17 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
vp9_setup_past_independence(cm);
setup_loopfilter(&cm->lf, rb);
- setup_quantization(pbi, rb);
+ setup_quantization(cm, &pbi->mb, rb);
setup_segmentation(&cm->seg, rb);
setup_tile_info(cm, rb);
sz = vp9_rb_read_literal(rb, 16);
- return sz > 0 ? sz : -1;
+ if (sz == 0)
+ vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+ "Invalid header size");
+
+ return sz;
}
static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
@@ -1023,7 +1225,7 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
for (j = 0; j < PARTITION_CONTEXTS; ++j)
for (i = 0; i < PARTITION_TYPES - 1; ++i)
- vp9_diff_update_prob(&r, &fc->partition_prob[INTER_FRAME][j][i]);
+ vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
read_mv_probs(&r, nmvc, cm->allow_high_precision_mv);
}
@@ -1087,69 +1289,65 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
MACROBLOCKD *const xd = &pbi->mb;
const uint8_t *data = pbi->source;
- const uint8_t *data_end = pbi->source + pbi->source_sz;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
- struct vp9_read_bit_buffer rb = { data, data_end, 0,
- cm, error_handler };
+ struct vp9_read_bit_buffer rb = { data, data_end, 0, cm, error_handler };
const size_t first_partition_size = read_uncompressed_header(pbi, &rb);
const int keyframe = cm->frame_type == KEY_FRAME;
- YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx];
+ const int tile_rows = 1 << cm->log2_tile_rows;
const int tile_cols = 1 << cm->log2_tile_cols;
- int tile_col;
+ YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
if (!first_partition_size) {
- if (!keyframe) {
// showing a frame directly
*p_data_end = data + 1;
return 0;
- } else {
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Invalid key frame");
- return -1;
- }
}
- data += vp9_rb_bytes_read(&rb);
- xd->corrupted = 0;
- new_fb->corrupted = 0;
- pbi->do_loopfilter_inline =
- (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
if (!pbi->decoded_key_frame && !keyframe)
return -1;
+ data += vp9_rb_bytes_read(&rb);
if (!read_is_valid(data, first_partition_size, data_end))
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Truncated packet or corrupt header length");
- setup_plane_dequants(cm, &pbi->mb, cm->base_qindex);
+ pbi->do_loopfilter_inline =
+ (cm->log2_tile_rows | cm->log2_tile_cols) == 0 && cm->lf.filter_level;
+ if (pbi->do_loopfilter_inline && pbi->lf_worker.data1 == NULL) {
+ CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_malloc(sizeof(LFWorkerData)));
+ pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
+ if (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Loop filter thread creation failed");
+ }
+ }
+
+ alloc_tile_storage(pbi, tile_cols);
xd->mi_8x8 = cm->mi_grid_visible;
xd->mode_info_stride = cm->mode_info_stride;
+ set_prev_mi(cm);
- CHECK_MEM_ERROR(cm, pbi->mi_streams,
- vpx_realloc(pbi->mi_streams, tile_cols *
- sizeof(*pbi->mi_streams)));
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- vp9_get_tile_col_offsets(cm, tile_col);
- pbi->mi_streams[tile_col] =
- &cm->mi[cm->mi_rows * cm->cur_tile_mi_col_start];
- }
+ setup_plane_dequants(cm, xd, cm->base_qindex);
+ setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y);
cm->fc = cm->frame_contexts[cm->frame_context_idx];
-
vp9_zero(cm->counts);
-
- new_fb->corrupted |= read_compressed_header(pbi, data, first_partition_size);
-
- setup_block_dptrs(xd, cm->subsampling_x, cm->subsampling_y);
-
- // clear out the coeff buffer
for (i = 0; i < MAX_MB_PLANE; ++i)
- vp9_zero(xd->plane[i].qcoeff);
+ vp9_zero(xd->plane[i].dqcoeff);
- set_prev_mi(cm);
+ xd->corrupted = 0;
+ new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
- *p_data_end = decode_tiles(pbi, data + first_partition_size);
+ // TODO(jzern): remove frame_parallel_decoding_mode restriction for
+ // single-frame tile decoding.
+ if (pbi->oxcf.max_threads > 1 && tile_rows == 1 && tile_cols > 1 &&
+ cm->frame_parallel_decoding_mode) {
+ *p_data_end = decode_tiles_mt(pbi, data + first_partition_size);
+ } else {
+ *p_data_end = decode_tiles(pbi, data + first_partition_size);
+ }
cm->last_width = cm->width;
cm->last_height = cm->height;
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 0d0f0dfe0..010b8fe33 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -70,28 +70,28 @@ static const vp9_prob cat6_prob[15] = {
DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \
token]; \
} \
- token_cache[scan[c]] = vp9_pt_energy_class[token]; \
} while (0)
#define WRITE_COEF_CONTINUE(val, token) \
{ \
- qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
+ dqcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
dq[c > 0] / (1 + (tx_size == TX_32X32)); \
INCREMENT_COUNT(token); \
+ token_cache[scan[c]] = vp9_pt_energy_class[token]; \
c++; \
continue; \
}
-#define ADJUST_COEF(prob, bits_count) \
- do { \
- if (vp9_read(r, prob)) \
- val += 1 << bits_count; \
+#define ADJUST_COEF(prob, bits_count) \
+ do { \
+ val += (vp9_read(r, prob) << bits_count); \
} while (0);
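
The ADJUST_COEF rewrite above exploits the fact that vp9_read() returns 0 or 1, so the conditional add collapses to a shift-and-add with no branch in the per-bit extra-bits loop. A minimal illustration (the helper name is hypothetical):

static INLINE int adjust_coef_sketch(vp9_reader *r, vp9_prob prob,
                                     int val, int bits_count) {
  /* adds either 0 or (1 << bits_count), exactly like the old
   * `if (vp9_read(r, prob)) val += 1 << bits_count;` */
  return val + (vp9_read(r, prob) << bits_count);
}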
static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_reader *r, int block_idx,
- PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,
- TX_SIZE tx_size, const int16_t *dq, int pt) {
+ PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
+ TX_SIZE tx_size, const int16_t *dq, int pt,
+ uint8_t *token_cache) {
const FRAME_CONTEXT *const fc = &cm->fc;
FRAME_COUNTS *const counts = &cm->counts;
const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
@@ -104,7 +104,6 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
vp9_coeff_count_model *coef_counts = counts->coef[tx_size];
const int16_t *scan, *nb;
const uint8_t *const band_translate = get_band_translate(tx_size);
- uint8_t token_cache[1024];
get_scan(xd, tx_size, type, block_idx, &scan, &nb);
while (1) {
@@ -131,6 +130,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
+ token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN];
++c;
goto SKIP_START;
}
@@ -210,45 +210,26 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
return c;
}
-struct decode_block_args {
- VP9_COMMON *cm;
- MACROBLOCKD *xd;
- struct segmentation *seg;
- vp9_reader *r;
- int *eobtotal;
-};
-
-static void decode_block(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *argv) {
- const struct decode_block_args* const arg = argv;
-
- // find the maximum eob for this transform size, adjusted by segment
- MACROBLOCKD *xd = arg->xd;
- const struct segmentation *seg = arg->seg;
- struct macroblockd_plane* pd = &xd->plane[plane];
- const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
- const int seg_eob = get_tx_eob(seg, segment_id, tx_size);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r,
+ uint8_t *token_cache) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
+ tx_size);
int aoff, loff, eob, pt;
-
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
pt = get_entropy_context(tx_size, pd->above_context + aoff,
pd->left_context + loff);
- eob = decode_coefs(arg->cm, xd, arg->r, block,
- pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block),
- tx_size, pd->dequant, pt);
+ eob = decode_coefs(cm, xd, r, block,
+ pd->plane_type, seg_eob, BLOCK_OFFSET(pd->dqcoeff, block),
+ tx_size, pd->dequant, pt, token_cache);
set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
pd->eobs[block] = eob;
- *arg->eobtotal += eob;
+ return eob;
}
-int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- struct segmentation *seg,
- vp9_reader *r, BLOCK_SIZE bsize) {
- int eobtotal = 0;
- struct decode_block_args args = {cm, xd, seg, r, &eobtotal};
- foreach_transformed_block(xd, bsize, decode_block, &args);
- return eobtotal;
-}
+
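
With this change decode_coefs() no longer allocates a 1024-byte token_cache on its own stack; the caller supplies the buffer (the decoder's token_cache, or a tile worker's private copy in the multi-threaded path), and the foreach_transformed_block() walker is replaced by the per-block entry point vp9_decode_block_tokens(). A hedged sketch of a caller, assuming the per-block callback now lives in vp9_decodframe.c:

static void decode_block_sketch(VP9_COMMON *cm, MACROBLOCKD *xd, vp9_reader *r,
                                uint8_t *token_cache, int *eobtotal,
                                int plane, int block, BLOCK_SIZE plane_bsize,
                                TX_SIZE tx_size) {
  *eobtotal += vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize,
                                       tx_size, r, token_cache);
}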
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index 0fb4c3cc9..04939ead3 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -15,8 +15,9 @@
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-int vp9_decode_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
- struct segmentation *seg,
- vp9_reader *r, BLOCK_SIZE bsize);
+int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, vp9_reader *r,
+ uint8_t *token_cache);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index 243dbef21..5f970a3d5 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c
@@ -142,18 +142,12 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
pbi->decoded_key_frame = 0;
vp9_worker_init(&pbi->lf_worker);
- pbi->lf_worker.data1 = vpx_malloc(sizeof(LFWorkerData));
- pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker;
- if (pbi->lf_worker.data1 == NULL ||
- (pbi->oxcf.max_threads > 1 && !vp9_worker_reset(&pbi->lf_worker))) {
- vp9_remove_decompressor(pbi);
- return NULL;
- }
return pbi;
}
void vp9_remove_decompressor(VP9D_PTR ptr) {
+ int i;
VP9D_COMP *const pbi = (VP9D_COMP *)ptr;
if (!pbi)
@@ -162,7 +156,16 @@ void vp9_remove_decompressor(VP9D_PTR ptr) {
vp9_remove_common(&pbi->common);
vp9_worker_end(&pbi->lf_worker);
vpx_free(pbi->lf_worker.data1);
+ for (i = 0; i < pbi->num_tile_workers; ++i) {
+ VP9Worker *const worker = &pbi->tile_workers[i];
+ vp9_worker_end(worker);
+ vpx_free(worker->data1);
+ vpx_free(worker->data2);
+ }
+ vpx_free(pbi->tile_workers);
vpx_free(pbi->mi_streams);
+ vpx_free(pbi->above_context[0]);
+ vpx_free(pbi->above_seg_context);
vpx_free(pbi);
}
@@ -176,7 +179,6 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr,
YV12_BUFFER_CONFIG *sd) {
VP9D_COMP *pbi = (VP9D_COMP *) ptr;
VP9_COMMON *cm = &pbi->common;
- int ref_fb_idx;
/* TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
* encoder is using the frame buffers for. This is just a stub to keep the
@@ -184,18 +186,15 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9D_PTR ptr,
* later commit that adds VP9-specific controls for this functionality.
*/
if (ref_frame_flag == VP9_LAST_FLAG) {
- ref_fb_idx = cm->ref_frame_map[0];
+ YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[0]];
+ if (!equal_dimensions(cfg, sd))
+ vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+ "Incorrect buffer dimensions");
+ else
+ vp8_yv12_copy_frame(cfg, sd);
} else {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Invalid reference frame");
- return cm->error.error_code;
- }
-
- if (!equal_dimensions(&cm->yv12_fb[ref_fb_idx], sd)) {
- vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
- "Incorrect buffer dimensions");
- } else {
- vp8_yv12_copy_frame(&cm->yv12_fb[ref_fb_idx], sd);
}
return cm->error.error_code;
@@ -267,7 +266,7 @@ static void swap_frame_buffers(VP9D_COMP *pbi) {
++ref_index;
}
- cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+ cm->frame_to_show = get_frame_new_buffer(cm);
cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
// Invalidate these references until the next frame starts.
@@ -305,7 +304,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
* thing to do here.
*/
if (cm->active_ref_idx[0] != INT_MAX)
- cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;
+ get_frame_ref_buffer(cm, 0)->corrupted = 1;
}
cm->new_fb_idx = get_free_fb(cm);
@@ -322,7 +321,7 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
* thing to do here.
*/
if (cm->active_ref_idx[0] != INT_MAX)
- cm->yv12_fb[cm->active_ref_idx[0]].corrupted = 1;
+ get_frame_ref_buffer(cm, 0)->corrupted = 1;
if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0)
cm->fb_idx_ref_cnt[cm->new_fb_idx]--;
diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index 68b30347e..7c4c9db36 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h
@@ -40,9 +40,17 @@ typedef struct VP9Decompressor {
int do_loopfilter_inline; // apply loopfilter to available rows immediately
VP9Worker lf_worker;
+ VP9Worker *tile_workers;
+ int num_tile_workers;
+
/* Each tile column has its own MODE_INFO stream. This array indexes them by
tile column index. */
MODE_INFO **mi_streams;
+
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ PARTITION_CONTEXT *above_seg_context;
+
+ DECLARE_ALIGNED(16, unsigned char, token_cache[1024]);
} VP9D_COMP;
#endif // VP9_DECODER_VP9_ONYXD_INT_H_
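
The decoder now owns its above-row contexts instead of borrowing VP9_COMMON's. Only above_context[0] is freed in vp9_remove_decompressor(), and the memsets in decode_tiles()/decode_tiles_mt() clear all planes through that one pointer, which implies a single contiguous allocation. A sketch of the likely layout; the real allocation is done by alloc_tile_storage(), whose body is outside this diff:

static void alloc_above_context_sketch(VP9D_COMP *pbi, int aligned_mi_cols) {
  int i;
  /* one block: 2 rows of ENTROPY_CONTEXT per plane, per aligned mi column */
  ENTROPY_CONTEXT *const base =
      vpx_calloc(2 * MAX_MB_PLANE * aligned_mi_cols, sizeof(*base));
  for (i = 0; i < MAX_MB_PLANE; ++i)
    pbi->above_context[i] = base + i * 2 * aligned_mi_cols;
  pbi->above_seg_context =
      vpx_calloc(aligned_mi_cols, sizeof(*pbi->above_seg_context));
}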
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 8378a78e1..87bd36c2b 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -53,8 +53,7 @@ extern unsigned int active_section;
int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES];
int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1];
int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-int64_t switchable_interp_stats[SWITCHABLE_FILTERS+1]
- [SWITCHABLE_FILTERS];
+int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
void init_tx_count_stats() {
vp9_zero(tx_count_32x32p_stats);
@@ -87,10 +86,9 @@ static void update_tx_count_stats(VP9_COMMON *cm) {
static void update_switchable_interp_stats(VP9_COMMON *cm) {
int i, j;
- for (i = 0; i < SWITCHABLE_FILTERS+1; ++i)
- for (j = 0; j < SWITCHABLE_FILTERS; ++j) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ for (j = 0; j < SWITCHABLE_FILTERS; ++j)
switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
- }
}
void write_tx_count_stats() {
@@ -140,9 +138,9 @@ void write_switchable_interp_stats() {
fclose(fp);
printf(
- "vp9_default_switchable_filter_count[SWITCHABLE_FILTERS+1]"
+ "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]"
"[SWITCHABLE_FILTERS] = {\n");
- for (i = 0; i < SWITCHABLE_FILTERS+1; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
printf(" { ");
for (j = 0; j < SWITCHABLE_FILTERS; j++) {
printf("%"PRId64", ", switchable_interp_stats[i][j]);
@@ -165,18 +163,13 @@ void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
}
-static void update_mode(
- vp9_writer *w,
- int n,
- vp9_tree tree,
- vp9_prob Pnew[/* n-1 */],
- vp9_prob Pcur[/* n-1 */],
- unsigned int bct[/* n-1 */] [2],
- const unsigned int num_events[/* n */]
-) {
+static void update_mode(vp9_writer *w, int n, vp9_tree tree,
+ vp9_prob Pcur[/* n-1 */],
+ unsigned int bct[/* n-1 */][2],
+ const unsigned int num_events[/* n */]) {
int i = 0;
- vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
+ vp9_tree_probs_from_distribution(tree, bct, num_events, 0);
n--;
for (i = 0; i < n; ++i)
@@ -187,11 +180,10 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi,
vp9_writer* const bc) {
VP9_COMMON *const cm = &cpi->common;
int j;
- vp9_prob pnew[INTRA_MODES - 1];
unsigned int bct[INTRA_MODES - 1][2];
for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
- update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew,
+ update_mode(bc, INTRA_MODES, vp9_intra_mode_tree,
cm->fc.y_mode_prob[j], bct,
(unsigned int *)cpi->y_mode_count[j]);
}
@@ -233,44 +225,35 @@ static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
}
-static void update_switchable_interp_probs(VP9_COMP *const cpi,
- vp9_writer* const bc) {
+static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
- unsigned int branch_ct[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS - 1][2];
- vp9_prob new_prob[SWITCHABLE_FILTERS + 1][SWITCHABLE_FILTERS - 1];
+ unsigned int branch_ct[SWITCHABLE_FILTERS - 1][2];
int i, j;
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
- vp9_tree_probs_from_distribution(
- vp9_switchable_interp_tree,
- new_prob[j], branch_ct[j],
- cm->counts.switchable_interp[j], 0);
- }
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j) {
- for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
- vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
- branch_ct[j][i]);
- }
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
+ vp9_tree_probs_from_distribution(vp9_switchable_interp_tree, branch_ct,
+ cm->counts.switchable_interp[j], 0);
+
+ for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
+ vp9_cond_prob_diff_update(w, &cm->fc.switchable_interp_prob[j][i],
+ branch_ct[i]);
}
+
#ifdef MODE_STATS
if (!cpi->dummy_packing)
update_switchable_interp_stats(cm);
#endif
}
-static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
+static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer *w) {
int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
unsigned int branch_ct[INTER_MODES - 1][2];
- vp9_prob new_prob[INTER_MODES - 1];
-
- vp9_tree_probs_from_distribution(vp9_inter_mode_tree,
- new_prob, branch_ct,
+ vp9_tree_probs_from_distribution(vp9_inter_mode_tree, branch_ct,
cm->counts.inter_mode[i], NEARESTMV);
for (j = 0; j < INTER_MODES - 1; ++j)
- vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
+ vp9_cond_prob_diff_update(w, &cm->fc.inter_mode_probs[i][j],
branch_ct[j]);
}
}
@@ -561,7 +544,8 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
write_intra_mode(bc, m->mbmi.uv_mode, vp9_kf_uv_mode_prob[ym]);
}
-static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
+static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
int mi_row, int mi_col, int index) {
VP9_COMMON *const cm = &cpi->common;
@@ -574,9 +558,10 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
xd->mi_8x8 = mi_8x8;
- set_mi_row_col(&cpi->common, xd,
+ set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
- mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]);
+ mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
+ cm->mi_rows, cm->mi_cols);
if (frame_is_intra_only(cm)) {
write_mb_modes_kf(cpi, mi_8x8, bc);
#ifdef ENTROPY_STATS
@@ -593,7 +578,31 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
pack_mb_tokens(bc, tok, tok_end);
}
-static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
+static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
+ PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int ctx = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+ if (has_rows && has_cols) {
+ write_token(w, vp9_partition_tree, probs, &vp9_partition_encodings[p]);
+ } else if (!has_rows && has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ vp9_write(w, p == PARTITION_SPLIT, probs[1]);
+ } else if (has_rows && !has_cols) {
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ vp9_write(w, p == PARTITION_SPLIT, probs[2]);
+ } else {
+ assert(p == PARTITION_SPLIT);
+ }
+}
+
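
write_partition() above codes a full partition symbol only when the block fits inside the frame; at the bottom or right edge the legal set shrinks, so a single bit (or nothing) is written against probs[1]/probs[2]. An illustrative read-side mirror, assuming the usual tree-reader helper; the real counterpart is in vp9_decodframe.c and may differ in detail:

static PARTITION_TYPE read_partition_sketch(vp9_reader *r,
                                            const vp9_prob *probs,
                                            int has_rows, int has_cols) {
  if (has_rows && has_cols)
    return treed_read(r, vp9_partition_tree, probs);  // full partition tree
  else if (!has_rows && has_cols)
    return vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
  else if (has_rows && !has_cols)
    return vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
  else
    return PARTITION_SPLIT;  // both dimensions exhausted: nothing is coded
}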
+static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, vp9_writer *bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end,
int mi_row, int mi_col, BLOCK_SIZE bsize,
int index) {
@@ -615,42 +624,32 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
if (index > 0)
return;
} else {
- int pl;
- const int idx = check_bsize_coverage(bs, cm->mi_rows, cm->mi_cols,
- mi_row, mi_col);
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
- // encode the partition information
- if (idx == 0)
- write_token(bc, vp9_partition_tree,
- cm->fc.partition_prob[cm->frame_type][pl],
- vp9_partition_encodings + partition);
- else if (idx > 0)
- vp9_write(bc, partition == PARTITION_SPLIT,
- cm->fc.partition_prob[cm->frame_type][pl][idx]);
+ write_partition(cpi, bs, mi_row, mi_col, partition, bsize, bc);
}
subsize = get_subsize(bsize, partition);
switch (partition) {
case PARTITION_NONE:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
+ write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
break;
case PARTITION_HORZ:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
+ write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
if ((mi_row + bs) < cm->mi_rows)
- write_modes_b(cpi, mi_8x8 + bs * mis, bc, tok, tok_end, mi_row + bs,
- mi_col, 1);
+ write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end,
+ mi_row + bs, mi_col, 1);
break;
case PARTITION_VERT:
- write_modes_b(cpi, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
+ write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
if ((mi_col + bs) < cm->mi_cols)
- write_modes_b(cpi, mi_8x8 + bs, bc, tok, tok_end, mi_row, mi_col + bs,
- 1);
+ write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end,
+ mi_row, mi_col + bs, 1);
break;
case PARTITION_SPLIT:
for (n = 0; n < 4; n++) {
const int j = n >> 1, i = n & 1;
- write_modes_sb(cpi, mi_8x8 + j * bs * mis + i * bs, bc, tok, tok_end,
+ write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc,
+ tok, tok_end,
mi_row + j * bs, mi_col + i * bs, subsize, n);
}
break;
@@ -661,10 +660,12 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc,
// update partition context
if (bsize >= BLOCK_8X8 &&
(bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
- update_partition_context(cm, mi_row, mi_col, subsize, bsize);
+ update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, subsize, bsize);
}
-static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
+static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
+ vp9_writer* const bc,
TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
@@ -672,15 +673,15 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc,
MODE_INFO **mi_8x8 = cm->mi_grid_visible;
MODE_INFO **m_8x8;
- mi_8x8 += cm->cur_tile_mi_col_start + cm->cur_tile_mi_row_start * mis;
+ mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis;
- for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end;
+ for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
mi_row += 8, mi_8x8 += 8 * mis) {
m_8x8 = mi_8x8;
- vp9_zero(cm->left_seg_context);
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ vp9_zero(cpi->left_seg_context);
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
- write_modes_sb(cpi, m_8x8, bc, tok, tok_end, mi_row, mi_col,
+ write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col,
BLOCK_64X64, 0);
}
}
@@ -692,8 +693,7 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
cpi->common.counts.eob_branch[tx_size];
vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size];
- vp9_prob full_probs[ENTROPY_NODES];
- int i, j, k, l;
+ int i, j, k, l, m;
for (i = 0; i < BLOCK_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
@@ -702,16 +702,14 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
if (l >= 3 && k == 0)
continue;
vp9_tree_probs_from_distribution(vp9_coef_tree,
- full_probs,
coef_branch_ct[i][j][k][l],
coef_counts[i][j][k][l], 0);
- vpx_memcpy(coef_probs[i][j][k][l], full_probs,
- sizeof(vp9_prob) * UNCONSTRAINED_NODES);
coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
coef_branch_ct[i][j][k][l][0][0];
- coef_probs[i][j][k][l][0] =
- get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
- coef_branch_ct[i][j][k][l][0][1]);
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ coef_probs[i][j][k][l][m] = get_binary_prob(
+ coef_branch_ct[i][j][k][l][m][0],
+ coef_branch_ct[i][j][k][l][m][1]);
#ifdef ENTROPY_STATS
if (!cpi->dummy_packing) {
int t;
@@ -1103,7 +1101,7 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) {
}
}
-static void write_interp_filter_type(INTERPOLATIONFILTERTYPE type,
+static void write_interp_filter_type(INTERPOLATION_TYPE type,
struct vp9_write_bit_buffer *wb) {
const int type_to_literal[] = { 1, 0, 2, 3 };
@@ -1121,7 +1119,7 @@ static void fix_mcomp_filter_type(VP9_COMP *cpi) {
int i, j, c = 0;
for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
count[i] = 0;
- for (j = 0; j <= SWITCHABLE_FILTERS; ++j)
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
count[i] += cm->counts.switchable_interp[j][i];
c += (count[i] > 0);
}
@@ -1201,7 +1199,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
- vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
+ vpx_memset(cpi->above_seg_context, 0, sizeof(*cpi->above_seg_context) *
mi_cols_aligned_to_sb(cm->mi_cols));
tok[0][0] = cpi->tok;
@@ -1216,9 +1214,10 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
}
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- vp9_get_tile_col_offsets(cm, tile_col);
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, 0, tile_col);
tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
@@ -1226,7 +1225,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
else
vp9_start_encode(&residual_bc, data_ptr + total_size);
- write_modes(cpi, &residual_bc, &tok[tile_row][tile_col], tok_end);
+ write_modes(cpi, &tile, &residual_bc, &tok[tile_row][tile_col], tok_end);
assert(tok[tile_row][tile_col] == tok_end);
vp9_stop_encode(&residual_bc);
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
@@ -1295,17 +1294,16 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
}
static void write_sync_code(struct vp9_write_bit_buffer *wb) {
- vp9_wb_write_literal(wb, SYNC_CODE_0, 8);
- vp9_wb_write_literal(wb, SYNC_CODE_1, 8);
- vp9_wb_write_literal(wb, SYNC_CODE_2, 8);
+ vp9_wb_write_literal(wb, VP9_SYNC_CODE_0, 8);
+ vp9_wb_write_literal(wb, VP9_SYNC_CODE_1, 8);
+ vp9_wb_write_literal(wb, VP9_SYNC_CODE_2, 8);
}
static void write_uncompressed_header(VP9_COMP *cpi,
struct vp9_write_bit_buffer *wb) {
VP9_COMMON *const cm = &cpi->common;
- // frame marker bits
- vp9_wb_write_literal(wb, 0x2, 2);
+ vp9_wb_write_literal(wb, VP9_FRAME_MARKER, 2);
// bitstream version.
// 00 - profile 0. 4:2:0 only
@@ -1319,18 +1317,10 @@ static void write_uncompressed_header(VP9_COMP *cpi,
vp9_wb_write_bit(wb, cm->error_resilient_mode);
if (cm->frame_type == KEY_FRAME) {
+ const COLOR_SPACE cs = UNKNOWN;
write_sync_code(wb);
- // colorspaces
- // 000 - Unknown
- // 001 - BT.601
- // 010 - BT.709
- // 011 - SMPTE-170
- // 100 - SMPTE-240
- // 101 - Reserved
- // 110 - Reserved
- // 111 - sRGB (RGB)
- vp9_wb_write_literal(wb, 0, 3);
- if (1 /* colorspace != sRGB */) {
+ vp9_wb_write_literal(wb, cs, 3);
+ if (cs != SRGB) {
vp9_wb_write_bit(wb, 0); // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
if (cm->version == 1) {
vp9_wb_write_bit(wb, cm->subsampling_x);
@@ -1457,11 +1447,9 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
update_mbintra_mode_probs(cpi, &header_bc);
for (i = 0; i < PARTITION_CONTEXTS; ++i) {
- vp9_prob pnew[PARTITION_TYPES - 1];
unsigned int bct[PARTITION_TYPES - 1][2];
- update_mode(&header_bc, PARTITION_TYPES,
- vp9_partition_tree, pnew,
- fc->partition_prob[cm->frame_type][i], bct,
+ update_mode(&header_bc, PARTITION_TYPES, vp9_partition_tree,
+ fc->partition_prob[i], bct,
(unsigned int *)cpi->partition_count[i]);
}
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 12dad0311..8033a4d15 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -42,7 +42,7 @@ typedef struct {
int comp_pred_diff;
int single_pred_diff;
int64_t tx_rd_diff[TX_MODES];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
// motion vector cache for adaptive motion search control in partition
// search loop
@@ -118,8 +118,7 @@ struct macroblock {
unsigned inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
int intra_uv_mode_cost[2][MB_MODE_COUNT];
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
- int switchable_interp_costs[SWITCHABLE_FILTERS + 1]
- [SWITCHABLE_FILTERS];
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
@@ -137,7 +136,7 @@ struct macroblock {
// note that token_costs is the cost when eob node is skipped
vp9_coeff_cost token_costs[TX_SIZES];
- uint8_t token_cache[1024];
+ DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
int optimize;
@@ -173,7 +172,7 @@ struct macroblock {
BLOCK_SIZE sb_partitioning[4];
BLOCK_SIZE sb64_partitioning;
- void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
+ void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
};
// TODO(jingning): the variables used here are little complicated. need further
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 550cdee60..065992a25 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -8,14 +8,17 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include <assert.h>
#include <math.h>
+
#include "./vpx_config.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "./vp9_rtcd.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
+#include "vp9/common/vp9_systemdependent.h"
+
+#include "vp9/encoder/vp9_dct.h"
static void fdct4(const int16_t *input, int16_t *output) {
int16_t step[4];
@@ -36,7 +39,7 @@ static void fdct4(const int16_t *input, int16_t *output) {
output[3] = dct_const_round_shift(temp2);
}
-void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
@@ -46,7 +49,7 @@ void vp9_short_fdct4x4_c(int16_t *input, int16_t *output, int stride) {
int pass;
// We need an intermediate buffer between passes.
int16_t intermediate[4 * 4];
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
@@ -148,8 +151,8 @@ static const transform_2d FHT_4[] = {
{ fadst4, fadst4 } // ADST_ADST = 3
};
-void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
- int pitch, TX_TYPE tx_type) {
+void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
int16_t out[4 * 4];
int16_t *outptr = &out[0];
int i, j;
@@ -159,7 +162,7 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output,
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
- temp_in[j] = input[j * pitch + i] * 16;
+ temp_in[j] = input[j * stride + i] * 16;
if (i == 0 && temp_in[0])
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
@@ -229,7 +232,7 @@ static void fdct8(const int16_t *input, int16_t *output) {
output[7] = dct_const_round_shift(t3);
}
-void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) {
+void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
int i, j;
int16_t intermediate[64];
@@ -300,7 +303,7 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int stride) {
}
}
-void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
@@ -310,7 +313,7 @@ void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int stride) {
int pass;
// We need an intermediate buffer between passes.
int16_t intermediate[256];
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
@@ -556,8 +559,8 @@ static const transform_2d FHT_8[] = {
{ fadst8, fadst8 } // ADST_ADST = 3
};
-void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
- int pitch, TX_TYPE tx_type) {
+void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
int16_t out[64];
int16_t *outptr = &out[0];
int i, j;
@@ -567,7 +570,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
// Columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
- temp_in[j] = input[j * pitch + i] * 4;
+ temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
outptr[j * 8 + i] = temp_out[j];
@@ -585,10 +588,10 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output,
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
pixel. */
-void vp9_short_walsh4x4_c(int16_t *input, int16_t *output, int stride) {
+void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
int i;
int a1, b1, c1, d1, e1;
- int16_t *ip = input;
+ const int16_t *ip = input;
int16_t *op = output;
for (i = 0; i < 4; i++) {
@@ -949,8 +952,8 @@ static const transform_2d FHT_16[] = {
{ fadst16, fadst16 } // ADST_ADST = 3
};
-void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
- int pitch, TX_TYPE tx_type) {
+void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
int16_t out[256];
int16_t *outptr = &out[0];
int i, j;
@@ -960,7 +963,7 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
- temp_in[j] = input[j * pitch + i] * 4;
+ temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
@@ -1311,7 +1314,7 @@ static void dct32_1d(const int *input, int *output, int round) {
output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}
-void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];
@@ -1339,7 +1342,7 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) {
// Note that although we use dct_32_round in dct32_1d computation flow,
// this 2d fdct32x32 for rate-distortion optimization loop is operating
// within 16 bits precision.
-void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
int i, j;
int output[32 * 32];
@@ -1366,3 +1369,27 @@ void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) {
out[j + i * 32] = temp_out[j];
}
}
+
+void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride) {
+ if (tx_type == DCT_DCT)
+ vp9_fdct4x4(input, output, stride);
+ else
+ vp9_short_fht4x4(input, output, stride, tx_type);
+}
+
+void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride) {
+ if (tx_type == DCT_DCT)
+ vp9_fdct8x8(input, output, stride);
+ else
+ vp9_short_fht8x8(input, output, stride, tx_type);
+}
+
+void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride) {
+ if (tx_type == DCT_DCT)
+ vp9_fdct16x16(input, output, stride);
+ else
+ vp9_short_fht16x16(input, output, stride, tx_type);
+}
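
The three vp9_fht* wrappers added above pick the transform from the block's tx_type: plain DCT for DCT_DCT, the ADST/DCT hybrids otherwise. A usage sketch (the caller name is hypothetical; presumably the forward-transform paths in vp9_encodemb.c call these):

static void forward_transform_4x4_sketch(TX_TYPE tx_type,
                                         const int16_t *src_diff,
                                         int16_t *coeff, int stride) {
  vp9_fht4x4(tx_type, src_diff, coeff, stride);  // DCT or hybrid per tx_type
}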
diff --git a/vp9/encoder/vp9_dct.h b/vp9/encoder/vp9_dct.h
new file mode 100644
index 000000000..aaf976d93
--- /dev/null
+++ b/vp9/encoder/vp9_dct.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP9_ENCODER_VP9_DCT_H_
+#define VP9_ENCODER_VP9_DCT_H_
+
+void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride);
+
+void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride);
+
+void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
+ int stride);
+
+#endif // VP9_ENCODER_VP9_DCT_H_
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 98284a690..a45299b59 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -282,7 +282,7 @@ static void build_activity_map(VP9_COMP *cpi) {
VP9_COMMON * const cm = &cpi->common;
#if ALT_ACT_MEASURE
- YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+ YV12_BUFFER_CONFIG *new_yv12 = get_frame_new_buffer(cm);
int recon_yoffset;
int recon_y_stride = new_yv12->y_stride;
#endif
@@ -465,7 +465,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
}
}
@@ -484,8 +484,8 @@ void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
x->e_mbd.plane[i].subsampling_y);
}
-static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
- BLOCK_SIZE bsize) {
+static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
MACROBLOCK *const x = &cpi->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -499,7 +499,7 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
const int idx_map = mb_row * cm->mb_cols + mb_col;
const struct segmentation *const seg = &cm->seg;
- set_skip_context(cm, xd, mi_row, mi_col);
+ set_skip_context(xd, cpi->above_context, cpi->left_context, mi_row, mi_col);
// Activity map pointer
x->mb_activity_ptr = &cpi->mb_activity_map[idx_map];
@@ -528,7 +528,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
// Set up distance of MB to edge of frame in 1/8th pel units
assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
- set_mi_row_col(cm, xd, mi_row, mi_height, mi_col, mi_width);
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+ cm->mi_rows, cm->mi_cols);
/* set up source buffers */
vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col);
@@ -555,9 +556,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
const int x = mb_col & ~3;
const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
- const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
- const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start)
- >> 1;
+ const int tile_progress = tile->mi_col_start * cm->mb_rows >> 1;
+ const int mb_cols = (tile->mi_col_end - tile->mi_col_start) >> 1;
cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress)
<< 16) / cm->MBs;
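
set_offsets() and the other encoder helpers in this file now receive a TileInfo instead of reading cur_tile_mi_* fields off VP9_COMMON. A hedged reconstruction of the struct from its uses throughout the patch (the real definition lives in vp9/common/vp9_tile_common.h):

typedef struct TileInfo {
  int mi_row_start, mi_row_end;  // tile bounds in mi (8x8) rows
  int mi_col_start, mi_col_end;  // tile bounds in mi (8x8) columns
} TileInfo;

vp9_tile_init(&tile, cm, tile_row, tile_col) fills these bounds, replacing the old vp9_get_tile_row_offsets()/vp9_get_tile_col_offsets() pair on both the encoder and decoder sides.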
@@ -570,7 +570,8 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
}
}
-static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
+static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col,
int *totalrate, int64_t *totaldist,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
int64_t best_rd) {
@@ -596,7 +597,7 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
}
}
- set_offsets(cpi, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
xd->mi_8x8[0]->mbmi.sb_type = bsize;
// Set to zero to make sure we do not use the previous encoded frame stats
@@ -632,10 +633,10 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
best_rd);
} else {
if (bsize >= BLOCK_8X8)
- vp9_rd_pick_inter_mode_sb(cpi, x, mi_row, mi_col, totalrate, totaldist,
- bsize, ctx, best_rd);
+ vp9_rd_pick_inter_mode_sb(cpi, x, tile, mi_row, mi_col,
+ totalrate, totaldist, bsize, ctx, best_rd);
else
- vp9_rd_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, totalrate,
+ vp9_rd_pick_inter_mode_sub8x8(cpi, x, tile, mi_row, mi_col, totalrate,
totaldist, bsize, ctx, best_rd);
}
@@ -682,10 +683,6 @@ static void update_stats(VP9_COMP *cpi) {
[mbmi->ref_frame[0] != GOLDEN_FRAME]++;
}
}
-
- // Count of last ref frame 0,0 usage
- if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME)
- cpi->inter_zz_count++;
}
}
@@ -711,7 +708,6 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
int p;
@@ -721,28 +717,27 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
int mi_height = num_8x8_blocks_high_lookup[bsize];
for (p = 0; p < MAX_MB_PLANE; p++) {
vpx_memcpy(
- cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+ cpi->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
a + num_4x4_blocks_wide * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
vpx_memcpy(
- cm->left_context[p]
+ cpi->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
l + num_4x4_blocks_high * p,
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(cm->above_seg_context + mi_col, sa,
- sizeof(PARTITION_CONTEXT) * mi_width);
- vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
- sizeof(PARTITION_CONTEXT) * mi_height);
+ vpx_memcpy(cpi->above_seg_context + mi_col, sa,
+ sizeof(*cpi->above_seg_context) * mi_width);
+ vpx_memcpy(cpi->left_seg_context + (mi_row & MI_MASK), sl,
+ sizeof(cpi->left_seg_context[0]) * mi_height);
}
static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE bsize) {
- const VP9_COMMON *const cm = &cpi->common;
const MACROBLOCK *const x = &cpi->mb;
const MACROBLOCKD *const xd = &x->e_mbd;
int p;
@@ -755,23 +750,24 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
for (p = 0; p < MAX_MB_PLANE; ++p) {
vpx_memcpy(
a + num_4x4_blocks_wide * p,
- cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+ cpi->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
xd->plane[p].subsampling_x);
vpx_memcpy(
l + num_4x4_blocks_high * p,
- cm->left_context[p]
+ cpi->left_context[p]
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
(sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
xd->plane[p].subsampling_y);
}
- vpx_memcpy(sa, cm->above_seg_context + mi_col,
- sizeof(PARTITION_CONTEXT) * mi_width);
- vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
- sizeof(PARTITION_CONTEXT) * mi_height);
+ vpx_memcpy(sa, cpi->above_seg_context + mi_col,
+ sizeof(*cpi->above_seg_context) * mi_width);
+ vpx_memcpy(sl, cpi->left_seg_context + (mi_row & MI_MASK),
+ sizeof(cpi->left_seg_context[0]) * mi_height);
}
-static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize, int sub_index) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
@@ -789,7 +785,7 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
if (xd->ab_index > 0)
return;
}
- set_offsets(cpi, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
update_state(cpi, get_block_context(x, bsize), bsize, output_enabled);
encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize);
@@ -801,7 +797,8 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
}
}
-static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
@@ -818,7 +815,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
c1 = BLOCK_4X4;
if (bsize >= BLOCK_8X8) {
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, bsize);
c1 = *(get_sb_partitioning(x, bsize));
}
partition = partition_lookup[bsl][c1];
@@ -827,19 +825,19 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
case PARTITION_NONE:
if (output_enabled && bsize >= BLOCK_8X8)
cpi->partition_count[pl][PARTITION_NONE]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
+ encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, -1);
break;
case PARTITION_VERT:
if (output_enabled)
cpi->partition_count[pl][PARTITION_VERT]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
- encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
+ encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tile, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
break;
case PARTITION_HORZ:
if (output_enabled)
cpi->partition_count[pl][PARTITION_HORZ]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
- encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
+ encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tile, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
break;
case PARTITION_SPLIT:
subsize = get_subsize(bsize, PARTITION_SPLIT);
@@ -851,7 +849,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
const int x_idx = i & 1, y_idx = i >> 1;
*get_sb_index(xd, subsize) = i;
- encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
+ encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
output_enabled, subsize);
}
break;
@@ -861,7 +859,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
}
if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
- update_partition_context(cm, mi_row, mi_col, c1, bsize);
+ update_partition_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, c1, bsize);
}
// Check to see if the given partition size is allowed for a specified number
@@ -889,13 +888,13 @@ static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
// However, at the bottom and right borders of the image the requested size
// may not be allowed in which case this code attempts to choose the largest
// allowable partition.
-static void set_partitioning(VP9_COMP *cpi, MODE_INFO **mi_8x8,
- int mi_row, int mi_col) {
+static void set_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8, int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
BLOCK_SIZE bsize = cpi->sf.always_this_block_size;
const int mis = cm->mode_info_stride;
- int row8x8_remaining = cm->cur_tile_mi_row_end - mi_row;
- int col8x8_remaining = cm->cur_tile_mi_col_end - mi_col;
+ int row8x8_remaining = tile->mi_row_end - mi_row;
+ int col8x8_remaining = tile->mi_col_end - mi_col;
int block_row, block_col;
MODE_INFO * mi_upper_left = cm->mi + mi_row * mis + mi_col;
int bh = num_8x8_blocks_high_lookup[bsize];
@@ -970,7 +969,9 @@ static int sb_has_motion(VP9_COMP *cpi, MODE_INFO **prev_mi_8x8) {
return 0;
}
-static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+static void rd_use_partition(VP9_COMP *cpi,
+ const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
TOKENEXTRA **tp, int mi_row, int mi_col,
BLOCK_SIZE bsize, int *rate, int64_t *dist,
int do_recon) {
@@ -1022,7 +1023,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
if (bsize == BLOCK_16X16) {
- set_offsets(cpi, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
x->mb_energy = vp9_block_energy(cpi, x, bsize);
}
@@ -1049,10 +1050,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
mi_row + (ms >> 1) < cm->mi_rows &&
mi_col + (ms >> 1) < cm->mi_cols) {
*(get_sb_partitioning(x, bsize)) = bsize;
- pick_sb_modes(cpi, mi_row, mi_col, &none_rate, &none_dist, bsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize,
get_block_context(x, bsize), INT64_MAX);
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
none_rate += x->partition_cost[pl][PARTITION_NONE];
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -1063,12 +1066,12 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
switch (partition) {
case PARTITION_NONE:
- pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
bsize, get_block_context(x, bsize), INT64_MAX);
break;
case PARTITION_HORZ:
*get_sb_index(xd, subsize) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) {
@@ -1077,7 +1080,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*get_sb_index(xd, subsize) = 1;
- pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
+ pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
last_part_rate = INT_MAX;
@@ -1091,7 +1094,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
break;
case PARTITION_VERT:
*get_sb_index(xd, subsize) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, &last_part_rate, &last_part_dist,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
@@ -1100,7 +1103,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*get_sb_index(xd, subsize) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
last_part_rate = INT_MAX;
@@ -1127,7 +1130,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
*get_sb_index(xd, subsize) = i;
- rd_use_partition(cpi, mi_8x8 + jj * bss * mis + ii * bss, tp,
+ rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
i != 3);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1143,7 +1146,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
assert(0);
}
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, bsize);
if (last_part_rate < INT_MAX)
last_part_rate += x->partition_cost[pl][partition];
@@ -1175,7 +1179,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
+ pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt,
split_subsize, get_block_context(x, split_subsize),
INT64_MAX);
@@ -1188,15 +1192,18 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
if (i != 3)
- encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
+ encode_sb(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, 0,
split_subsize);
split_rate += rt;
split_dist += dt;
- pl = partition_plane_context(cm, mi_row + y_idx, mi_col + x_idx, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row + y_idx, mi_col + x_idx, bsize);
split_rate += x->partition_cost[pl][PARTITION_NONE];
}
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, bsize);
if (split_rate < INT_MAX) {
split_rate += x->partition_cost[pl][PARTITION_SPLIT];
@@ -1231,7 +1238,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8,
assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
if (do_recon)
- encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
*rate = chosen_rate;
*dist = chosen_dist;
@@ -1279,7 +1286,8 @@ static void get_sb_partition_size_range(VP9_COMP *cpi, MODE_INFO ** mi_8x8,
// Look at neighboring blocks and set a min and max partition size based on
// what they chose.
-static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col,
+static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
+ int row, int col,
BLOCK_SIZE *min_block_size,
BLOCK_SIZE *max_block_size) {
VP9_COMMON * const cm = &cpi->common;
@@ -1293,8 +1301,8 @@ static void rd_auto_partition_range(VP9_COMP *cpi, int row, int col,
MODE_INFO ** above_sb64_mi_8x8;
MODE_INFO ** left_sb64_mi_8x8;
- int row8x8_remaining = cm->cur_tile_mi_row_end - row;
- int col8x8_remaining = cm->cur_tile_mi_col_end - col;
+ int row8x8_remaining = tile->mi_row_end - row;
+ int col8x8_remaining = tile->mi_col_end - col;
int bh, bw;
// Trap case where we do not have a prediction.
@@ -1444,7 +1452,8 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
+static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
+ TOKENEXTRA **tp, int mi_row,
int mi_col, BLOCK_SIZE bsize, int *rate,
int64_t *dist, int do_recon, int64_t best_rd) {
VP9_COMMON * const cm = &cpi->common;
@@ -1481,10 +1490,11 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
return;
}
}
- assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+ assert(num_8x8_blocks_wide_lookup[bsize] ==
+ num_8x8_blocks_high_lookup[bsize]);
if (bsize == BLOCK_16X16) {
- set_offsets(cpi, mi_row, mi_col, bsize);
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
x->mb_energy = vp9_block_energy(cpi, x, bsize);
}
@@ -1521,11 +1531,13 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
// PARTITION_NONE
if (partition_none_allowed) {
- pick_sb_modes(cpi, mi_row, mi_col, &this_rate, &this_dist, bsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize,
get_block_context(x, bsize), best_rd);
if (this_rate != INT_MAX) {
if (bsize >= BLOCK_8X8) {
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
this_rate += x->partition_cost[pl][PARTITION_NONE];
}
sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
@@ -1573,7 +1585,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = i;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize,
&this_rate, &this_dist, i != 3, best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1585,7 +1597,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
}
if (sum_rd < best_rd && i == 4) {
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
sum_rate += x->partition_cost[pl][PARTITION_SPLIT];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
@@ -1618,7 +1632,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
@@ -1629,7 +1643,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row + ms, mi_col, &this_rate,
+ pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1641,7 +1655,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
}
if (sum_rd < best_rd) {
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
sum_rate += x->partition_cost[pl][PARTITION_HORZ];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
@@ -1661,7 +1677,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
@@ -1671,7 +1687,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*get_sb_index(xd, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
- pick_sb_modes(cpi, mi_row, mi_col + ms, &this_rate,
+ pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
if (this_rate == INT_MAX) {
@@ -1683,7 +1699,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
}
if (sum_rd < best_rd) {
- pl = partition_plane_context(cm, mi_row, mi_col, bsize);
+ pl = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
sum_rate += x->partition_cost[pl][PARTITION_VERT];
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
if (sum_rd < best_rd) {
@@ -1701,7 +1719,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
*dist = best_dist;
if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon)
- encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
if (bsize == BLOCK_64X64) {
assert(tp_orig < *tp);
assert(best_rate < INT_MAX);
@@ -1712,7 +1730,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
}
// Examines 64x64 block and chooses a best reference frame
-static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
+static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, int mi_col) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
int bsl = b_width_log2(BLOCK_64X64), bs = 1 << bsl;
@@ -1732,9 +1751,10 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
if ((mi_row + (ms >> 1) < cm->mi_rows) &&
(mi_col + (ms >> 1) < cm->mi_cols)) {
cpi->set_ref_frame_mask = 1;
- pick_sb_modes(cpi, mi_row, mi_col, &r, &d, BLOCK_64X64,
+ pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64,
get_block_context(x, BLOCK_64X64), INT64_MAX);
- pl = partition_plane_context(cm, mi_row, mi_col, BLOCK_64X64);
+ pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
+ mi_row, mi_col, BLOCK_64X64);
r += x->partition_cost[pl][PARTITION_NONE];
*(get_sb_partitioning(x, BLOCK_64X64)) = BLOCK_64X64;
@@ -1744,17 +1764,17 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) {
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_64X64);
}
-static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
- int *totalrate) {
+static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
+ int mi_row, TOKENEXTRA **tp, int *totalrate) {
VP9_COMMON * const cm = &cpi->common;
int mi_col;
// Initialize the left context for the new SB row
- vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
- vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
+ vpx_memset(&cpi->left_context, 0, sizeof(cpi->left_context));
+ vpx_memset(cpi->left_seg_context, 0, sizeof(cpi->left_seg_context));
// Code each SB in the row
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
mi_col += MI_BLOCK_SIZE) {
int dummy_rate;
int64_t dummy_dist;
@@ -1762,7 +1782,7 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
vp9_zero(cpi->mb.pred_mv);
if (cpi->sf.reference_masking)
- rd_pick_reference_frame(cpi, mi_row, mi_col);
+ rd_pick_reference_frame(cpi, tile, mi_row, mi_col);
if (cpi->sf.use_lastframe_partitioning ||
cpi->sf.use_one_partition_size_always ) {
@@ -1772,9 +1792,9 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
cpi->mb.source_variance = UINT_MAX;
if (cpi->sf.use_one_partition_size_always) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- set_partitioning(cpi, mi_8x8, mi_row, mi_col);
- rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+ set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ set_partitioning(cpi, tile, mi_8x8, mi_row, mi_col);
+ rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
} else {
if ((cpi->common.current_video_frame
@@ -1788,28 +1808,28 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
sb_has_motion(cpi, prev_mi_8x8))) {
// If required set upper and lower partition size limits
if (cpi->sf.auto_min_max_partition_size) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- rd_auto_partition_range(cpi, mi_row, mi_col,
+ set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, tile, mi_row, mi_col,
&cpi->sf.min_partition_size,
&cpi->sf.max_partition_size);
}
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64,
+ rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1, INT64_MAX);
} else {
copy_partitioning(cpi, mi_8x8, prev_mi_8x8);
- rd_use_partition(cpi, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
+ rd_use_partition(cpi, tile, mi_8x8, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1);
}
}
} else {
// If required set upper and lower partition size limits
if (cpi->sf.auto_min_max_partition_size) {
- set_offsets(cpi, mi_row, mi_col, BLOCK_64X64);
- rd_auto_partition_range(cpi, mi_row, mi_col,
+ set_offsets(cpi, tile, mi_row, mi_col, BLOCK_64X64);
+ rd_auto_partition_range(cpi, tile, mi_row, mi_col,
&cpi->sf.min_partition_size,
&cpi->sf.max_partition_size);
}
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_64X64,
+ rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rate, &dummy_dist, 1, INT64_MAX);
}
}
@@ -1836,7 +1856,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// TODO(jkoleszar): are these initializations required?
setup_pre_planes(xd, 0, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]],
0, 0, NULL);
- setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
+ setup_dst_planes(xd, get_frame_new_buffer(cm), 0, 0);
setup_block_dptrs(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
@@ -1856,16 +1876,17 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context[0], 0,
- sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * aligned_mi_cols);
- vpx_memset(cm->above_seg_context, 0,
- sizeof(PARTITION_CONTEXT) * aligned_mi_cols);
+ vpx_memset(cpi->above_context[0], 0,
+ sizeof(*cpi->above_context[0]) *
+ 2 * aligned_mi_cols * MAX_MB_PLANE);
+ vpx_memset(cpi->above_seg_context, 0,
+ sizeof(*cpi->above_seg_context) * aligned_mi_cols);
}
static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
if (lossless) {
// printf("Switching to lossless\n");
- cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
+ cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add;
cpi->mb.optimize = 0;
cpi->common.lf.filter_level = 0;
@@ -1873,7 +1894,7 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
cpi->common.tx_mode = ONLY_4X4;
} else {
// printf("Not lossless\n");
- cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
+ cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add;
}
}
@@ -1907,9 +1928,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
totalrate = 0;
- // Reset frame count of inter 0,0 motion vector usage.
- cpi->inter_zz_count = 0;
-
vp9_zero(cm->counts.switchable_interp);
vp9_zero(cpi->tx_stepdown_count);
@@ -1963,16 +1981,15 @@ static void encode_frame_internal(VP9_COMP *cpi) {
const int tile_rows = 1 << cm->log2_tile_rows;
for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- vp9_get_tile_row_offsets(cm, tile_row);
-
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile;
TOKENEXTRA *tp_old = tp;
// For each row of SBs in the frame
- vp9_get_tile_col_offsets(cm, tile_col);
- for (mi_row = cm->cur_tile_mi_row_start;
- mi_row < cm->cur_tile_mi_row_end; mi_row += 8)
- encode_sb_row(cpi, mi_row, &tp, &totalrate);
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
+ for (mi_row = tile.mi_row_start;
+ mi_row < tile.mi_row_end; mi_row += 8)
+ encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate);
cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -2188,7 +2205,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
if (cpi->sf.RD) {
int i, pred_type;
- INTERPOLATIONFILTERTYPE filter_type;
+ INTERPOLATION_TYPE filter_type;
/*
* This code does a single RD pass over the whole frame assuming
* either compound, single or hybrid prediction as per whatever has
@@ -2256,7 +2273,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
}
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
const int64_t diff = cpi->rd_filter_diff[i] / cpi->common.MBs;
cpi->rd_filter_threshes[frame_type][i] =
(cpi->rd_filter_threshes[frame_type][i] + diff) / 2;
@@ -2470,7 +2487,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
(mbmi->skip_coeff ||
vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
const uint8_t context = vp9_get_pred_context_tx_size(xd);
- update_tx_counts(bsize, context, mbmi->tx_size, &cm->counts.tx);
+ ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
} else {
int x, y;
TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
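
The vp9_encodeframe.c hunks above follow one pattern: the tile bounds and the above/left segmentation contexts are passed in explicitly (a TileInfo plus the encoder-owned cpi->above_seg_context / cpi->left_seg_context) instead of being read from the shared VP9_COMMON. Below is a minimal sketch of the resulting superblock-row loop shape; tile_info_sketch, MI_BLOCK_SIZE_SK and the function names are simplified stand-ins, not the real libvpx definitions.

#include <stddef.h>
#include <string.h>

/* Stand-in for TileInfo; only the fields used by the loops above. */
typedef struct {
  int mi_row_start, mi_row_end;
  int mi_col_start, mi_col_end;
} tile_info_sketch;

#define MI_BLOCK_SIZE_SK 8  /* superblock size in mi (8x8) units */

static void encode_sb_row_sketch(const tile_info_sketch *tile, int mi_row,
                                 unsigned char *left_seg_context,
                                 size_t left_seg_size) {
  int mi_col;
  /* Reset the encoder-owned left contexts once per superblock row, as the
   * hunk replacing cm->left_seg_context with cpi->left_seg_context does. */
  memset(left_seg_context, 0, left_seg_size);
  /* Walk the tile's own column bounds instead of cm->cur_tile_mi_col_*. */
  for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
       mi_col += MI_BLOCK_SIZE_SK) {
    /* rd_pick_partition(cpi, tile, tp, mi_row, mi_col, ...) or
     * rd_use_partition(cpi, tile, ...) would run here in the encoder. */
    (void)mi_row;
  }
}
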
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 3358fbbe9..e52e8ec1e 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -19,6 +19,7 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_dct.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rdopt.h"
@@ -365,9 +366,9 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
yoff = 32 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (x->use_lp32x32fdct)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4);
+ vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 4);
+ vp9_fdct32x32(src_diff, coeff, bw * 4);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -379,7 +380,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 16 * (block & twmask);
yoff = 16 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_short_fdct16x16(src_diff, coeff, bw * 4);
+ vp9_fdct16x16(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -391,7 +392,7 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
xoff = 8 * (block & twmask);
yoff = 8 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_short_fdct8x8(src_diff, coeff, bw * 4);
+ vp9_fdct8x8(src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -417,6 +418,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
struct encode_b_args *const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct optimize_ctx *const ctx = args->ctx;
struct macroblockd_plane *const pd = &xd->plane[plane];
const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
block);
@@ -428,14 +430,18 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
// TODO(jingning): per transformed block zero forcing only enabled for
// luma component. will integrate chroma components as well.
if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+ int x, y;
pd->eobs[block] = 0;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+ ctx->ta[plane][x] = 0;
+ ctx->tl[plane][y] = 0;
return;
}
vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
if (x->optimize)
- vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
+ vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
if (x->skip_encode || pd->eobs[block] == 0)
return;
@@ -461,6 +467,27 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
}
}
+static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct encode_b_args *const args = arg;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
+ block);
+
+ int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
+ pd->dst.buf, pd->dst.stride);
+
+ vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
+
+ if (pd->eobs[block] == 0)
+ return;
+
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+}
+
void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx ctx;
@@ -470,7 +497,7 @@ void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
if (x->optimize)
optimize_init_b(0, bsize, &arg);
- foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
+ foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg);
}
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -532,9 +559,9 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_subtract_block(32, 32, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
if (x->use_lp32x32fdct)
- vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4);
+ vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
else
- vp9_short_fdct32x32(src_diff, coeff, bw * 4);
+ vp9_fdct32x32(src_diff, coeff, bw * 4);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -556,10 +583,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
dst, pd->dst.stride, dst, pd->dst.stride);
vp9_subtract_block(16, 16, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
- if (tx_type != DCT_DCT)
- vp9_short_fht16x16(src_diff, coeff, bw * 4, tx_type);
- else
- vp9_short_fdct16x16(src_diff, coeff, bw * 4);
+ vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
@@ -581,10 +605,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
dst, pd->dst.stride, dst, pd->dst.stride);
vp9_subtract_block(8, 8, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
- if (tx_type != DCT_DCT)
- vp9_short_fht8x8(src_diff, coeff, bw * 4, tx_type);
- else
- vp9_short_fdct8x8(src_diff, coeff, bw * 4);
+ vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan, iscan);
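
The vp9_encodemb.c hunks above rename the forward transforms (vp9_short_fdct* becomes vp9_fdct*, vp9_short_walsh4x4 becomes vp9_fwht4x4) and drop the DCT_DCT fallback at the call sites, so the hybrid transforms are now called with the transform type as their first argument. A small sketch of that calling convention follows; the enum and function-pointer type are assumptions that only mirror the parameter order, not the real prototypes in vp9/encoder/vp9_dct.h.

#include <stdint.h>

typedef enum { SK_DCT_DCT = 0, SK_ADST_DCT, SK_DCT_ADST, SK_ADST_ADST } sk_tx_type;
typedef void (*sk_fht_fn)(sk_tx_type tx_type, const int16_t *input,
                          int16_t *output, int stride);

static void fwd_txfm_16x16_sketch(sk_fht_fn fht16x16, sk_tx_type tx_type,
                                  const int16_t *src_diff, int16_t *coeff,
                                  int stride) {
  /* Before: if (tx_type != DCT_DCT) vp9_short_fht16x16(src_diff, coeff,
   *                                                    stride, tx_type);
   *         else                    vp9_short_fdct16x16(src_diff, coeff,
   *                                                     stride);
   * After:  a single call; the new call sites imply the DCT_DCT case is
   *         handled inside vp9_fht16x16 itself. */
  fht16x16(tx_type, src_diff, coeff, stride);
}
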
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 9ebcc4983..e2c6c4c0c 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -124,8 +124,9 @@ static void build_nmv_component_cost_table(int *mvcost,
}
}
-static int update_mv(vp9_writer *w, const unsigned int ct[2],
- vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) {
+static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
+ vp9_prob upd_p) {
+ const vp9_prob new_p = get_binary_prob(ct[0], ct[1]);
vp9_prob mod_p = new_p | 1;
const int cur_b = cost_branch256(ct, *cur_p);
const int mod_b = cost_branch256(ct, mod_p);
@@ -143,7 +144,6 @@ static int update_mv(vp9_writer *w, const unsigned int ct[2],
static void counts_to_nmv_context(
nmv_context_counts *nmv_count,
- nmv_context *prob,
int usehp,
unsigned int (*branch_ct_joint)[2],
unsigned int (*branch_ct_sign)[2],
@@ -156,29 +156,24 @@ static void counts_to_nmv_context(
unsigned int (*branch_ct_hp)[2]) {
int i, j, k;
vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
- prob->joints,
branch_ct_joint,
nmv_count->joints, 0);
for (i = 0; i < 2; ++i) {
const uint32_t s0 = nmv_count->comps[i].sign[0];
const uint32_t s1 = nmv_count->comps[i].sign[1];
- prob->comps[i].sign = get_binary_prob(s0, s1);
branch_ct_sign[i][0] = s0;
branch_ct_sign[i][1] = s1;
vp9_tree_probs_from_distribution(vp9_mv_class_tree,
- prob->comps[i].classes,
- branch_ct_classes[i],
- nmv_count->comps[i].classes, 0);
+ branch_ct_classes[i],
+ nmv_count->comps[i].classes, 0);
vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
- prob->comps[i].class0,
branch_ct_class0[i],
nmv_count->comps[i].class0, 0);
for (j = 0; j < MV_OFFSET_BITS; ++j) {
const uint32_t b0 = nmv_count->comps[i].bits[j][0];
const uint32_t b1 = nmv_count->comps[i].bits[j][1];
- prob->comps[i].bits[j] = get_binary_prob(b0, b1);
branch_ct_bits[i][j][0] = b0;
branch_ct_bits[i][j][1] = b1;
}
@@ -186,12 +181,10 @@ static void counts_to_nmv_context(
for (i = 0; i < 2; ++i) {
for (k = 0; k < CLASS0_SIZE; ++k) {
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].class0_fp[k],
branch_ct_class0_fp[i][k],
nmv_count->comps[i].class0_fp[k], 0);
}
vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].fp,
branch_ct_fp[i],
nmv_count->comps[i].fp, 0);
}
@@ -202,11 +195,9 @@ static void counts_to_nmv_context(
const uint32_t hp0 = nmv_count->comps[i].hp[0];
const uint32_t hp1 = nmv_count->comps[i].hp[1];
- prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
branch_ct_class0_hp[i][0] = c0_hp0;
branch_ct_class0_hp[i][1] = c0_hp1;
- prob->comps[i].hp = get_binary_prob(hp0, hp1);
branch_ct_hp[i][0] = hp0;
branch_ct_hp[i][1] = hp1;
}
@@ -215,7 +206,6 @@ static void counts_to_nmv_context(
void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
int i, j;
- nmv_context prob;
unsigned int branch_ct_joint[MV_JOINTS - 1][2];
unsigned int branch_ct_sign[2][2];
unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
@@ -227,30 +217,28 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
unsigned int branch_ct_hp[2][2];
nmv_context *mvc = &cpi->common.fc.nmvc;
- counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
+ counts_to_nmv_context(&cpi->NMVcount, usehp,
branch_ct_joint, branch_ct_sign, branch_ct_classes,
branch_ct_class0, branch_ct_bits,
branch_ct_class0_fp, branch_ct_fp,
branch_ct_class0_hp, branch_ct_hp);
for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
- NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_joint[j], &mvc->joints[j], NMV_UPDATE_PROB);
for (i = 0; i < 2; ++i) {
- update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
- prob.comps[i].sign, NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign, NMV_UPDATE_PROB);
for (j = 0; j < MV_CLASSES - 1; ++j)
update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
- prob.comps[i].classes[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
for (j = 0; j < CLASS0_SIZE - 1; ++j)
update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
- prob.comps[i].class0[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
for (j = 0; j < MV_OFFSET_BITS; ++j)
update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
- prob.comps[i].bits[j], NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
}
for (i = 0; i < 2; ++i) {
@@ -258,21 +246,19 @@ void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
int k;
for (k = 0; k < 3; ++k)
update_mv(bc, branch_ct_class0_fp[i][j][k],
- &mvc->comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
+ &mvc->comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
}
for (j = 0; j < 3; ++j)
- update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
- prob.comps[i].fp[j], NMV_UPDATE_PROB);
+ update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j], NMV_UPDATE_PROB);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
- prob.comps[i].class0_hp, NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
- prob.comps[i].hp, NMV_UPDATE_PROB);
+ NMV_UPDATE_PROB);
}
}
}
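
In vp9_encodemv.c, counts_to_nmv_context() no longer fills a temporary nmv_context; update_mv() now derives the candidate probability from the branch counts itself via get_binary_prob(). The sketch below shows the kind of gating decision involved: binary_prob_sketch() and branch_bits_sketch() are floating-point stand-ins for get_binary_prob() and cost_branch256(), and the acceptance rule is a simplified assumption rather than the exact logic of update_mv().

#include <math.h>

/* Approximate probability (1..255) of the zero branch given counts. */
static int binary_prob_sketch(unsigned int n0, unsigned int n1) {
  const unsigned int den = n0 + n1;
  int p = den ? (int)((256ULL * n0 + den / 2) / den) : 128;
  if (p < 1) p = 1;
  if (p > 255) p = 255;
  return p;
}

/* Bits needed to code n0 zeros and n1 ones with probability p/256
 * (a floating-point stand-in for cost_branch256()). */
static double branch_bits_sketch(const unsigned int ct[2], int p) {
  return -(ct[0] * log2(p / 256.0) + ct[1] * log2(1.0 - p / 256.0));
}

/* Return nonzero if signalling the new probability pays for itself. */
static int update_worthwhile_sketch(const unsigned int ct[2], int cur_p,
                                    double update_cost_bits) {
  const int new_p = binary_prob_sketch(ct[0], ct[1]) | 1;  /* keep it odd */
  return branch_bits_sketch(ct, cur_p) >
         branch_bits_sketch(ct, new_p) + update_cost_bits;
}
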
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index caf41625c..6a3555d68 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -481,13 +481,14 @@ void vp9_first_pass(VP9_COMP *cpi) {
MACROBLOCK *const x = &cpi->mb;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
+ TileInfo tile;
int recon_yoffset, recon_uvoffset;
const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
const int gld_yv12_idx = cm->ref_frame_map[cpi->gld_fb_idx];
YV12_BUFFER_CONFIG *const lst_yv12 = &cm->yv12_fb[lst_yv12_idx];
- YV12_BUFFER_CONFIG *const new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
YV12_BUFFER_CONFIG *const gld_yv12 = &cm->yv12_fb[gld_yv12_idx];
+ YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
const int recon_y_stride = lst_yv12->y_stride;
const int recon_uv_stride = lst_yv12->uv_stride;
int64_t intra_error = 0;
@@ -532,6 +533,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
vp9_initialize_rd_consts(cpi);
}
+ // tiling is ignored in the first pass
+ vp9_tile_init(&tile, cm, 0, 0);
+
// for each macroblock row in image
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
int_mv best_ref_mv;
@@ -578,11 +582,12 @@ void vp9_first_pass(VP9_COMP *cpi) {
}
}
xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
- set_mi_row_col(cm, xd,
+ set_mi_row_col(xd, &tile,
mb_row << 1,
- 1 << mi_height_log2(xd->mi_8x8[0]->mbmi.sb_type),
+ num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type],
mb_col << 1,
- 1 << mi_width_log2(xd->mi_8x8[0]->mbmi.sb_type));
+ num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type],
+ cm->mi_rows, cm->mi_cols);
if (cpi->sf.variance_adaptive_quantization) {
int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type);
@@ -2164,17 +2169,14 @@ void vp9_second_pass(VP9_COMP *cpi) {
cpi->ni_av_qi = tmp_q;
cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
-#ifndef ONE_SHOT_Q_ESTIMATE
// Limit the maxq value returned subsequently.
// This increases the risk of overspend or underspend if the initial
// estimate for the clip is bad, but helps prevent excessive
// variation in Q, especially near the end of a clip
// where for example a small overspend may cause Q to crash
adjust_maxq_qrange(cpi);
-#endif
}
-#ifndef ONE_SHOT_Q_ESTIMATE
// The last few frames of a clip almost always have too few or too many
// bits and for the sake of over-exact rate control we don't want to make
// radical adjustments to the allowed quantizer range just to use up a
@@ -2197,7 +2199,6 @@ void vp9_second_pass(VP9_COMP *cpi) {
cpi->active_worst_quality =
adjust_active_maxq(cpi->active_worst_quality, tmp_q);
}
-#endif
}
vp9_zero(this_frame);
if (EOF == input_stats(cpi, &this_frame))
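
vp9_first_pass() now initialises one TileInfo (tile 0,0) up front and passes it to set_mi_row_col() together with the num_8x8_blocks_*_lookup sizes. Assuming a single-tile configuration, as the "tiling is ignored in the first pass" comment suggests, that tile spans the whole frame; the stand-in below sketches only that assumption, with vp9_tile_init() being the real initialiser.

typedef struct {
  int mi_row_start, mi_row_end;
  int mi_col_start, mi_col_end;
} first_pass_tile_sketch;

/* Under the single-tile assumption, tile (0,0) covers every mi row and
 * column, so the clamping done through set_mi_row_col() sees frame bounds. */
static void init_first_pass_tile_sketch(first_pass_tile_sketch *tile,
                                        int mi_rows, int mi_cols) {
  tile->mi_row_start = 0;
  tile->mi_row_end = mi_rows;
  tile->mi_col_start = 0;
  tile->mi_col_end = mi_cols;
}
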
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 644363158..7b605b212 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -194,8 +194,8 @@ static void update_mbgraph_mb_stats
x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
x->plane[0].src.stride = buf->y_stride;
- xd->plane[0].dst.buf = cm->yv12_fb[cm->new_fb_idx].y_buffer + mb_y_offset;
- xd->plane[0].dst.stride = cm->yv12_fb[cm->new_fb_idx].y_stride;
+ xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+ xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
// do intra 16x16 prediction
intra_error = find_best_16x16_intra(cpi, mb_y_offset,
diff --git a/vp9/encoder/vp9_modecosts.c b/vp9/encoder/vp9_modecosts.c
index b867d8b71..7eb659232 100644
--- a/vp9/encoder/vp9_modecosts.c
+++ b/vp9/encoder/vp9_modecosts.c
@@ -36,7 +36,7 @@ void vp9_init_mode_costs(VP9_COMP *c) {
vp9_kf_uv_mode_prob[INTRA_MODES - 1],
vp9_intra_mode_tree);
- for (i = 0; i <= SWITCHABLE_FILTERS; ++i)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
cm->fc.switchable_interp_prob[i],
vp9_switchable_interp_tree);
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index accc338fb..f922f900a 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -312,6 +312,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
cpi->mb_activity_map = 0;
vpx_free(cpi->mb_norm_activity_map);
cpi->mb_norm_activity_map = 0;
+
+ vpx_free(cpi->above_context[0]);
+ cpi->above_context[0] = NULL;
+
+ vpx_free(cpi->above_seg_context);
+ cpi->above_seg_context = NULL;
}
// Computes a q delta (in "q index" terms) to get from a starting q value
@@ -959,9 +965,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->optimize_coefficients = 0;
}
- cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
+ cpi->mb.fwd_txm4x4 = vp9_fdct4x4;
if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) {
- cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
+ cpi->mb.fwd_txm4x4 = vp9_fwht4x4;
}
if (cpi->sf.subpel_search_method == SUBPEL_ITERATIVE) {
@@ -1026,11 +1032,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
}
- // Data used for real time vc mode to see if gf needs refreshing
- cpi->inter_zz_count = 0;
- cpi->gf_bad_count = 0;
- cpi->gf_update_recommended = 0;
-
vpx_free(cpi->mb_activity_map);
CHECK_MEM_ERROR(cm, cpi->mb_activity_map,
vpx_calloc(sizeof(unsigned int),
@@ -1040,6 +1041,19 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
CHECK_MEM_ERROR(cm, cpi->mb_norm_activity_map,
vpx_calloc(sizeof(unsigned int),
cm->mb_rows * cm->mb_cols));
+
+ // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
+ // block where mi unit size is 8x8.
+ vpx_free(cpi->above_context[0]);
+ CHECK_MEM_ERROR(cm, cpi->above_context[0],
+ vpx_calloc(2 * mi_cols_aligned_to_sb(cm->mi_cols) *
+ MAX_MB_PLANE,
+ sizeof(*cpi->above_context[0])));
+
+ vpx_free(cpi->above_seg_context);
+ CHECK_MEM_ERROR(cm, cpi->above_seg_context,
+ vpx_calloc(mi_cols_aligned_to_sb(cm->mi_cols),
+ sizeof(*cpi->above_seg_context)));
}
@@ -1072,6 +1086,15 @@ static void update_frame_size(VP9_COMP *cpi) {
vp9_init_dsmotion_compensation(&cpi->mb, y_stride);
}
}
+
+ {
+ int i;
+ for (i = 1; i < MAX_MB_PLANE; ++i) {
+ cpi->above_context[i] = cpi->above_context[0] +
+ i * sizeof(*cpi->above_context[0]) * 2 *
+ mi_cols_aligned_to_sb(cm->mi_cols);
+ }
+ }
}
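
The two vp9_onyx_if.c hunks above allocate the encoder's own above-row contexts: a single buffer of 2 * aligned_mi_cols entropy contexts per plane (two per mi column, i.e. one per 4x4 transform column inside an 8x8 mi unit) plus aligned_mi_cols partition contexts, with the per-plane pointers carved out of that one allocation in update_frame_size(). A minimal sketch of the layout, assuming a one-byte ENTROPY_CONTEXT and three planes:

#include <stdlib.h>

#define SK_MAX_MB_PLANE 3

/* Returns 0 on success; above_ctx[0] owns the single shared buffer. */
static int alloc_above_context_sketch(unsigned char *above_ctx[SK_MAX_MB_PLANE],
                                      int aligned_mi_cols) {
  const size_t per_plane = 2 * (size_t)aligned_mi_cols;  /* 2 ctx per mi col */
  int i;
  above_ctx[0] = calloc(per_plane * SK_MAX_MB_PLANE, sizeof(*above_ctx[0]));
  if (above_ctx[0] == NULL) return -1;
  for (i = 1; i < SK_MAX_MB_PLANE; ++i)
    above_ctx[i] = above_ctx[0] + i * per_plane;  /* planes share the buffer */
  return 0;
}
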
@@ -1157,7 +1180,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
int i;
cpi->oxcf = *oxcf;
- cpi->goldfreq = 7;
cm->version = oxcf->version;
@@ -2669,8 +2691,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
vp9_clear_system_state(); // __asm emms;
- recon_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
+ recon_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
if (cpi->twopass.total_left_stats.coded_error != 0.0)
fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
@@ -2829,19 +2850,11 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
cpi->active_best_quality = cpi->cq_target_quality;
} else {
-#ifdef ONE_SHOT_Q_ESTIMATE
-#ifdef STRICT_ONE_SHOT_Q
- cpi->active_best_quality = q;
-#else
- cpi->active_best_quality = inter_minq[q];
-#endif
-#else
cpi->active_best_quality = inter_minq[q];
// 1-pass: for now, use the average Q for the active_best, if it's lower
// than active_worst.
- if (cpi->pass == 0 && (cpi->avg_frame_qindex < cpi->active_worst_quality))
+ if (cpi->pass == 0 && (cpi->avg_frame_qindex < q))
cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex];
-#endif
// For the constrained quality mode we don't want
// q to fall below the cq level.
@@ -2875,7 +2888,14 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
*top_index =
(cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
+      // If this is the first (key) frame in 1-pass, active best is the user's
+      // best-allowed value, and top_index is left at active_worst.
+ if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
+ cpi->active_best_quality = cpi->oxcf.best_allowed_q;
+ *top_index = cpi->oxcf.worst_allowed_q;
+ }
} else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
*top_index =
(cpi->active_worst_quality + cpi->active_best_quality) / 2;
@@ -3169,8 +3189,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// Special case handling for forced key frames
if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
int last_q = q;
- int kf_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
+ int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
int high_err_target = cpi->ambient_err;
int low_err_target = cpi->ambient_err >> 1;
@@ -3306,14 +3325,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// fixed interval. Note the reconstruction error if it is the frame before
// the forced key frame
if (cpi->next_key_frame_forced && (cpi->twopass.frames_to_key == 0)) {
- cpi->ambient_err = vp9_calc_ss_err(cpi->Source,
- &cm->yv12_fb[cm->new_fb_idx]);
+ cpi->ambient_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
}
if (cm->frame_type == KEY_FRAME)
cpi->refresh_last_frame = 1;
- cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
+ cm->frame_to_show = get_frame_new_buffer(cm);
#if WRITE_RECON_BUFFER
if (cm->show_frame)
@@ -3912,7 +3930,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
cm->frame_flags = *frame_flags;
// Reset the frame pointers to the current frame size
- vp9_realloc_frame_buffer(&cm->yv12_fb[cm->new_fb_idx],
+ vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
VP9BORDERINPIXELS);
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index b1dfcbb9c..9429c7fed 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -29,11 +29,6 @@
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/encoder/vp9_lookahead.h"
-// Experimental rate control switches
-#if CONFIG_ONESHOTQ
-#define ONE_SHOT_Q_ESTIMATE 0
-#define STRICT_ONE_SHOT_Q 0
-#endif
#define DISABLE_RC_LONG_TERM_MEM 0
// #define MODE_TEST_HIT_STATS
@@ -396,9 +391,9 @@ typedef struct VP9_COMP {
// FIXME(rbultje) can this overflow?
int rd_tx_select_threshes[4][TX_MODES];
- int64_t rd_filter_diff[SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_threshes[4][SWITCHABLE_FILTERS + 1];
- int64_t rd_filter_cache[SWITCHABLE_FILTERS + 1];
+ int64_t rd_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t rd_filter_threshes[4][SWITCHABLE_FILTER_CONTEXTS];
+ int64_t rd_filter_cache[SWITCHABLE_FILTER_CONTEXTS];
int RDMULT;
int RDDIV;
@@ -506,14 +501,9 @@ typedef struct VP9_COMP {
int decimation_count;
// for real time encoding
- int avg_encode_time; // microsecond
- int avg_pick_mode_time; // microsecond
int speed;
- unsigned int cpu_freq; // Mhz
int compressor_speed;
- int interquantizer;
- int goldfreq;
int auto_worst_q;
int cpu_used;
int pass;
@@ -529,12 +519,6 @@ typedef struct VP9_COMP {
unsigned int max_mv_magnitude;
int mv_step_param;
- // Data used for real time conferencing mode to help determine if it
- // would be good to update the gf
- int inter_zz_count;
- int gf_bad_count;
- int gf_update_recommended;
-
unsigned char *segmentation_map;
// segment threshold for encode breakout
@@ -641,7 +625,7 @@ typedef struct VP9_COMP {
int dummy_packing; /* flag to indicate if packing is dummy */
- unsigned int switchable_interp_count[SWITCHABLE_FILTERS + 1]
+ unsigned int switchable_interp_count[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
unsigned int tx_stepdown_count[TX_SIZES];
@@ -675,6 +659,13 @@ typedef struct VP9_COMP {
// Debug / test stats
int64_t mode_test_hits[BLOCK_SIZES];
#endif
+
+ /* Y,U,V,(A) */
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[8];
} VP9_COMP;
static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 7ad8d1fb2..fca752524 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -22,12 +22,14 @@
extern int enc_debug;
#endif
-void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
- int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
- int16_t *dqcoeff_ptr, int16_t *dequant_ptr,
- int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2], zbin;
int x, y, z, sz;
@@ -86,14 +88,15 @@ void vp9_quantize_b_c(int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
*eob_ptr = eob + 1;
}
-void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
int skip_block,
- int16_t *zbin_ptr, int16_t *round_ptr,
- int16_t *quant_ptr, int16_t *quant_shift_ptr,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
- int16_t *dequant_ptr, int zbin_oq_value,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan) {
+ const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
int i, rc, eob;
int zbins[2], nzbins[2];
int x, y, z, sz;
@@ -174,25 +177,19 @@ static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
return res;
}
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks) {
- MACROBLOCKD *const xd = &mb->e_mbd;
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+ const int16_t *scan, const int16_t *iscan) {
+ MACROBLOCKD *const xd = &x->e_mbd;
const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
- const int16_t *scan = get_scan_4x4(tx_type);
- const int16_t *iscan = get_iscan_4x4(tx_type);
-
- vp9_quantize_b(BLOCK_OFFSET(mb->plane[pb_idx.plane].coeff, pb_idx.block),
- 16, mb->skip_block,
- mb->plane[pb_idx.plane].zbin,
- mb->plane[pb_idx.plane].round,
- mb->plane[pb_idx.plane].quant,
- mb->plane[pb_idx.plane].quant_shift,
- BLOCK_OFFSET(xd->plane[pb_idx.plane].qcoeff, pb_idx.block),
- BLOCK_OFFSET(xd->plane[pb_idx.plane].dqcoeff, pb_idx.block),
- xd->plane[pb_idx.plane].dequant,
- mb->plane[pb_idx.plane].zbin_extra,
- &xd->plane[pb_idx.plane].eobs[pb_idx.block],
- scan, iscan);
+ struct macroblock_plane* p = &x->plane[pb_idx.plane];
+ struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
+
+ vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+ 16, x->skip_block,
+ p->zbin, p->round, p->quant, p->quant_shift,
+ BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
+ BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
+ pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h
index 459aa3359..c078e1d41 100644
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -13,8 +13,9 @@
#include "vp9/encoder/vp9_block.h"
-void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
- int y_blocks);
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+ const int16_t *scan, const int16_t *iscan);
+
struct VP9_COMP;
void vp9_set_quantizer(struct VP9_COMP *cpi, int q);
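
With the new prototype, vp9_regular_quantize_b_4x4() takes the resolved scan and inverse-scan tables instead of a TX_TYPE, so the table lookup moves to the caller (the vp9_rdopt.c hunks below pass get_scan_4x4()/get_iscan_4x4()). The sketch below only mirrors the new parameter order, using an opaque stand-in for MACROBLOCK and an assumed function-pointer type.

#include <stdint.h>

typedef struct macroblock_sketch macroblock_sketch;  /* opaque stand-in */

typedef void (*regular_quantize_b_4x4_fn)(macroblock_sketch *x, int y_blocks,
                                          int b_idx, const int16_t *scan,
                                          const int16_t *iscan);

static void quantize_4x4_block_sketch(regular_quantize_b_4x4_fn quantize,
                                      macroblock_sketch *x, int block,
                                      const int16_t *scan,
                                      const int16_t *iscan) {
  /* Old call:  vp9_regular_quantize_b_4x4(x, block, tx_type, 16);
   * New call:  vp9_regular_quantize_b_4x4(x, 4, block,
   *                                       get_scan_4x4(tx_type),
   *                                       get_iscan_4x4(tx_type));
   * as at the vp9_rdopt.c call sites later in this diff. */
  quantize(x, 4, block, scan, iscan);
}
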
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f166b10a1..993919e5b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -251,8 +251,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- vp9_cost_tokens(cpi->mb.partition_cost[i],
- cm->fc.partition_prob[cm->frame_type][i],
+ vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i),
vp9_partition_tree);
/*rough estimate for costing*/
@@ -611,7 +610,7 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
// TODO(jingning): temporarily enabled only for luma component
rd = MIN(rd1, rd2);
if (plane == 0)
- x->zcoeff_blk[tx_size][block] = rd1 > rd2;
+ x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
args->this_rate += args->rate;
args->this_dist += args->dist;
@@ -933,14 +932,15 @@ static void super_block_yrd(VP9_COMP *cpi,
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
+ const int b_inter_mode = is_inter_block(mbmi);
assert(bs == mbmi->sb_type);
- if (mbmi->ref_frame[0] > INTRA_FRAME)
+ if (b_inter_mode)
vp9_subtract_sby(x, bs);
if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
(cpi->sf.tx_size_search_method != USE_FULL_RD &&
- mbmi->ref_frame[0] == INTRA_FRAME)) {
+ !b_inter_mode)) {
vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
ref_best_rd, bs);
@@ -950,7 +950,7 @@ static void super_block_yrd(VP9_COMP *cpi,
}
if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
- mbmi->ref_frame[0] > INTRA_FRAME) {
+ b_inter_mode) {
if (bs >= BLOCK_32X32)
model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
&r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
@@ -1031,10 +1031,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
ENTROPY_CONTEXT ta[2], tempa[2];
ENTROPY_CONTEXT tl[2], templ[2];
- TX_TYPE tx_type = DCT_DCT;
+
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
- int idx, idy, block;
+ int idx, idy;
uint8_t best_dst[8 * 8];
assert(ib < 4);
@@ -1070,8 +1070,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
const int16_t *nb;
uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
-
- block = ib + idy * 2 + idx;
+ const int block = ib + idy * 2 + idx;
+ TX_TYPE tx_type;
xd->mi_8x8[0]->bmi[block].as_mode = mode;
src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
@@ -1085,13 +1085,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
dst, dst_stride);
tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
+ get_scan_nb_4x4(tx_type, &scan, &nb);
+
if (tx_type != DCT_DCT)
vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
else
x->fwd_txm4x4(src_diff, coeff, 8);
- vp9_regular_quantize_b_4x4(x, block, tx_type, 16);
- get_scan_nb_4x4(tx_type, &scan, &nb);
+ vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type));
+
ratey += cost_coeffs(x, 0, block,
tempa + idx, templ + idy, TX_4X4, scan, nb);
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
@@ -1431,10 +1433,6 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv);
-static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize,
- int mi_row, int mi_col,
- int_mv *tmp_mv, int *rate_mv);
static int labels2mode(MACROBLOCK *x, int i,
MB_PREDICTION_MODE this_mode,
@@ -1561,7 +1559,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
coeff = BLOCK_OFFSET(p->coeff, k);
x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
- vp9_regular_quantize_b_4x4(x, k, DCT_DCT, 16);
+ vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT),
+ get_iscan_4x4(DCT_DCT));
thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
thissse += ssz;
@@ -1645,6 +1644,7 @@ static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
}
static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
BEST_SEG_INFO *bsi_buf, int filter_idx,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
@@ -1653,6 +1653,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
MB_PREDICTION_MODE this_mode;
MODE_INFO *mi = x->e_mbd.mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
+ struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
const int label_count = 4;
int64_t this_segment_rd = 0;
int label_mv_thresh;
@@ -1667,8 +1668,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
int subpelmv = 1, have_ref = 0;
const int has_second_rf = has_second_ref(mbmi);
- vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
- vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
+ vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
+ vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
v_fn_ptr = &cpi->fn_ptr[bsize];
@@ -1690,13 +1691,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
i = idy * 2 + idx;
frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
- vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
+ vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
&frame_mv[NEARESTMV][mbmi->ref_frame[0]],
&frame_mv[NEARMV][mbmi->ref_frame[0]],
i, 0, mi_row, mi_col);
if (has_second_rf) {
frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
- vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
+ vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
&frame_mv[NEARESTMV][mbmi->ref_frame[1]],
&frame_mv[NEARMV][mbmi->ref_frame[1]],
i, 1, mi_row, mi_col);
@@ -1746,7 +1747,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
+ vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
sizeof(bsi->rdstat[i][mode_idx].ta));
vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
@@ -1870,12 +1871,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
mi_buf_restore(x, orig_src, orig_pre);
}
- if (has_second_rf && this_mode == NEWMV &&
- mbmi->interp_filter == EIGHTTAP) {
+ if (has_second_rf) {
if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
continue;
+ }
+ if (has_second_rf && this_mode == NEWMV &&
+ mbmi->interp_filter == EIGHTTAP) {
// adjust src pointers
mi_buf_shift(x, i);
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
@@ -1950,6 +1953,13 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
sizeof(SEG_RDSTAT));
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].eobs =
+ ref_bsi->rdstat[i + 1][mode_idx].eobs;
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].eobs =
+ ref_bsi->rdstat[i + 2][mode_idx].eobs;
+
if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
mode_selected = this_mode;
best_rd = bsi->rdstat[i][mode_idx].brdcost;
@@ -1970,7 +1980,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
bsi->rdstat[i][mode_idx].brate, 0);
bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
- bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i];
+ bsi->rdstat[i][mode_idx].eobs = pd->eobs[i];
+ if (num_4x4_blocks_wide > 1)
+ bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1];
+ if (num_4x4_blocks_high > 1)
+ bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2];
}
if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
@@ -2026,6 +2040,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
}
static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int_mv *best_ref_mv,
int_mv *second_best_ref_mv,
int64_t best_rd,
@@ -2056,7 +2071,8 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < 4; i++)
bsi->modes[i] = ZEROMV;
- rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col);
+ rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs,
+ mi_row, mi_col);
if (bsi->segment_rd > best_rd)
return INT64_MAX;
@@ -2204,7 +2220,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int_mv *second_ref_mv,
int64_t comp_pred_diff[NB_PREDICTION_TYPES],
int64_t tx_size_diff[TX_MODES],
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) {
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
MACROBLOCKD *const xd = &x->e_mbd;
// Take a snapshot of the coding context so it can be
@@ -2222,7 +2238,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
- sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1));
+ sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
}
static void setup_pred_block(const MACROBLOCKD *xd,
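
Several rdopt.c arrays that were sized SWITCHABLE_FILTERS + 1, and the loops bounded by i <= SWITCHABLE_FILTERS, now use SWITCHABLE_FILTER_CONTEXTS instead. The lines below only record the size relationship those hunks imply; the exact definition and the value of SWITCHABLE_FILTERS belong to vp9_filter.h and are assumptions here.

/* Assumed relationship implied by the loop-bound changes in this diff. */
#define SK_SWITCHABLE_FILTERS 3
#define SK_SWITCHABLE_FILTER_CONTEXTS (SK_SWITCHABLE_FILTERS + 1)

/* e.g. the per-context RD bookkeeping arrays kept in VP9_COMP / rdopt.c: */
static long long rd_filter_diff_sketch[SK_SWITCHABLE_FILTER_CONTEXTS];
static long long rd_filter_cache_sketch[SK_SWITCHABLE_FILTER_CONTEXTS];
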
@@ -2252,6 +2268,7 @@ static void setup_pred_block(const MACROBLOCKD *xd,
}
static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int idx, MV_REFERENCE_FRAME frame_type,
BLOCK_SIZE block_size,
int mi_row, int mi_col,
@@ -2267,12 +2284,8 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
// set up scaling factors
scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
- scale[frame_type].x_offset_q4 =
- ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
- scale[frame_type].y_offset_q4 =
- ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp,
- REF_SCALE_SHIFT) & 0xf;
+ scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type],
+ mi_row * MI_SIZE, mi_col * MI_SIZE);
// TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
// use the UV scaling factors.
@@ -2280,7 +2293,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
&scale[frame_type], &scale[frame_type]);
// Gets an initial list of candidate vectors from neighbours and orders them
- vp9_find_mv_refs(&cpi->common, xd, xd->mi_8x8[0],
+ vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0],
xd->last_mi,
frame_type,
mbmi->ref_mvs[frame_type], mi_row, mi_col);
@@ -2294,7 +2307,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
// Further refinement that is encode side only to test the top few candidates
// in full and choose the best as the centre point for subsequent searches.
// The current implementation doesn't support scaling.
- if (!vp9_is_scaled(&scale[frame_type]) && block_size >= BLOCK_8X8)
+ if (!vp9_is_scaled(scale[frame_type].sfc) && block_size >= BLOCK_8X8)
mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
frame_type, block_size);
}
@@ -2317,6 +2330,7 @@ static INLINE int get_switchable_rate(const MACROBLOCK *x) {
}
static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
BLOCK_SIZE bsize,
int mi_row, int mi_col,
int_mv *tmp_mv, int *rate_mv) {
@@ -2501,9 +2515,9 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL);
}
- xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+ xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0],
mi_row, mi_col);
- xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+ xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1],
mi_row, mi_col);
scaled_first_yv12 = xd->plane[0].pre[0];
@@ -2613,6 +2627,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
BLOCK_SIZE bsize,
int64_t txfm_cache[],
int *rate2, int64_t *distortion,
@@ -2620,7 +2635,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate_y, int64_t *distortion_y,
int *rate_uv, int64_t *distortion_uv,
int *mode_excluded, int *disable_skip,
- INTERPOLATIONFILTERTYPE *best_filter,
+ INTERPOLATION_TYPE *best_filter,
int_mv (*mode_mv)[MAX_REF_FRAMES],
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
@@ -2647,6 +2662,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int orig_dst_stride[MAX_MB_PLANE];
int rs = 0;
+ if (is_comp_pred) {
+ if (frame_mv[refs[0]].as_int == INVALID_MV ||
+ frame_mv[refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+ }
+
if (this_mode == NEWMV) {
int rate_mv;
if (is_comp_pred) {
@@ -2665,13 +2686,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
&mbmi->ref_mvs[refs[1]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
- if (frame_mv[refs[0]].as_int == INVALID_MV ||
- frame_mv[refs[1]].as_int == INVALID_MV)
- return INT64_MAX;
*rate2 += rate_mv;
} else {
int_mv tmp_mv;
- single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
+ single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
+ &tmp_mv, &rate_mv);
*rate2 += rate_mv;
frame_mv[refs[0]].as_int =
xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
@@ -3082,6 +3101,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
int64_t *returndistortion,
@@ -3111,8 +3131,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_tx_diff[TX_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
- int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
int j;
int mode_index, best_mode_index = 0;
@@ -3122,7 +3142,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_inter_rd = INT64_MAX;
MB_PREDICTION_MODE best_intra_mode = DC_PRED;
MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
- INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
+ INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
int64_t dist_uv[TX_SIZES];
int skip_uv[TX_SIZES];
@@ -3150,7 +3170,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = INT64_MAX;
for (i = 0; i < TX_SIZES; i++)
rate_uv_intra[i] = INT_MAX;
@@ -3192,8 +3212,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
- mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
+ setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
+ block_size, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV],
yv12_mb, scale_factor);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -3437,7 +3458,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
} else {
mbmi->mode = this_mode;
compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
- this_rd = handle_inter_mode(cpi, x, bsize,
+ this_rd = handle_inter_mode(cpi, x, tile, bsize,
tx_cache,
&rate2, &distortion2, &skippable,
&rate_y, &distortion_y,
@@ -3520,17 +3541,16 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
// Keep record of best intra rd
- if (xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME &&
- is_intra_mode(xd->mi_8x8[0]->mbmi.mode) &&
+ if (!is_inter_block(&xd->mi_8x8[0]->mbmi) &&
this_rd < best_intra_rd) {
best_intra_rd = this_rd;
best_intra_mode = xd->mi_8x8[0]->mbmi.mode;
}
+
// Keep record of best inter rd with single reference
- if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME &&
- xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE &&
- !mode_excluded &&
- this_rd < best_inter_rd) {
+ if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
+ !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
+ !mode_excluded && this_rd < best_inter_rd) {
best_inter_rd = this_rd;
best_inter_ref_frame = ref_frame;
}
@@ -3538,7 +3558,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
@@ -3621,7 +3641,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
cm->mcomp_filter_type != BILINEAR) {
int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->mcomp_filter_type];
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
int64_t adj_rd;
// In cases of poor prediction, filter_cache[] can contain really big
// values, which actually are bigger than this_rd itself. This can
@@ -3743,7 +3763,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
if (!x->skip) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
if (best_filter_rd[i] == INT64_MAX)
best_filter_diff[i] = 0;
else
@@ -3779,6 +3799,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ const TileInfo *const tile,
int mi_row, int mi_col,
int *returnrate,
int64_t *returndistortion,
@@ -3807,15 +3828,15 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_tx_diff[TX_MODES];
int64_t best_pred_diff[NB_PREDICTION_TYPES];
int64_t best_pred_rd[NB_PREDICTION_TYPES];
- int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
- int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
+ int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
+ int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
int mode_index, best_mode_index = 0;
unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
vp9_prob comp_mode_p;
int64_t best_inter_rd = INT64_MAX;
MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
- INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
+ INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
int64_t dist_uv[TX_SIZES];
int skip_uv[TX_SIZES];
@@ -3830,7 +3851,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
int best_skip2 = 0;
x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
- vp9_zero(x->zcoeff_blk);
+ vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
for (i = 0; i < 4; i++) {
int j;
@@ -3845,7 +3866,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = INT64_MAX;
for (i = 0; i < TX_SIZES; i++)
rate_uv_intra[i] = INT_MAX;
@@ -3863,8 +3884,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
- setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
- mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
+ setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
+ block_size, mi_row, mi_col,
+ frame_mv[NEARESTMV], frame_mv[NEARMV],
yv12_mb, scale_factor);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
@@ -3962,11 +3984,11 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
// TODO(jingning, jkoleszar): scaling reference frame not supported for
// sub8x8 blocks.
if (ref_frame > 0 &&
- vp9_is_scaled(&scale_factor[ref_frame]))
+ vp9_is_scaled(scale_factor[ref_frame].sfc))
continue;
if (second_ref_frame > 0 &&
- vp9_is_scaled(&scale_factor[second_ref_frame]))
+ vp9_is_scaled(scale_factor[second_ref_frame].sfc))
continue;
set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
@@ -4094,7 +4116,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->interp_filter = switchable_filter_index;
vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
- tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+ tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
&mbmi->ref_mvs[ref_frame][0],
second_ref,
best_yrd,
@@ -4130,8 +4152,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
tmp_best_sse = total_sse;
tmp_best_skippable = skippable;
tmp_best_mbmode = *mbmi;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < 4; i++) {
tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
+ x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
+ }
pred_exists = 1;
if (switchable_filter_index == 0 &&
cpi->sf.use_rd_breakout &&
@@ -4158,7 +4182,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (!pred_exists) {
// Handles the special case when a filter that is not in the
// switchable list (bilinear, 6-tap) is indicated at the frame level
- tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
+ tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
&mbmi->ref_mvs[ref_frame][0],
second_ref,
best_yrd,
@@ -4286,7 +4310,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (!disable_skip && ref_frame == INTRA_FRAME) {
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
- for (i = 0; i <= SWITCHABLE_FILTERS; i++)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
@@ -4364,7 +4388,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
cm->mcomp_filter_type != BILINEAR) {
int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
SWITCHABLE_FILTERS : cm->mcomp_filter_type];
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
int64_t adj_rd;
// In cases of poor prediction, filter_cache[] can contain really big
// values, which actually are bigger than this_rd itself. This can
@@ -4480,7 +4504,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
if (!x->skip) {
- for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
if (best_filter_rd[i] == INT64_MAX)
best_filter_diff[i] = 0;
else
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 0b0bb18d7..92fb23548 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -18,6 +18,8 @@
(((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
#define QIDX_SKIP_THRESH 115
+struct TileInfo;
+
int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex);
void vp9_initialize_rd_consts(VP9_COMP *cpi);
@@ -29,14 +31,22 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
PICK_MODE_CONTEXT *ctx, int64_t best_rd);
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
+ const struct TileInfo *const tile,
int mi_row, int mi_col,
- int *r, int64_t *d, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
+ const struct TileInfo *const tile,
int mi_row, int mi_col,
- int *r, int64_t *d, BLOCK_SIZE bsize,
- PICK_MODE_CONTEXT *ctx, int64_t best_rd);
+ int *returnrate,
+ int64_t *returndistortion,
+ BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
void vp9_init_me_luts();
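
The rd-pick entry points above now take the current tile explicitly. Below is a minimal sketch of what a call site looks like against the new prototypes; cpi, x, tile, mi_row, mi_col, bsize, ctx and best_rd are assumed to be provided by the caller (the real call sites live in vp9_encodeframe.c and are not part of this hunk).

    /* Illustrative only: shape of a call against the new declarations above. */
    int rate;
    int64_t dist;
    int64_t rd = vp9_rd_pick_inter_mode_sb(cpi, x, &tile, mi_row, mi_col,
                                           &rate, &dist, bsize, ctx, best_rd);
    (void)rd;  /* rate, distortion and the mode decision are consumed by the caller */
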
diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c
index 72e6be1e8..24f011f83 100644
--- a/vp9/encoder/vp9_segmentation.c
+++ b/vp9/encoder/vp9_segmentation.c
@@ -117,7 +117,8 @@ static int cost_segmap(int *segcounts, vp9_prob *probs) {
return cost;
}
-static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -132,7 +133,7 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
xd->mi_8x8 = mi_8x8;
segment_id = xd->mi_8x8[0]->mbmi.segment_id;
- set_mi_row_col(cm, xd, mi_row, bh, mi_col, bw);
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
// Count the number of hits on each segment with no prediction
no_pred_segcounts[segment_id]++;
@@ -157,7 +158,8 @@ static void count_segs(VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
}
-static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
+static void count_segs_sb(VP9_COMP *cpi, const TileInfo *const tile,
+ MODE_INFO **mi_8x8,
int *no_pred_segcounts,
int (*temporal_predictor_count)[2],
int *t_unpred_seg_counts,
@@ -175,19 +177,20 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
bh = num_8x8_blocks_high_lookup[mi_8x8[0]->mbmi.sb_type];
if (bw == bs && bh == bs) {
- count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, bs, bs, mi_row, mi_col);
} else if (bw == bs && bh < bs) {
- count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
- count_segs(cpi, mi_8x8 + hbs * mis, no_pred_segcounts,
+ count_segs(cpi, tile, mi_8x8 + hbs * mis, no_pred_segcounts,
temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
mi_row + hbs, mi_col);
} else if (bw < bs && bh == bs) {
- count_segs(cpi, mi_8x8, no_pred_segcounts, temporal_predictor_count,
+ count_segs(cpi, tile, mi_8x8, no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
- count_segs(cpi, mi_8x8 + hbs, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
+ count_segs(cpi, tile, mi_8x8 + hbs,
+ no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts,
+ hbs, bs, mi_row, mi_col + hbs);
} else {
const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
int n;
@@ -198,7 +201,7 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO **mi_8x8,
const int mi_dc = hbs * (n & 1);
const int mi_dr = hbs * (n >> 1);
- count_segs_sb(cpi, &mi_8x8[mi_dr * mis + mi_dc],
+ count_segs_sb(cpi, tile, &mi_8x8[mi_dr * mis + mi_dc],
no_pred_segcounts, temporal_predictor_count,
t_unpred_seg_counts,
mi_row + mi_dr, mi_col + mi_dc, subsize);
@@ -234,15 +237,18 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) {
// First of all generate stats regarding how well the last segment map
// predicts this one
for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
- vp9_get_tile_col_offsets(cm, tile_col);
- mi_ptr = cm->mi_grid_visible + cm->cur_tile_mi_col_start;
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, 0, tile_col);
+ mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
for (mi_row = 0; mi_row < cm->mi_rows;
mi_row += 8, mi_ptr += 8 * mis) {
mi = mi_ptr;
- for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
mi_col += 8, mi += 8)
- count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count,
- t_unpred_seg_counts, mi_row, mi_col, BLOCK_64X64);
+ count_segs_sb(cpi, &tile, mi, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row, mi_col, BLOCK_64X64);
}
}
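
For segment-map coding, the per-tile column bounds now come from a locally initialized TileInfo instead of the removed cm->cur_tile_mi_col_start/_end fields. A condensed view of the loop this hunk produces; the names are taken from the hunk, and the enclosing function is vp9_choose_segmap_coding_method():

    TileInfo tile;
    vp9_tile_init(&tile, cm, 0, tile_col);  /* tile row 0, column tile_col */
    mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
    for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
         mi_col += 8, mi += 8)
      count_segs_sb(cpi, &tile, mi, no_pred_segcounts,
                    temporal_predictor_count, t_unpred_seg_counts,
                    mi_row, mi_col, BLOCK_64X64);
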
diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index eb864d96c..387fc9056 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c
@@ -221,7 +221,7 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
}
void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
- unsigned int *ct) {
+ const unsigned int ct[2]) {
const vp9_prob upd = DIFF_UPDATE_PROB;
vp9_prob newp = get_binary_prob(ct[0], ct[1]);
const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 6ea05793d..2cace0378 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -38,14 +38,15 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
int stride,
int mv_row,
int mv_col,
- uint8_t *pred) {
+ uint8_t *pred,
+ struct scale_factors *scale) {
const int which_mv = 0;
MV mv = { mv_row, mv_col };
vp9_build_inter_predictor(y_mb_ptr, stride,
&pred[0], 16,
&mv,
- &xd->scale_factor[which_mv],
+ scale,
16, 16,
which_mv,
&xd->subpix, MV_PRECISION_Q3);
@@ -55,7 +56,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
vp9_build_inter_predictor(u_mb_ptr, stride,
&pred[256], 8,
&mv,
- &xd->scale_factor[which_mv],
+ scale,
8, 8,
which_mv,
&xd->subpix, MV_PRECISION_Q4);
@@ -63,7 +64,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
vp9_build_inter_predictor(v_mb_ptr, stride,
&pred[320], 8,
&mv,
- &xd->scale_factor[which_mv],
+ scale,
8, 8,
which_mv,
&xd->subpix, MV_PRECISION_Q4);
@@ -186,7 +187,8 @@ static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi,
static void temporal_filter_iterate_c(VP9_COMP *cpi,
int frame_count,
int alt_ref_index,
- int strength) {
+ int strength,
+ struct scale_factors *scale) {
int byte;
int frame;
int mb_col, mb_row;
@@ -280,7 +282,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
cpi->frames[frame]->y_stride,
mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.row,
mbd->mi_8x8[0]->bmi[0].as_mv[0].as_mv.col,
- predictor);
+ predictor, scale);
// Apply the filter (YUV)
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
@@ -374,6 +376,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead)
- (num_frames_backward + 1);
+ struct scale_factors scale;
+ struct scale_factors_common scale_comm;
+
switch (blur_type) {
case 1:
// Backward Blur
@@ -432,9 +437,9 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
#endif
// Setup scaling factors. Scaling on each of the arnr frames is not supported
- vp9_setup_scale_factors_for_frame(&cpi->mb.e_mbd.scale_factor[0],
- cm->yv12_fb[cm->new_fb_idx].y_crop_width,
- cm->yv12_fb[cm->new_fb_idx].y_crop_height,
+ vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
+ get_frame_new_buffer(cm)->y_crop_width,
+ get_frame_new_buffer(cm)->y_crop_height,
cm->width, cm->height);
// Setup frame pointers, NULL indicates frame not included in filter
@@ -447,7 +452,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) {
}
temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
- strength);
+ strength, &scale);
}
void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
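
The temporal filter no longer borrows xd->scale_factor[0]; vp9_temporal_filter_prepare() now builds the scale factors on the stack and passes them down to the predictor explicitly. A condensed view of the new flow, using only the calls shown in the hunks above:

    struct scale_factors scale;
    struct scale_factors_common scale_comm;

    /* Map the new frame buffer's crop size onto the coded size; the ARNR
     * frames themselves are never scaled. */
    vp9_setup_scale_factors_for_frame(&scale, &scale_comm,
                                      get_frame_new_buffer(cm)->y_crop_width,
                                      get_frame_new_buffer(cm)->y_crop_height,
                                      cm->width, cm->height);

    temporal_filter_iterate_c(cpi, frames_to_blur, frames_to_blur_backward,
                              strength, &scale);
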
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 550263aa8..7d4676e97 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -21,14 +21,6 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_entropy.h"
-/* Global event counters used for accumulating statistics across several
- compressions, then generating vp9_context.c = initial stats. */
-
-#ifdef ENTROPY_STATS
-vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
-#endif /* ENTROPY_STATS */
-
static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
const TOKENVALUE *vp9_dct_value_tokens_ptr;
static int dct_value_cost[DCT_MAX_VALUE * 2];
@@ -89,6 +81,7 @@ struct tokenize_b_args {
MACROBLOCKD *xd;
TOKENEXTRA **tp;
TX_SIZE tx_size;
+ uint8_t *token_cache;
};
static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -107,6 +100,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
VP9_COMP *cpi = args->cpi;
MACROBLOCKD *xd = args->xd;
TOKENEXTRA **tp = args->tp;
+ uint8_t *token_cache = args->token_cache;
struct macroblockd_plane *pd = &xd->plane[plane];
MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
int pt; /* near block/prev token context index */
@@ -121,7 +115,6 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
const int ref = is_inter_block(mbmi);
- uint8_t token_cache[1024];
const uint8_t *const band_translate = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
int aoff, loff;
@@ -205,7 +198,7 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
const int mb_skip_context = vp9_get_pred_context_mbskip(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
- struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size};
+ struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache};
mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize);
if (mbmi->skip_coeff) {
@@ -226,149 +219,6 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
}
}
-#ifdef ENTROPY_STATS
-void init_context_counters(void) {
- FILE *f = fopen("context.bin", "rb");
- if (!f) {
- vp9_zero(context_counters);
- } else {
- fread(context_counters, sizeof(context_counters), 1, f);
- fclose(f);
- }
-
- f = fopen("treeupdate.bin", "rb");
- if (!f) {
- vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
- } else {
- fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
- fclose(f);
- }
-}
-
-static void print_counter(FILE *f, vp9_coeff_accum *context_counters,
- int block_types, const char *header) {
- int type, ref, band, pt, t;
-
- fprintf(f, "static const vp9_coeff_count %s = {\n", header);
-
-#define Comma(X) (X ? "," : "")
- type = 0;
- do {
- ref = 0;
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- do {
- fprintf(f, "%s\n { /* %s */", Comma(type), ref ? "Inter" : "Intra");
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- const int64_t x = context_counters[type][ref][band][pt][t];
- const int y = (int) x;
-
- assert(x == (int64_t) y); /* no overflow handling yet */
- fprintf(f, "%s %d", Comma(t), y);
- } while (++t < 1 + MAX_ENTROPY_TOKENS);
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++ref < REF_TYPES);
- fprintf(f, "\n }");
- } while (++type < block_types);
- fprintf(f, "\n};\n");
-}
-
-static void print_probs(FILE *f, vp9_coeff_accum *context_counters,
- int block_types, const char *header) {
- int type, ref, band, pt, t;
-
- fprintf(f, "static const vp9_coeff_probs %s = {", header);
-
- type = 0;
-#define Newline(x, spaces) (x ? " " : "\n" spaces)
- do {
- fprintf(f, "%s%s{ /* block Type %d */",
- Comma(type), Newline(type, " "), type);
- ref = 0;
- do {
- fprintf(f, "%s%s{ /* %s */",
- Comma(band), Newline(band, " "), ref ? "Inter" : "Intra");
- band = 0;
- do {
- fprintf(f, "%s%s{ /* Coeff Band %d */",
- Comma(band), Newline(band, " "), band);
- pt = 0;
- do {
- unsigned int branch_ct[ENTROPY_NODES][2];
- unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1];
- vp9_prob coef_probs[ENTROPY_NODES];
-
- if (pt >= 3 && band == 0)
- break;
- for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t)
- coef_counts[t] = context_counters[type][ref][band][pt][t];
- vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs,
- branch_ct, coef_counts, 0);
- branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0];
- coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- fprintf(f, "%s %3d", Comma(t), coef_probs[t]);
- } while (++t < ENTROPY_NODES);
-
- fprintf(f, " }");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++ref < REF_TYPES);
- fprintf(f, "\n }");
- } while (++type < block_types);
- fprintf(f, "\n};\n");
-}
-
-void print_context_counters() {
- FILE *f = fopen("vp9_context.c", "w");
-
- fprintf(f, "#include \"vp9_entropy.h\"\n");
- fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
-
- /* print counts */
- print_counter(f, context_counters[TX_4X4], BLOCK_TYPES,
- "vp9_default_coef_counts_4x4[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_8X8], BLOCK_TYPES,
- "vp9_default_coef_counts_8x8[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_16X16], BLOCK_TYPES,
- "vp9_default_coef_counts_16x16[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_32X32], BLOCK_TYPES,
- "vp9_default_coef_counts_32x32[BLOCK_TYPES]");
-
- /* print coefficient probabilities */
- print_probs(f, context_counters[TX_4X4], BLOCK_TYPES,
- "default_coef_probs_4x4[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_8X8], BLOCK_TYPES,
- "default_coef_probs_8x8[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_16X16], BLOCK_TYPES,
- "default_coef_probs_16x16[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_32X32], BLOCK_TYPES,
- "default_coef_probs_32x32[BLOCK_TYPES]");
-
- fclose(f);
-
- f = fopen("context.bin", "wb");
- fwrite(context_counters, sizeof(context_counters), 1, f);
- fclose(f);
-}
-#endif
-
void vp9_tokenize_initialize() {
fill_value_tokens();
}
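
tokenize_b() no longer keeps its own 1024-byte token_cache on the stack; the cache held in the MACROBLOCK is now threaded through tokenize_b_args. A condensed view of the wiring, copied from the hunks above:

    struct tokenize_b_args {
      VP9_COMP *cpi;
      MACROBLOCKD *xd;
      TOKENEXTRA **tp;
      TX_SIZE tx_size;
      uint8_t *token_cache;  /* shared cache in MACROBLOCK, replacing the old
                                local uint8_t token_cache[1024] */
    };

    /* in vp9_tokenize_sb() */
    struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size,
                                  cpi->mb.token_cache};

    /* in tokenize_b() */
    uint8_t *token_cache = args->token_cache;
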
diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h
index b78e100ec..e24e31b80 100644
--- a/vp9/encoder/vp9_tokenize.h
+++ b/vp9/encoder/vp9_tokenize.h
@@ -28,9 +28,6 @@ typedef struct {
uint8_t skip_eob_node;
} TOKENEXTRA;
-typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
- [MAX_ENTROPY_TOKENS + 1];
-
int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int plane);
@@ -39,13 +36,6 @@ struct VP9_COMP;
void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize);
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-#endif
-
extern const int *vp9_dct_value_cost_ptr;
/* TODO: The Token field should be broken out into a separate char array to
* improve cache locality, since it's needed for costing when the rest of the
diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c
index 3179ae301..1f9cb8709 100644
--- a/vp9/encoder/vp9_vaq.c
+++ b/vp9/encoder/vp9_vaq.c
@@ -118,8 +118,8 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
((-xd->mb_to_bottom_edge) >> 3) : 0;
if (right_overflow || bottom_overflow) {
- int bw = (1 << (mi_width_log2(bs) + 3)) - right_overflow;
- int bh = (1 << (mi_height_log2(bs) + 3)) - bottom_overflow;
+ const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+ const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
int avg;
variance(x->plane[0].src.buf, x->plane[0].src.stride,
vp9_64_zeros, 0, bw, bh, &sse, &avg);
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c
index de47a5bf1..2d59775ce 100644
--- a/vp9/encoder/x86/vp9_dct32x32_sse2.c
+++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -29,7 +29,7 @@ static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
}
#endif
-void FDCT32x32_2D(int16_t *input,
+void FDCT32x32_2D(const int16_t *input,
int16_t *output_org, int stride) {
// Calculate pre-multiplied strides
const int str1 = stride;
@@ -93,13 +93,13 @@ void FDCT32x32_2D(int16_t *input,
// Note: even though all the loads below are aligned, using the aligned
// intrinsic make the code slightly slower.
if (0 == pass) {
- int16_t *in = &input[column_start];
+ const int16_t *in = &input[column_start];
// step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
// Note: the next four blocks could be in a loop. That would help the
// instruction cache but is actually slower.
{
- int16_t *ina = in + 0 * str1;
- int16_t *inb = in + 31 * str1;
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
__m128i *step1a = &step1[ 0];
__m128i *step1b = &step1[31];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -128,8 +128,8 @@ void FDCT32x32_2D(int16_t *input,
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- int16_t *ina = in + 4 * str1;
- int16_t *inb = in + 27 * str1;
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
__m128i *step1a = &step1[ 4];
__m128i *step1b = &step1[27];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -158,8 +158,8 @@ void FDCT32x32_2D(int16_t *input,
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- int16_t *ina = in + 8 * str1;
- int16_t *inb = in + 23 * str1;
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
__m128i *step1a = &step1[ 8];
__m128i *step1b = &step1[23];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
@@ -188,8 +188,8 @@ void FDCT32x32_2D(int16_t *input,
step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
}
{
- int16_t *ina = in + 12 * str1;
- int16_t *inb = in + 19 * str1;
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
__m128i *step1a = &step1[12];
__m128i *step1b = &step1[19];
const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index fa60e80eb..dc115018e 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -12,7 +12,7 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
-void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
@@ -111,7 +111,8 @@ void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int stride) {
}
}
-static INLINE void load_buffer_4x4(int16_t *input, __m128i *in, int stride) {
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+ int stride) {
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
__m128i mask;
@@ -242,7 +243,7 @@ void fadst4_1d_sse2(__m128i *in) {
transpose_4x4(in);
}
-void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[4];
load_buffer_4x4(input, in, stride);
@@ -270,7 +271,7 @@ void vp9_short_fht4x4_sse2(int16_t *input, int16_t *output,
write_buffer_4x4(output, in);
}
-void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
int pass;
// Constants
// When we use them, in one case, they are all the same. In all others
@@ -527,15 +528,16 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int stride) {
}
// load 8x8 array
-static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
- in[0] = _mm_load_si128((__m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((__m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((__m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((__m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((__m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((__m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((__m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((__m128i *)(input + 7 * stride));
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
in[0] = _mm_slli_epi16(in[0], 2);
in[1] = _mm_slli_epi16(in[1], 2);
@@ -1025,7 +1027,7 @@ void fadst8_1d_sse2(__m128i *in) {
array_transpose_8x8(in, in);
}
-void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[8];
load_buffer_8x8(input, in, stride);
@@ -1054,7 +1056,7 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
write_buffer_8x8(output, in, 8);
}
-void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
// The 2D transform is done with two passes which are actually pretty
// similar. In the first one, we transform the columns and transpose
// the results. In the second one, we transform the rows. To achieve that,
@@ -1064,7 +1066,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
int pass;
// We need an intermediate buffer between passes.
DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
- int16_t *in = input;
+ const int16_t *in = input;
int16_t *out = intermediate;
// Constants
// When we use them, in one case, they are all the same. In all others
@@ -1679,7 +1681,7 @@ void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int stride) {
}
}
-static INLINE void load_buffer_16x16(int16_t* input, __m128i *in0,
+static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
__m128i *in1, int stride) {
// load first 8 columns
load_buffer_8x8(input, in0, stride);
@@ -2531,7 +2533,7 @@ void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
}
-void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
+void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
load_buffer_16x16(input, in0, in1, stride);
@@ -2563,13 +2565,13 @@ void vp9_short_fht16x16_sse2(int16_t *input, int16_t *output,
write_buffer_16x16(output, in0, in1, 16);
}
-#define FDCT32x32_2D vp9_short_fdct32x32_rd_sse2
+#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION
-#define FDCT32x32_2D vp9_short_fdct32x32_sse2
+#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c" // NOLINT
#undef FDCT32x32_2D
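
vp9_dct32x32_sse2.c is used here as a template: it is included twice with FDCT32x32_2D and FDCT32x32_HIGH_PRECISION redefined before each include, so the hunks above only rename the generated symbols from vp9_short_fdct32x32* to vp9_fdct32x32*. The full instantiation pattern looks like the sketch below (the trailing undef pair after the second include is assumed; the excerpt above is cut off before it):

    #define FDCT32x32_2D vp9_fdct32x32_rd_sse2   /* low-precision (rd) variant */
    #define FDCT32x32_HIGH_PRECISION 0
    #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
    #undef FDCT32x32_2D
    #undef FDCT32x32_HIGH_PRECISION

    #define FDCT32x32_2D vp9_fdct32x32_sse2      /* high-precision variant */
    #define FDCT32x32_HIGH_PRECISION 1
    #include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
    #undef FDCT32x32_2D
    #undef FDCT32x32_HIGH_PRECISION
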
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index af6e66538..0badb0855 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -74,6 +74,7 @@ VP9_COMMON_SRCS-yes += common/vp9_scan.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
@@ -102,6 +103,11 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index ec2eac359..194203967 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -973,37 +973,16 @@ static vpx_codec_err_t vp9e_use_reference(vpx_codec_alg_priv_t *ctx,
static vpx_codec_err_t vp9e_set_roi_map(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
- vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
-
- if (data) {
- vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
-
- if (!vp9_set_roimap(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
- roi->delta_q, roi->delta_lf, roi->static_threshold))
- return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
+ // TODO(yaowu): Need to re-implement and test for VP9.
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t vp9e_set_activemap(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
- vpx_active_map_t *data = va_arg(args, vpx_active_map_t *);
-
- if (data) {
- vpx_active_map_t *map = (vpx_active_map_t *)data;
-
- if (!vp9_set_active_map(ctx->cpi, map->active_map, map->rows, map->cols))
- return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
+ // TODO(yaowu): Need to re-implement and test for VP9.
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
@@ -1014,8 +993,9 @@ static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
if (data) {
int res;
vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
- res = vp9_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
- scalemode.v_scaling_mode);
+ res = vp9_set_internal_size(ctx->cpi,
+ (VPX_SCALING)scalemode.h_scaling_mode,
+ (VPX_SCALING)scalemode.v_scaling_mode);
if (!res) {
return VPX_CODEC_OK;
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 6b923162f..5dacab454 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -172,9 +172,9 @@ static vpx_codec_err_t vp9_peek_si(const uint8_t *data,
rb.bit_offset += 1; // show frame
rb.bit_offset += 1; // error resilient
- if (vp9_rb_read_literal(&rb, 8) != SYNC_CODE_0 ||
- vp9_rb_read_literal(&rb, 8) != SYNC_CODE_1 ||
- vp9_rb_read_literal(&rb, 8) != SYNC_CODE_2) {
+ if (vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_0 ||
+ vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_1 ||
+ vp9_rb_read_literal(&rb, 8) != VP9_SYNC_CODE_2) {
return VPX_CODEC_UNSUP_BITSTREAM;
}
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index b454eee02..0993c6ce6 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -20,6 +20,7 @@ VP9_CX_SRCS-yes += vp9_cx_iface.c
VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
VP9_CX_SRCS-yes += encoder/vp9_boolhuff.c
VP9_CX_SRCS-yes += encoder/vp9_dct.c
+VP9_CX_SRCS-yes += encoder/vp9_dct.h
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
VP9_CX_SRCS-yes += encoder/vp9_encodeintra.c