Diffstat (limited to 'vp9')
34 files changed, 1330 insertions, 1090 deletions
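The DSPr2 sources in this diff all use the same 2-tap (bilinear) pattern: the two middle taps of the subpel filter, filter_x0[3] and filter_x0[4], are packed into one 32-bit word ("filter45") so dpa.w.ph can multiply-accumulate both taps at once; extp then shifts the accumulator right by FILTER_BITS, and lbux clamps the result through the vp9_ff_cropTbl table. A minimal scalar sketch of what that sequence computes for the first output pixel of a row, assuming FILTER_BITS == 7 (the value this patch moves into vp9_filter.h); bilinear_pixel is only an illustrative helper and clip_pixel stands in for the crop-table lookup:

#include <stdint.h>

/* Stand-in for the vp9_ff_cropTbl lookup performed by lbux. */
static uint8_t clip_pixel(int val) {
  return (val > 255) ? 255 : (val < 0) ? 0 : (uint8_t)val;
}

/* One output pixel of the 2-tap horizontal filter: scalar equivalent of the
   mtlo/dpa.w.ph/extp/lbux sequence using the packed "filter45" word. */
static uint8_t bilinear_pixel(const uint8_t *src, const int16_t *filter_x0) {
  const int round = 64;  /* 1 << (FILTER_BITS - 1); "vector4a"/"vector_64" in the asm */
  const int sum = round + src[0] * filter_x0[3] + src[1] * filter_x0[4];
  return clip_pixel(sum >> 7);  /* FILTER_BITS */
}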
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
index bc422bcf1..92644f2c8 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_dspr2.c
@@ -16,702 +16,702 @@
 #include "vp9/common/vp9_common.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/vp9_filter.h"
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
- int32_t y;
- uint8_t *cm = vp9_ff_cropTbl;
- uint8_t *dst_ptr;
- int32_t Temp1, Temp2;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- dst_ptr = dst;
- /* prefetch data to cache memory */
- vp9_prefetch_load(src + src_stride);
- vp9_prefetch_load(src + src_stride + 32);
-
- __asm__ __volatile__ (
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp2](%[cm]) \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p1], %[Temp1](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[p1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[tp2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[p2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [dst_ptr] "+r" (dst_ptr)
- : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
- [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
- );
-
- /* Next row... */
- src += src_stride;
- dst += 1;
- }
 }
 
-static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
- int32_t y;
- uint8_t *cm = vp9_ff_cropTbl;
- uint8_t *dst_ptr;
- uint32_t vector4a = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2, tp3;
- uint32_t p1, p2, p3, p4;
- uint8_t *odd_dst;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- vp9_prefetch_load(src + src_stride);
- vp9_prefetch_load(src + src_stride + 32);
-
- dst_ptr = dst;
- odd_dst = (dst_ptr + dst_stride);
-
- __asm__ __volatile__ (
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "balign %[tp3], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
- "lbux %[tp1], %[Temp3](%[cm]) \n\t"
- "extp %[p3], $ac1, 31 \n\t"
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sb %[Temp2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbux %[Temp1], %[p3](%[cm]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tp3] \n\t"
- "preceu.ph.qbl %[p4], %[tp3] \n\t"
- "sb %[Temp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
-
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp1], %[Temp3](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- /* odd 3. pixel */
- "lbux %[tp3], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 4. pixel */
- "sb %[tp3], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
- "lbux %[p1], %[Temp1](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[p4], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[p2], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[p1], 0(%[odd_dst]) \n\t"
-
- : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
- : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
- [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
- );
-
- /* Next row... */
- src += src_stride;
- dst += 1;
- }
 }
 
-static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h,
- int32_t count) {
- int32_t c, y;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = vp9_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- vp9_prefetch_load(src_ptr + src_stride);
- vp9_prefetch_load(src_ptr + src_stride + 32);
-
- src = src_ptr;
- dst = dst_ptr;
-
- odd_dst = (dst + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__ (
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload1], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p5], %[qload1] \n\t"
- "ulw %[qload2], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload2] \n\t"
- "preceu.ph.qbl %[p5], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
- : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
- [cm] "r" (cm),
- [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
- );
-
- src += 16;
- dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
- odd_dst = (dst + dst_stride);
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += 1;
- }
-}
-
-static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
- int32_t src_stride,
- uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0,
- int32_t h) {
- int32_t c, y;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = vp9_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- vp9_prefetch_load(src_ptr + src_stride);
- vp9_prefetch_load(src_ptr + src_stride + 32);
- vp9_prefetch_load(src_ptr + src_stride + 64);
-
- src = src_ptr;
- dst = dst_ptr;
-
- odd_dst = (dst + dst_stride);
-
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__ (
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload1], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p5], %[qload1] \n\t"
- "ulw %[qload2], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
- "ulw %[qload2], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload2] \n\t"
- "preceu.ph.qbl %[p5], %[qload2] \n\t"
- "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "ulw %[qload1], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload1] \n\t"
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
- [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
- [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
- [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
- : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
- [cm] "r" (cm),
- [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
- );
-
- src += 16;
- dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
- odd_dst = (dst + dst_stride);
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += 1;
- }
-}
 
 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
diff --git a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
index 8eb105c0c..bf01f1154 100644
--- a/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve2_vert_dspr2.c
@@ -20,103 +20,103 @@
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
-static void convolve_bi_vert_4_dspr2(const uint8_t *src,
- int32_t src_stride,
- uint8_t *dst,
- int32_t dst_stride,
- const int16_t *filter_y,
- int32_t w,
- int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = vp9_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
- const int16_t *filter = &filter_y[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- vp9_prefetch_store(dst + dst_stride);
-
- for (x = 0; x < w; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__ (
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
-
- "extp %[Temp1], $ac0, 31 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [scratch1] "=&r" (scratch1),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
- [src_stride] "r" (src_stride),
- [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
- );
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
 
 static void convolve_bi_vert_64_dspr2(const uint8_t *src,
                                        int32_t src_stride,
@@ -124,95 +124,95 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src,
                                        int32_t dst_stride,
                                        const int16_t *filter_y,
                                        int32_t h) {
-  int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = vp9_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
- const int16_t *filter = &filter_y[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- vp9_prefetch_store(dst + dst_stride);
-
- for (x = 0; x < 64; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__ (
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
-
- "extp %[Temp1], $ac0, 31 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r" (load1), [load2] "=&r" (load2),
- [p1] "=&r" (p1), [p2] "=&r" (p2),
- [scratch1] "=&r" (scratch1),
- [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
- [store1] "=&r" (store1), [store2] "=&r" (store2),
- [src_ptr] "+r" (src_ptr)
- : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
- [src_stride] "r" (src_stride),
- [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
- );
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
 }
 
 void vp9_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
index da7f0fdbb..ab18490dc 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_avg_dspr2.c
@@ -435,7 +435,7 @@ void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                     filter_y, y_step_q4,
                     w, intermediate_height);
 
-  vp9_convolve8_avg_vert(temp + (64*3), 64,
+  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
                          dst, dst_stride,
                          filter_x, x_step_q4,
                          filter_y, y_step_q4,
diff --git a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
index 126e05a67..0ef9dd508 100644
--- a/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
+++ b/vp9/common/mips/dspr2/vp9_convolve8_dspr2.c
@@ -16,7 +16,7 @@
 #include "vp9/common/vp9_common.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
-#include "vp9/common/vp9_convolve.h"
+#include "vp9/common/vp9_filter.h"
 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
 
 #if HAVE_DSPR2
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 179690652..3ac192b4a 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -40,8 +40,8 @@
     vpx_memcpy(dest, src, n * sizeof(*src)); \
   }
 
-#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest));
-#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest));
+#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest))
+#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest))
 
 static INLINE uint8_t clip_pixel(int val) {
   return (val > 255) ? 255u : (val < 0) ? 0u : val;
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
index 9a5caa662..29d499063 100644
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@@ -13,8 +13,6 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-#define FILTER_BITS 7
-
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 8ebe0e5fb..2640ac72b 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -56,7 +56,7 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
 
 /* Array indices are identical to previously-existing CONTEXT_NODE indices */
 
-const vp9_tree_index vp9_coef_tree[ 22] = {
+const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
   -DCT_EOB_TOKEN, 2,                       /* 0 = EOB   */
   -ZERO_TOKEN, 4,                          /* 1 = ZERO  */
   -ONE_TOKEN, 6,                           /* 2 = ONE   */
@@ -274,7 +274,7 @@ static void init_bit_trees() {
   init_bit_tree(cat6, 14);
 }
 
-const vp9_extra_bit vp9_extra_bits[12] = {
+const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
   { 0, 0, 0, 0},
   { 0, 0, 0, 1},
   { 0, 0, 0, 2},
@@ -348,7 +348,7 @@ void vp9_adapt_coef_probs(VP9_COMMON *cm) {
   TX_SIZE t;
   unsigned int count_sat, update_factor;
 
-  if (cm->frame_type == KEY_FRAME || cm->intra_only) {
+  if (frame_is_intra_only(cm)) {
     update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
     count_sat = COEF_COUNT_SAT_KEY;
   } else if (cm->last_frame_type == KEY_FRAME) {
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index d3b5bba7c..45c4f6c01 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -43,7 +43,7 @@ extern DECLARE_ALIGNED(16, const uint8_t,
                        vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
-extern const vp9_tree_index vp9_coef_tree[];
+extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)];
 
 #define DCT_EOB_MODEL_TOKEN 3  /* EOB Extra Bits 0+0 */
 extern const vp9_tree_index vp9_coefmodel_tree[];
@@ -57,7 +57,8 @@ typedef struct {
   int base_val;
 } vp9_extra_bit;
 
-extern const vp9_extra_bit vp9_extra_bits[12];  /* indexed by token value */
+// indexed by token value
+extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS];
 
 #define MAX_PROB 255
 #define DCT_MAX_VALUE 16384
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 36d19a76d..302945374 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -14,6 +14,8 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
+#define FILTER_BITS 7
+
 #define SUBPEL_BITS 4
 #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
 #define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 953764c85..5c8c03e51 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -302,4 +302,9 @@ static void set_prev_mi(VP9_COMMON *cm) {
   cm->prev_mi = use_prev_in_find_mv_refs ?
                     cm->prev_mip + cm->mode_info_stride + 1 : NULL;
 }
+
+static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
+  return cm->frame_type == KEY_FRAME || cm->intra_only;
+}
+
 #endif  // VP9_COMMON_VP9_ONYXC_INT_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 21513d414..526be87df 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -701,9 +701,6 @@ specialize vp9_short_fdct8x8 sse2
 prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct4x4 sse2
 
-prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_fdct8x4 sse2
-
 prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_fdct32x32 sse2
 
@@ -716,9 +713,6 @@ specialize vp9_short_fdct16x16 sse2
 prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch"
 specialize vp9_short_walsh4x4
 
-prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch"
-specialize vp9_short_walsh8x4
-
 #
 # Motion search
 #
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index ba9ceb26a..106e6d426 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -217,7 +217,7 @@ void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
 
   assert(w <= 64);
   assert(h <= 64);
@@ -238,7 +238,7 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
 
   assert(w <= 64);
   assert(h <= 64);
@@ -428,7 +428,7 @@ void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
 
   assert(w <= 64);
   assert(h <= 64);
@@ -449,7 +449,7 @@ void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const
int16_t *filter_y, int y_step_q4, int w, int h) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71); + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); assert(w <= 64); assert(h <= 64); diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index d89d6b803..cc9984ab3 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -504,7 +504,11 @@ static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { mbmi->mode = ZEROMV; - assert(bsize >= BLOCK_8X8); + if (bsize < BLOCK_8X8) { + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Invalid usage of segement feature on small blocks"); + return; + } } else { if (bsize >= BLOCK_8X8) mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx); @@ -627,7 +631,7 @@ void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r) { for (k = 0; k < MBSKIP_CONTEXTS; ++k) vp9_diff_update_prob(r, &cm->fc.mbskip_probs[k]); - if (cm->frame_type != KEY_FRAME && !cm->intra_only) { + if (!frame_is_intra_only(cm)) { nmv_context *const nmvc = &pbi->common.fc.nmvc; MACROBLOCKD *const xd = &pbi->mb; int i, j; @@ -665,7 +669,7 @@ void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) { const int x_mis = MIN(bw, cm->mi_cols - mi_col); int x, y, z; - if (cm->frame_type == KEY_FRAME || cm->intra_only) + if (frame_is_intra_only(cm)) read_intra_frame_mode_info(pbi, mi, mi_row, mi_col, r); else read_inter_frame_mode_info(pbi, mi, mi_row, mi_col, r); diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index cc3422f97..bc5543826 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -894,9 +894,11 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->frame_parallel_decoding_mode = 1; } + // This flag will be overridden by the call to vp9_setup_past_independence + // below, forcing the use of context 0 for those frame types. 
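The vp9_decodemv.c hunk above replaces an assert() with vpx_internal_error(), so a bitstream that applies the SEG_LVL_SKIP feature to a sub-8x8 block is reported as an unsupported bitstream instead of aborting the decoder. A minimal, self-contained sketch of the setjmp/longjmp unwinding pattern this kind of error reporting relies on follows; the type and function names (decode_error_info, decode_fail, decode_frame_guarded) are illustrative stand-ins, not the libvpx API.

#include <setjmp.h>
#include <stdio.h>

typedef struct {
  jmp_buf jmp;
  int code;
  char detail[80];
} decode_error_info;

void decode_fail(decode_error_info *err, int code, const char *msg) {
  err->code = code;
  snprintf(err->detail, sizeof(err->detail), "%s", msg);
  longjmp(err->jmp, 1);               /* unwind to the setjmp() guard below */
}

int decode_frame_guarded(decode_error_info *err, int bsize) {
  if (setjmp(err->jmp))
    return err->code;                 /* reached only via decode_fail() */
  if (bsize < 3)                      /* 3 stands in for BLOCK_8X8 */
    decode_fail(err, -1, "segment feature not allowed on sub-8x8 blocks");
  /* ...normal mode-info parsing would continue here... */
  return 0;
}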
cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LOG2); - if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode || cm->intra_only) + if (frame_is_intra_only(cm) || cm->error_resilient_mode) vp9_setup_past_independence(cm); setup_loopfilter(&cm->lf, rb); @@ -955,9 +957,15 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { YV12_BUFFER_CONFIG *new_fb = &cm->yv12_fb[cm->new_fb_idx]; if (!first_partition_size) { - // showing a frame directly - *p_data_end = data + 1; - return 0; + if (!keyframe) { + // showing a frame directly + *p_data_end = data + 1; + return 0; + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Invalid key frame"); + return -1; + } } data += vp9_rb_bytes_read(&rb); xd->corrupted = 0; @@ -1010,7 +1018,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { vp9_adapt_coef_probs(cm); - if (!keyframe && !cm->intra_only) { + if (!frame_is_intra_only(cm)) { vp9_adapt_mode_probs(cm); vp9_adapt_mv_probs(cm, xd->allow_high_precision_mv); } diff --git a/vp9/decoder/vp9_dsubexp.c b/vp9/decoder/vp9_dsubexp.c index df044c411..fcca01729 100644 --- a/vp9/decoder/vp9_dsubexp.c +++ b/vp9/decoder/vp9_dsubexp.c @@ -48,8 +48,6 @@ static int merge_index(int v, int n, int modulus) { static int inv_remap_prob(int v, int m) { static int inv_map_table[MAX_PROB - 1] = { - // generated by: - // inv_map_table[j] = merge_index(j, MAX_PROB - 1, MODULUS_PARAM); 6, 19, 32, 45, 58, 71, 84, 97, 110, 123, 136, 149, 162, 175, 188, 201, 214, 227, 240, 253, 0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25, 26, @@ -66,9 +64,11 @@ static int inv_remap_prob(int v, int m) { 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, - 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, + 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252 }; - // v = merge_index(v, MAX_PROBS - 1, MODULUS_PARAM); + // The clamp is not necessary for conforming VP9 stream, it is added to + // prevent out of bound access for bad input data + v = clamp(v, 0, 253); v = inv_map_table[v]; m--; if ((m << 1) <= MAX_PROB) { diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index ab11c740e..c73fb6529 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -580,7 +580,7 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO **mi_8x8, vp9_writer *bc, set_mi_row_col(&cpi->common, xd, mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type], mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type]); - if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { + if (frame_is_intra_only(cm)) { write_mb_modes_kf(cpi, mi_8x8, bc); #ifdef ENTROPY_STATS active_section = 8; @@ -1420,7 +1420,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { vp9_update_skip_probs(cpi, &header_bc); - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { int i; #ifdef ENTROPY_STATS active_section = 1; diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 9b57bc364..1b5d09810 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -26,7 +26,7 @@ typedef struct { // Structure to hold snapshot of coding context during the mode picking process typedef struct { MODE_INFO mic; - unsigned char 
zcoeff_blk[256]; + uint8_t zcoeff_blk[256]; int skip; int_mv best_ref_mv; int_mv second_best_ref_mv; @@ -55,8 +55,8 @@ typedef struct { } PICK_MODE_CONTEXT; struct macroblock_plane { - DECLARE_ALIGNED(16, int16_t, src_diff[64*64]); - DECLARE_ALIGNED(16, int16_t, coeff[64*64]); + DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); + DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]); struct buf_2d src; // Quantizer setings @@ -90,6 +90,7 @@ struct macroblock { int sadperbit4; int rddiv; int rdmult; + unsigned int mb_energy; unsigned int *mb_activity_ptr; int *mb_norm_activity_ptr; signed int act_zbin_adj; @@ -126,7 +127,7 @@ struct macroblock { int mv_row_min; int mv_row_max; - unsigned char zcoeff_blk[TX_SIZES][256]; + uint8_t zcoeff_blk[TX_SIZES][256]; int skip; int encode_breakout; @@ -172,7 +173,6 @@ struct macroblock { BLOCK_SIZE sb64_partitioning; void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch); - void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch); void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch); void (*fwd_txm16x16)(int16_t *input, int16_t *output, int pitch); void (*quantize_b_4x4)(MACROBLOCK *x, int b_idx, TX_TYPE tx_type, diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index a232a8674..b6555bc05 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -178,11 +178,6 @@ void vp9_short_fht4x4_c(int16_t *input, int16_t *output, } } -void vp9_short_fdct8x4_c(int16_t *input, int16_t *output, int pitch) { - vp9_short_fdct4x4_c(input, output, pitch); - vp9_short_fdct4x4_c(input + 4, output + 16, pitch); -} - static void fdct8(const int16_t *input, int16_t *output) { /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; /*needs32*/ int t0, t1, t2, t3; @@ -647,12 +642,6 @@ void vp9_short_walsh4x4_c(int16_t *input, int16_t *output, int pitch) { } } -void vp9_short_walsh8x4_c(int16_t *input, int16_t *output, int pitch) { - vp9_short_walsh4x4_c(input, output, pitch); - vp9_short_walsh4x4_c(input + 4, output + 16, pitch); -} - - // Rewrote to use same algorithm as others. static void fdct16(const int16_t in[16], int16_t out[16]) { /*canbe16*/ int step1[8]; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 631a27688..15a3a70e3 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -38,7 +38,10 @@ #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" +#include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_tokenize.h" +#include "vp9/encoder/vp9_vaq.h" + #define DBG_PRNT_SEGMAP 0 @@ -123,7 +126,6 @@ static unsigned int tt_activity_measure(MACROBLOCK *x) { static unsigned int alt_activity_measure(MACROBLOCK *x, int use_dc_pred) { return vp9_encode_intra(x, use_dc_pred); } -DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0}; // Measure the activity of the current macroblock // What we measure here is TBD so abstracted to this function @@ -373,6 +375,10 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) xd->mi_8x8[x_idx + y * mis] = mi_addr; + if (cpi->sf.variance_adaptive_quantization) { + vp9_mb_init_quantizer(cpi, x); + } + // FIXME(rbultje) I'm pretty sure this should go to the end of this block // (i.e. 
after the output_enabled) if (bsize < BLOCK_32X32) { @@ -398,15 +404,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i]; } - if (cm->frame_type == KEY_FRAME) { - // Restore the coding modes to that held in the coding context - // if (mb_mode == I4X4_PRED) - // for (i = 0; i < 16; i++) - // { - // xd->block[i].bmi.as_mode = - // xd->mode_info_context->bmi[i].as_mode; - // assert(xd->mode_info_context->bmi[i].as_mode < MB_MODE_COUNT); - // } + if (frame_is_intra_only(cm)) { #if CONFIG_INTERNAL_STATS static const int kf_mode_index[] = { THR_DC /*DC_PRED*/, @@ -526,10 +524,11 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, /* segment ID */ if (seg->enabled) { - uint8_t *map = seg->update_map ? cpi->segmentation_map - : cm->last_frame_seg_map; - mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); - + if (!cpi->sf.variance_adaptive_quantization) { + uint8_t *map = seg->update_map ? cpi->segmentation_map + : cm->last_frame_seg_map; + mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); + } vp9_mb_init_quantizer(cpi, x); if (seg->enabled && cpi->seg0_cnt > 0 @@ -563,6 +562,8 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; + int orig_rdmult = x->rdmult; + double rdmult_ratio = 1.0; // Use the lower precision, but faster, 32x32 fdct for mode selection. x->use_lp32x32fdct = 1; @@ -585,12 +586,27 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, x->source_variance = get_sby_perpixel_variance(cpi, x, bsize); + if (cpi->sf.variance_adaptive_quantization) { + int energy; + if (bsize <= BLOCK_16X16) { + energy = x->mb_energy; + } else { + energy = vp9_block_energy(cpi, x, bsize); + } + + xd->this_mi->mbmi.segment_id = vp9_vaq_segment_id(energy); + rdmult_ratio = vp9_vaq_rdmult_ratio(energy); + vp9_mb_init_quantizer(cpi, x); + } + if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); + x->rdmult = round(x->rdmult * rdmult_ratio); + // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB - if (cm->frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { vp9_rd_pick_intra_mode_sb(cpi, x, totalrate, totaldist, bsize, ctx, best_rd); } else { @@ -601,6 +617,10 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, vp9_rd_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, totalrate, totaldist, bsize, ctx, best_rd); } + + x->rdmult = orig_rdmult; + if (*totalrate != INT_MAX) + *totalrate = round(*totalrate * rdmult_ratio); } static void update_stats(VP9_COMP *cpi) { @@ -610,7 +630,7 @@ static void update_stats(VP9_COMP *cpi) { MODE_INFO *mi = xd->this_mi; MB_MODE_INFO *const mbmi = &mi->mbmi; - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); @@ -1018,6 +1038,11 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO **mi_8x8, } save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + if (bsize == BLOCK_16X16) { + set_offsets(cpi, mi_row, mi_col, bsize); + x->mb_energy = vp9_block_energy(cpi, x, bsize); + } + x->fast_ms = 0; x->subblock_ref = 0; @@ -1478,6 +1503,11 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } assert(mi_height_log2(bsize) == mi_width_log2(bsize)); + if (bsize == BLOCK_16X16) { + set_offsets(cpi, mi_row, 
mi_col, bsize); + x->mb_energy = vp9_block_energy(cpi, x, bsize); + } + // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. if (cpi->sf.auto_min_max_partition_size) { @@ -1823,7 +1853,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { xd->mode_info_stride = cm->mode_info_stride; // reset intra mode contexts - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) vp9_init_mbmode_probs(cm); // Copy data over into macro block data structures. @@ -1839,9 +1869,9 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { xd->this_mi->mbmi.mode = DC_PRED; xd->this_mi->mbmi.uv_mode = DC_PRED; - vp9_zero(cpi->y_mode_count) - vp9_zero(cpi->y_uv_mode_count) - vp9_zero(cm->counts.inter_mode) + vp9_zero(cpi->y_mode_count); + vp9_zero(cpi->y_uv_mode_count); + vp9_zero(cm->counts.inter_mode); vp9_zero(cpi->partition_count); vp9_zero(cpi->intra_inter_count); vp9_zero(cpi->comp_inter_count); @@ -1861,7 +1891,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { // printf("Switching to lossless\n"); - cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; cpi->mb.e_mbd.itxm_add = vp9_iwht4x4_add; cpi->mb.optimize = 0; @@ -1870,7 +1899,6 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { cpi->common.tx_mode = ONLY_4X4; } else { // printf("Not lossless\n"); - cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; cpi->mb.e_mbd.itxm_add = vp9_idct4x4_add; } @@ -1928,7 +1956,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_frame_init_quantizer(cpi); - vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q); + vp9_initialize_rd_consts(cpi); vp9_initialize_me_consts(cpi, cm->base_qindex); switch_tx_mode(cpi); @@ -2123,7 +2151,7 @@ static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) { static int get_frame_type(VP9_COMP *cpi) { int frame_type; - if (cpi->common.frame_type == KEY_FRAME) + if (frame_is_intra_only(&cpi->common)) frame_type = 0; else if (cpi->is_src_frame_alt_ref && cpi->refresh_golden_frame) frame_type = 3; @@ -2171,16 +2199,18 @@ void vp9_encode_frame(VP9_COMP *cpi) { // requires further work in the rd loop. For now the only supported encoder // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. 
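The vp9_zero() call sites in init_encode_frame_mb_context() above now end with explicit semicolons because the macro definition in vp9_common.h, changed earlier in this diff, no longer supplies one. A short, self-contained illustration of why a trailing semicolon inside a statement-like macro is a problem; the type and function names below are made up for the example.

#include <string.h>

#define zero_bad(dest)  memset(&(dest), 0, sizeof(dest));  /* old: ';' baked in */
#define zero_good(dest) memset(&(dest), 0, sizeof(dest))   /* new: no ';' */

typedef struct { int y_mode[10]; int uv_mode[10]; } mode_counts;

void reset_counts(int enabled, mode_counts *c) {
  /* With zero_bad the body would expand to
   *   if (enabled) memset(...); ; else c->y_mode[0] = 1;
   * and the stray empty statement leaves the else with no matching if. */
  if (enabled)
    zero_good(*c);                    /* the call site now owns the ';' */
  else
    c->y_mode[0] = 1;
}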
- if ((cm->ref_frame_sign_bias[ALTREF_FRAME] - == cm->ref_frame_sign_bias[GOLDEN_FRAME]) - || (cm->ref_frame_sign_bias[ALTREF_FRAME] - == cm->ref_frame_sign_bias[LAST_FRAME])) { - cm->allow_comp_inter_inter = 0; - } else { - cm->allow_comp_inter_inter = 1; - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; + if (!frame_is_intra_only(cm)) { + if ((cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[GOLDEN_FRAME]) + || (cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[LAST_FRAME])) { + cm->allow_comp_inter_inter = 0; + } else { + cm->allow_comp_inter_inter = 1; + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } } if (cpi->sf.RD) { diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index b2becbb44..aed7a95a5 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -30,6 +30,7 @@ #include "vp9/common/vp9_quant_common.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_vaq.h" #include "./vpx_scale_rtcd.h" // TODO(jkoleszar): for setup_dst_planes #include "vp9/common/vp9_reconinter.h" @@ -530,7 +531,7 @@ void vp9_first_pass(VP9_COMP *cpi) { // if ( 0 ) { vp9_init_mv_probs(cm); - vp9_initialize_rd_consts(cpi, cm->base_qindex + cm->y_dc_delta_q); + vp9_initialize_rd_consts(cpi); } // for each macroblock row in image @@ -555,6 +556,7 @@ void vp9_first_pass(VP9_COMP *cpi) { int this_error; int gf_motion_error = INT_MAX; int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + double error_weight = 1.0; xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; @@ -581,8 +583,13 @@ void vp9_first_pass(VP9_COMP *cpi) { mb_col << 1, 1 << mi_width_log2(xd->this_mi->mbmi.sb_type)); + if (cpi->sf.variance_adaptive_quantization) { + int energy = vp9_block_energy(cpi, x, xd->this_mi->mbmi.sb_type); + error_weight = vp9_vaq_inv_q_ratio(energy); + } + // do intra 16x16 prediction - this_error = vp9_encode_intra(x, use_dc_pred); + this_error = error_weight * vp9_encode_intra(x, use_dc_pred); // intrapenalty below deals with situations where the intra and inter // error scores are very low (eg a plain black frame). @@ -617,6 +624,7 @@ void vp9_first_pass(VP9_COMP *cpi) { first_pass_motion_search(cpi, x, &best_ref_mv, &mv.as_mv, lst_yv12, &motion_error, recon_yoffset); + motion_error *= error_weight; // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search as well. 
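In the vp9_firstpass.c hunks above, the intra and motion errors are scaled by an error_weight taken from vp9_vaq_inv_q_ratio(). That function reads the table vp9_vaq_init() builds later in this change with pow(base_ratio, energy / 3.0) and base_ratio = 1.8, so the weight is 1.8 raised to -energy/3: flat blocks (negative energy) have their first-pass errors inflated, high-variance blocks have them damped. A small stand-alone program that prints the seven weights, assuming that table:

#include <math.h>
#include <stdio.h>

int main(void) {
  int energy;
  for (energy = -3; energy <= 3; ++energy)
    printf("energy %+d -> error_weight %.3f\n",
           energy, pow(1.8, -energy / 3.0));
  return 0;
}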
@@ -624,6 +632,7 @@ void vp9_first_pass(VP9_COMP *cpi) { tmp_err = INT_MAX; first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv, lst_yv12, &tmp_err, recon_yoffset); + tmp_err *= error_weight; if (tmp_err < motion_error) { motion_error = tmp_err; @@ -640,6 +649,7 @@ void vp9_first_pass(VP9_COMP *cpi) { first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv, gld_yv12, &gf_motion_error, recon_yoffset); + gf_motion_error *= error_weight; if ((gf_motion_error < motion_error) && (gf_motion_error < this_error)) { diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 0833b4ac8..ba09622d7 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -33,6 +33,7 @@ #include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_vaq.h" #include "vpx_ports/vpx_timer.h" @@ -315,7 +316,7 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { // Computes a q delta (in "q index" terms) to get from a starting q value // to a target value // target q value -static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { +int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { int i; int start_index = cpi->worst_quality; int target_index = cpi->worst_quality; @@ -379,7 +380,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) { seg->update_map = 1; seg->update_data = 1; - qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875)); + qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875)); vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2)); vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); @@ -400,8 +401,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) { seg->update_data = 1; seg->abs_delta = SEGMENT_DELTADATA; - qi_delta = compute_qdelta(cpi, cpi->avg_q, - (cpi->avg_q * 1.125)); + qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, + (cpi->avg_q * 1.125)); vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2)); vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); @@ -756,6 +757,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->static_segmentation = 0; #endif + sf->variance_adaptive_quantization = 0; + switch (mode) { case 0: // This is the best quality mode. break; @@ -772,11 +775,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->recode_loop = (speed < 1); if (speed == 1) { - sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only); + sf->use_square_partition_only = !frame_is_intra_only(&cpi->common); sf->less_rectangular_check = 1; - sf->tx_size_search_method = (cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only) + sf->tx_size_search_method = frame_is_intra_only(&cpi->common) ? USE_FULL_RD : USE_LARGESTALL; if (MIN(cpi->common.width, cpi->common.height) >= 720) @@ -795,12 +796,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; } if (speed == 2) { - sf->use_square_partition_only = !(cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only); + sf->use_square_partition_only = !frame_is_intra_only(&cpi->common); sf->less_rectangular_check = 1; - sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only) - ? USE_FULL_RD : USE_LARGESTALL); + sf->tx_size_search_method = frame_is_intra_only(&cpi->common) + ? USE_FULL_RD : USE_LARGESTALL; if (MIN(cpi->common.width, cpi->common.height) >= 720) sf->disable_split_mask = cpi->common.show_frame ? 
@@ -911,8 +910,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->use_one_partition_size_always = 1; sf->always_this_block_size = BLOCK_16X16; - sf->tx_size_search_method = (cpi->common.frame_type == KEY_FRAME || - cpi->common.intra_only) ? + sf->tx_size_search_method = frame_is_intra_only(&cpi->common) ? USE_FULL_RD : USE_LARGESTALL; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | @@ -953,12 +951,16 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->optimize_coefficients = 0; } + // No recode for 1 pass. + if (cpi->pass == 0) { + sf->recode_loop = 0; + sf->optimize_coefficients = 0; + } + cpi->mb.fwd_txm16x16 = vp9_short_fdct16x16; cpi->mb.fwd_txm8x8 = vp9_short_fdct8x8; - cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { - cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; } @@ -1219,6 +1221,12 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { switch (cpi->oxcf.Mode) { // Real time and one pass deprecated in test code base + case MODE_GOODQUALITY: + cpi->pass = 0; + cpi->compressor_speed = 2; + cpi->oxcf.cpu_used = clamp(cpi->oxcf.cpu_used, -5, 5); + break; + case MODE_FIRSTPASS: cpi->pass = 1; cpi->compressor_speed = 1; @@ -1733,7 +1741,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->common.error.setjmp = 0; - vp9_zero(cpi->y_uv_mode_count) + vp9_zero(cpi->y_uv_mode_count); #ifdef MODE_TEST_HIT_STATS vp9_zero(cpi->mode_test_hits) @@ -2628,7 +2636,7 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, int q = cpi->active_worst_quality; VP9_COMMON *const cm = &cpi->common; - if (cm->frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { #if !CONFIG_MULTIPLE_ARF // Handle the special case for key frames forced when we have75 reached // the maximum key frame interval. Here force the Q to a range @@ -2638,8 +2646,8 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, int qindex = cpi->last_boosted_qindex; double last_boosted_q = vp9_convert_qindex_to_q(qindex); - delta_qindex = compute_qdelta(cpi, last_boosted_q, - (last_boosted_q * 0.75)); + delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q, + (last_boosted_q * 0.75)); cpi->active_best_quality = MAX(qindex + delta_qindex, cpi->best_quality); @@ -2667,14 +2675,14 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, // on active_best_quality. q_val = vp9_convert_qindex_to_q(cpi->active_best_quality); cpi->active_best_quality += - compute_qdelta(cpi, q_val, (q_val * q_adj_factor)); + vp9_compute_qdelta(cpi, q_val, (q_val * q_adj_factor)); } #else double current_q; // Force the KF quantizer to be 30% of the active_worst_quality. current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality); cpi->active_best_quality = cpi->active_worst_quality - + compute_qdelta(cpi, current_q, current_q * 0.3); + + vp9_compute_qdelta(cpi, current_q, current_q * 0.3); #endif } else if (!cpi->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { @@ -2740,6 +2748,10 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, #endif #else cpi->active_best_quality = inter_minq[q]; + // 1-pass: for now, use the average Q for the active_best, if its lower + // than active_worst. 
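Several hunks above switch from the file-local compute_qdelta() to the newly exported vp9_compute_qdelta(), which converts a starting Q and a target Q (for example last_boosted_q * 0.75) back into a signed qindex delta. A simplified sketch of that search, using a linear stand-in for the real qindex-to-Q table and hypothetical parameter names:

/* Stand-in for vp9_convert_qindex_to_q(); the real mapping is not linear. */
static double q_of_index(int qindex) {
  return 4.0 + 0.25 * qindex;
}

int compute_qdelta_sketch(int best_qindex, int worst_qindex,
                          double qstart, double qtarget) {
  int i, start_index = worst_qindex, target_index = worst_qindex;
  for (i = best_qindex; i < worst_qindex; ++i) {
    start_index = i;
    if (q_of_index(i) >= qstart) break;
  }
  for (i = best_qindex; i < worst_qindex; ++i) {
    target_index = i;
    if (q_of_index(i) >= qtarget) break;
  }
  return target_index - start_index;  /* negative delta lowers the quantizer */
}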
+ if (cpi->pass == 0 && (cpi->avg_frame_qindex < cpi->active_worst_quality)) + cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex]; #endif // For the constrained quality mode we don't want @@ -2788,8 +2800,15 @@ static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi, } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) { q = cpi->last_boosted_qindex; } else { - // Determine initial Q to try - q = vp9_regulate_q(cpi, cpi->this_frame_target); + // Determine initial Q to try. + if (cpi->pass == 0) { + // 1-pass: for now, use per-frame-bw for target size of frame, scaled + // by |x| for key frame. + int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1; + q = vp9_regulate_q(cpi, scale * cpi->av_per_frame_bandwidth); + } else { + q = vp9_regulate_q(cpi, cpi->this_frame_target); + } if (q > *top_index) q = *top_index; } @@ -2878,7 +2897,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cpi->mv_step_param = vp9_init_search_range(cpi, max_mv_def); // Initialize cpi->max_mv_magnitude and cpi->mv_step_param if appropriate. if (sf->auto_mv_step_size) { - if ((cpi->common.frame_type == KEY_FRAME) || cpi->common.intra_only) { + if (frame_is_intra_only(&cpi->common)) { // Initialize max_mv_magnitude for use in the first INTER frame // after a key/intra-only frame. cpi->max_mv_magnitude = max_mv_def; @@ -2894,8 +2913,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } // Set various flags etc to special state if it is a key frame. - if (cm->frame_type == KEY_FRAME) { - // Reset the loop filter deltas and segmentation map. + if (frame_is_intra_only(cm)) { + // Reset the loop filter deltas and segmentation map setup_features(cm); // If segmentation is enabled force a map update for key frames. @@ -2914,6 +2933,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, cm->frame_parallel_decoding_mode = 1; cm->reset_frame_context = 0; cm->refresh_frame_context = 0; + } else if (cm->intra_only) { + // Only reset the current context. + cm->reset_frame_context = 2; } } @@ -2949,7 +2971,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, // Set quantizer steps at 10% increments. new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level))); - q = cpi->active_worst_quality + compute_qdelta(cpi, current_q, new_q); + q = cpi->active_worst_quality + vp9_compute_qdelta(cpi, current_q, new_q); bottom_index = q; top_index = q; @@ -2963,7 +2985,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, loop_count = 0; vp9_zero(cpi->rd_tx_select_threshes); - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { cm->mcomp_filter_type = DEFAULT_INTERP_FILTER; /* TODO: Decide this more intelligently */ xd->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH; @@ -3009,24 +3031,25 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_set_quantizer(cpi, q); if (loop_count == 0) { - // Set up entropy depending on frame type. + // Set up entropy context depending on frame type. The decoder mandates + // the use of the default context, index 0, for keyframes and inter + // frames where the error_resilient_mode or intra_only flag is set. For + // other inter-frames the encoder currently uses only two contexts; + // context 1 for ALTREF frames and context 0 for the others. if (cm->frame_type == KEY_FRAME) { - /* Choose which entropy context to use. When using a forward reference - * frame, it immediately follows the keyframe, and thus benefits from - * using the same entropy context established by the keyframe. - * Otherwise, use the default context 0. 
- */ - cm->frame_context_idx = cpi->oxcf.play_alternate; vp9_setup_key_frame(cpi); } else { - /* Choose which entropy context to use. Currently there are only two - * contexts used, one for normal frames and one for alt ref frames. - */ - cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; + if (!cm->intra_only && !cm->error_resilient_mode) { + cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame; + } vp9_setup_inter_frame(cpi); } } + if (cpi->sf.variance_adaptive_quantization) { + vp9_vaq_frame_setup(cpi); + } + // transform / motion compensation build reconstruction frame vp9_encode_frame(cpi); @@ -3241,7 +3264,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, vp9_adapt_coef_probs(&cpi->common); } - if (cpi->common.frame_type != KEY_FRAME) { + if (!frame_is_intra_only(&cpi->common)) { FRAME_COUNTS *counts = &cpi->common.counts; vp9_copy(counts->y_mode, cpi->y_mode_count); @@ -3620,7 +3643,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, } cm->show_frame = 0; - cm->intra_only = 0; cpi->refresh_alt_ref_frame = 1; cpi->refresh_golden_frame = 0; cpi->refresh_last_frame = 0; @@ -3642,6 +3664,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, #endif if ((cpi->source = vp9_lookahead_pop(cpi->lookahead, flush))) { cm->show_frame = 1; + cm->intra_only = 0; #if CONFIG_MULTIPLE_ARF // Is this frame the ARF overlay. @@ -3809,6 +3832,10 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm); + if (cpi->sf.variance_adaptive_quantization) { + vp9_vaq_init(); + } + if (cpi->pass == 1) { Pass1Encode(cpi, size, dest, frame_flags); } else if (cpi->pass == 2) { diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index f88ae8ad0..2e5c7bc7e 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -253,6 +253,7 @@ typedef struct { int auto_mv_step_size; int optimize_coefficients; int static_segmentation; + int variance_adaptive_quantization; int comp_inter_joint_search_thresh; int adaptive_rd_thresh; int skip_encode_sb; @@ -379,9 +380,9 @@ typedef struct VP9_COMP { int ref_frame_mask; int set_ref_frame_mask; - int rd_threshes[BLOCK_SIZES][MAX_MODES]; + int rd_threshes[MAX_SEGMENTS][BLOCK_SIZES][MAX_MODES]; int rd_thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; - int rd_thresh_sub8x8[BLOCK_SIZES][MAX_REFS]; + int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS]; int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS]; int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES]; @@ -710,6 +711,8 @@ int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); void vp9_alloc_compressor_data(VP9_COMP *cpi); +int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget); + static int get_token_alloc(int mb_rows, int mb_cols) { return mb_rows * mb_cols * (48 * 16 + 4); } diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 05e893ee9..048a6e1eb 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -12,6 +12,7 @@ #include "vpx_mem/vpx_mem.h" #include "vp9/encoder/vp9_onyx_int.h" +#include "vp9/encoder/vp9_rdopt.h" #include "vp9/encoder/vp9_quantize.h" #include "vp9/common/vp9_quant_common.h" @@ -271,12 +272,15 @@ void vp9_init_quantizer(VP9_COMP *cpi) { void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { int i; + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; int zbin_extra; int segment_id = xd->this_mi->mbmi.segment_id; const int qindex = 
vp9_get_qindex(&cpi->common.seg, segment_id, cpi->common.base_qindex); + int rdmult = vp9_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q); + // Y zbin_extra = (cpi->common.y_dequant[qindex][1] * (cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7; @@ -315,6 +319,12 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) { /* save this macroblock QIndex for vp9_update_zbin_extra() */ x->e_mbd.q_index = qindex; + + /* R/D setup */ + cpi->mb.errorperbit = rdmult >> 6; + cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); + + vp9_initialize_me_consts(cpi, xd->q_index); } void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index ba521afa0..56a080377 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -161,10 +161,17 @@ void vp9_init_me_luts() { } } -static int compute_rd_mult(int qindex) { +int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) { const int q = vp9_dc_quant(qindex, 0); // TODO(debargha): Adjust the function below - return (88 * q * q / 25); + int rdmult = 88 * q * q / 25; + if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { + if (cpi->twopass.next_iiratio > 31) + rdmult += (rdmult * rd_iifactor[31]) >> 4; + else + rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4; + } + return rdmult; } static int compute_rd_thresh_factor(int qindex) { @@ -181,41 +188,47 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { cpi->mb.sadperbit4 = sad_per_bit4lut[qindex]; } -static void set_block_thresholds(VP9_COMP *cpi, int qindex) { - int q, i, bsize; - q = compute_rd_thresh_factor(qindex); +static void set_block_thresholds(VP9_COMP *cpi) { + int i, bsize, segment_id; + VP9_COMMON *cm = &cpi->common; + + for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { + int q; + int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); + segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ); + q = compute_rd_thresh_factor(segment_qindex); - for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { - for (i = 0; i < MAX_MODES; ++i) { + for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { // Threshold here seem unecessarily harsh but fine given actual // range of values used for cpi->sf.thresh_mult[] int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - if (cpi->sf.thresh_mult[i] < thresh_max) { - cpi->rd_threshes[bsize][i] = - cpi->sf.thresh_mult[i] * q * - rd_thresh_block_size_factor[bsize] / 4; - } else { - cpi->rd_threshes[bsize][i] = INT_MAX; + for (i = 0; i < MAX_MODES; ++i) { + if (cpi->sf.thresh_mult[i] < thresh_max) { + cpi->rd_threshes[segment_id][bsize][i] = + cpi->sf.thresh_mult[i] * q * + rd_thresh_block_size_factor[bsize] / 4; + } else { + cpi->rd_threshes[segment_id][bsize][i] = INT_MAX; + } } - } - for (i = 0; i < MAX_REFS; ++i) { - int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]); - - if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) { - cpi->rd_thresh_sub8x8[bsize][i] = - cpi->sf.thresh_mult_sub8x8[i] * q * - rd_thresh_block_size_factor[bsize] / 4; - } else { - cpi->rd_thresh_sub8x8[bsize][i] = INT_MAX; + for (i = 0; i < MAX_REFS; ++i) { + if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) { + cpi->rd_thresh_sub8x8[segment_id][bsize][i] = + cpi->sf.thresh_mult_sub8x8[i] * q * + rd_thresh_block_size_factor[bsize] / 4; + } else { + cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX; + } } } } } -void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { - int i; +void vp9_initialize_rd_consts(VP9_COMP *cpi) { + 
VP9_COMMON *cm = &cpi->common; + int qindex, i; vp9_clear_system_state(); // __asm emms; @@ -223,23 +236,17 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { // for key frames, golden frames and arf frames. // if (cpi->common.refresh_golden_frame || // cpi->common.refresh_alt_ref_frame) - qindex = clamp(qindex, 0, MAXQ); + qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ); cpi->RDDIV = RDDIV_BITS; // in bits (to multiply D by 128) - cpi->RDMULT = compute_rd_mult(qindex); - if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { - if (cpi->twopass.next_iiratio > 31) - cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4; - else - cpi->RDMULT += - (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4; - } + cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex); + cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO; cpi->mb.errorperbit += (cpi->mb.errorperbit == 0); vp9_set_speed_features(cpi); - set_block_thresholds(cpi, qindex); + set_block_thresholds(cpi); fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs); @@ -251,7 +258,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { /*rough estimate for costing*/ vp9_init_mode_costs(cpi); - if (cpi->common.frame_type != KEY_FRAME) { + if (!frame_is_intra_only(&cpi->common)) { vp9_build_nmv_cost_table( cpi->mb.nmvjointcost, cpi->mb.e_mbd.allow_high_precision_mv ? @@ -2231,9 +2238,6 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY]; ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION]; - vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[xd->this_mi->mbmi.tx_size], - sizeof(ctx->zcoeff_blk)); - vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); vpx_memcpy(ctx->best_filter_diff, best_filter_diff, sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1)); @@ -3149,11 +3153,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, const int bws = num_8x8_blocks_wide_lookup[bsize] / 2; const int bhs = num_8x8_blocks_high_lookup[bsize] / 2; int best_skip2 = 0; - unsigned char best_zcoeff_blk[256] = { 0 }; x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; - vp9_zero(x->zcoeff_blk); - vp9_zero(ctx->zcoeff_blk); // Everywhere the flag is set the error is much higher than its neighbors. ctx->frames_with_high_error = 0; @@ -3270,9 +3271,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; // Test best rd so far against threshold for trying this mode. 
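The refactoring above pulls the RD multiplier into vp9_compute_rd_mult() (88 * q * q / 25 for the DC quantizer step, plus the two-pass iiratio adjustment) and derives errorperbit from it both per macroblock (rdmult >> 6 in vp9_mb_init_quantizer()) and per frame (RDMULT / RD_MULT_EPB_RATIO in vp9_initialize_rd_consts()); the two forms appear intended to agree, assuming RD_MULT_EPB_RATIO is 64. Restated as stand-alone helpers, with the iiratio term omitted:

int rd_mult_sketch(int q_step) {
  /* q_step stands in for vp9_dc_quant(qindex, 0) */
  return 88 * q_step * q_step / 25;
}

int error_per_bit_sketch(int rdmult) {
  int epb = rdmult >> 6;              /* same scaling as rdmult >> 6 above */
  return epb == 0 ? 1 : epb;          /* keep errorperbit non-zero */
}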
- if ((best_rd < ((int64_t)cpi->rd_threshes[bsize][mode_index] * + if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] * cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) || - cpi->rd_threshes[bsize][mode_index] == INT_MAX) + cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX) continue; // Do not allow compound prediction if the segment level reference @@ -3584,8 +3585,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_rd = this_rd; best_mbmode = *mbmi; best_skip2 = this_skip2; - vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], - sizeof(best_zcoeff_blk)); + vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], + sizeof(ctx->zcoeff_blk)); // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history @@ -3751,9 +3752,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *mbmi = best_mbmode; x->skip |= best_skip2; - vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], best_zcoeff_blk, - sizeof(best_zcoeff_blk)); - for (i = 0; i < NB_PREDICTION_TYPES; ++i) { if (best_pred_rd[i] == INT64_MAX) best_pred_diff[i] = INT_MIN; @@ -3847,11 +3845,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; - unsigned char best_zcoeff_blk[256] = { 0 }; x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH; vp9_zero(x->zcoeff_blk); - vp9_zero(ctx->zcoeff_blk); for (i = 0; i < 4; i++) { int j; @@ -3945,9 +3941,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, continue; // Test best rd so far against threshold for trying this mode. - if ((best_rd < ((int64_t)cpi->rd_thresh_sub8x8[bsize][mode_index] * - cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) || - cpi->rd_thresh_sub8x8[bsize][mode_index] == INT_MAX) + if ((best_rd < + ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] * + cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) || + cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX) continue; // Do not allow compound prediction if the segment level reference @@ -4092,10 +4089,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int uv_skippable; this_rd_thresh = (ref_frame == LAST_FRAME) ? - cpi->rd_thresh_sub8x8[bsize][THR_LAST] : - cpi->rd_thresh_sub8x8[bsize][THR_ALTR]; + cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] : + cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR]; this_rd_thresh = (ref_frame == GOLDEN_FRAME) ? 
- cpi->rd_thresh_sub8x8[bsize][THR_GOLD] : this_rd_thresh; + cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh; xd->this_mi->mbmi.tx_size = TX_4X4; cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX; @@ -4328,8 +4325,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv); best_mbmode = *mbmi; best_skip2 = this_skip2; - vpx_memcpy(best_zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], - sizeof(best_zcoeff_blk)); + vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size], + sizeof(ctx->zcoeff_blk)); for (i = 0; i < 4; i++) best_bmodes[i] = xd->this_mi->bmi[i]; @@ -4492,9 +4489,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int; } - vpx_memcpy(x->zcoeff_blk[mbmi->tx_size], best_zcoeff_blk, - sizeof(best_zcoeff_blk)); - for (i = 0; i < NB_PREDICTION_TYPES; ++i) { if (best_pred_rd[i] == INT64_MAX) best_pred_diff[i] = INT_MIN; @@ -4536,4 +4530,3 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } - diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index aa4068d76..0b0bb18d7 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -18,7 +18,9 @@ (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM)) #define QIDX_SKIP_THRESH 115 -void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); +int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex); + +void vp9_initialize_rd_consts(VP9_COMP *cpi); void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); diff --git a/vp9/encoder/vp9_segmentation.c b/vp9/encoder/vp9_segmentation.c index 874b71ab1..5137d367b 100644 --- a/vp9/encoder/vp9_segmentation.c +++ b/vp9/encoder/vp9_segmentation.c @@ -253,13 +253,13 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree); // Key frames cannot use temporal prediction - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { // Work out probability tree for coding those segments not // predicted using the temporal method and the cost. calc_segtree_probs(t_unpred_seg_counts, t_pred_tree); t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree); - // Add in the cost of the signalling for each prediction context + // Add in the cost of the signaling for each prediction context. for (i = 0; i < PREDICTION_PROBS; i++) { const int count0 = temporal_predictor_count[i][0]; const int count1 = temporal_predictor_count[i][1]; diff --git a/vp9/encoder/vp9_vaq.c b/vp9/encoder/vp9_vaq.c new file mode 100644 index 000000000..3d3b4b0f1 --- /dev/null +++ b/vp9/encoder/vp9_vaq.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <math.h> + +#include "vp9/encoder/vp9_vaq.h" + +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_ratectrl.h" +#include "vp9/encoder/vp9_rdopt.h" +#include "vp9/encoder/vp9_segmentation.h" +#include "vp9/common/vp9_systemdependent.h" + +#define ENERGY_MIN (-3) +#define ENERGY_MAX (3) +#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1) +#define ENERGY_IN_BOUNDS(energy)\ + assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX) + +static double q_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static double rdmult_ratio[MAX_SEGMENTS] = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static int segment_id[MAX_SEGMENTS] = { 5, 3, 1, 0, 2, 4, 6, 7 }; + +#define Q_RATIO(i) q_ratio[(i) - ENERGY_MIN] +#define RDMULT_RATIO(i) rdmult_ratio[(i) - ENERGY_MIN] +#define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN] + +DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = {0}; + +unsigned int vp9_vaq_segment_id(int energy) { + ENERGY_IN_BOUNDS(energy); + return SEGMENT_ID(energy); +} + +double vp9_vaq_rdmult_ratio(int energy) { + ENERGY_IN_BOUNDS(energy); + return RDMULT_RATIO(energy); +} + +double vp9_vaq_inv_q_ratio(int energy) { + ENERGY_IN_BOUNDS(energy); + return Q_RATIO(-energy); +} + +void vp9_vaq_init() { + int i; + double base_ratio = 1.8; + + assert(ENERGY_SPAN <= MAX_SEGMENTS); + + for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { + Q_RATIO(i) = pow(base_ratio, i/3.0); + } +} + +void vp9_vaq_frame_setup(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *seg = &cm->seg; + int base_q = vp9_convert_qindex_to_q(cm->base_qindex); + int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + + cm->y_dc_delta_q); + int i; + + vp9_enable_segmentation((VP9_PTR)cpi); + vp9_clearall_segfeatures(seg); + + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = ENERGY_MIN; i <= ENERGY_MAX; i++) { + int qindex_delta, segment_rdmult; + + if (Q_RATIO(i) == 1) { + // No need to enable SEG_LVL_ALT_Q for this segment + RDMULT_RATIO(i) = 1; + continue; + } + + qindex_delta = vp9_compute_qdelta(cpi, base_q, base_q * Q_RATIO(i)); + vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); + vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); + + segment_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + qindex_delta + + cm->y_dc_delta_q); + RDMULT_RATIO(i) = (double) segment_rdmult / base_rdmult; + } +} + + +static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bs) { + MACROBLOCKD *xd = &x->e_mbd; + unsigned int var, sse; + int right_overflow = (xd->mb_to_right_edge < 0) ? + ((-xd->mb_to_right_edge) >> 3) : 0; + int bottom_overflow = (xd->mb_to_bottom_edge < 0) ? 
+ ((-xd->mb_to_bottom_edge) >> 3) : 0; + + if (right_overflow || bottom_overflow) { + int bw = (1 << (mi_width_log2(bs) + 3)) - right_overflow; + int bh = (1 << (mi_height_log2(bs) + 3)) - bottom_overflow; + int avg; + variance(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, bw, bh, &sse, &avg); + var = sse - (((int64_t)avg * avg) / (bw * bh)); + return (256 * var) / (bw * bh); + } else { + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, + x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + return (256 * var) >> num_pels_log2_lookup[bs]; + } +} + +int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { + // if (var <= 1000) + // return 0; + unsigned int var = block_variance(cpi, x, bs); + double energy = 0.9*(logf(var + 1) - 10.0); + return clamp(round(energy), ENERGY_MIN, ENERGY_MAX); +} diff --git a/vp9/encoder/vp9_vaq.h b/vp9/encoder/vp9_vaq.h new file mode 100644 index 000000000..dc18b22f2 --- /dev/null +++ b/vp9/encoder/vp9_vaq.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_ENCODER_VP9_CONFIG_VAQ_H_ +#define VP9_ENCODER_VP9_CONFIG_VAQ_H_ + +#include "vp9/encoder/vp9_onyx_int.h" + +unsigned int vp9_vaq_segment_id(int energy); +double vp9_vaq_rdmult_ratio(int energy); +double vp9_vaq_inv_q_ratio(int energy); + +void vp9_vaq_init(); +void vp9_vaq_frame_setup(VP9_COMP *cpi); + +int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); + +#endif // VP9_ENCODER_VP9_CONFIG_VAQ_H_ diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index 61031e064..2ded97c55 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -14,6 +14,15 @@ #include "vpx/vpx_integer.h" // #include "./vpx_config.h" +void variance(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + int w, + int h, + unsigned int *sse, + int *sum); + typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c index 991ef4d29..8bc385089 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance_c.c @@ -14,18 +14,18 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_convolve.h" #include "vp9/common/vp9_filter.h" + #include "vp9/encoder/vp9_variance.h" -static void variance(const uint8_t *src_ptr, - int source_stride, - const uint8_t *ref_ptr, - int recon_stride, - int w, - int h, - unsigned int *sse, - int *sum) { +void variance(const uint8_t *src_ptr, + int source_stride, + const uint8_t *ref_ptr, + int recon_stride, + int w, + int h, + unsigned int *sse, + int *sum) { int i, j; int diff; diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index ad3d01da9..5e1e5ed4a 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -112,11 +112,6 @@ void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) { } } -void vp9_short_fdct8x4_sse2(int16_t *input, int16_t *output, int pitch) { - vp9_short_fdct4x4_sse2(input, output, pitch); - vp9_short_fdct4x4_sse2(input + 4, output + 16, pitch); -} - static INLINE void 
load_buffer_4x4(int16_t *input, __m128i *in, int stride) { const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 810fdf51f..fb380e1db 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -256,7 +256,7 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, switch (cfg.g_pass) { case VPX_RC_ONE_PASS: - oxcf->Mode = MODE_BESTQUALITY; + oxcf->Mode = MODE_GOODQUALITY; break; case VPX_RC_FIRST_PASS: oxcf->Mode = MODE_FIRSTPASS; @@ -282,6 +282,8 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf, oxcf->end_usage = USAGE_CONSTRAINED_QUALITY; else if (cfg.rc_end_usage == VPX_Q) oxcf->end_usage = USAGE_CONSTANT_QUALITY; + else if (cfg.rc_end_usage == VPX_CBR) + oxcf->end_usage = USAGE_STREAM_FROM_SERVER; oxcf->target_bandwidth = cfg.rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 7a5b78634..6b923162f 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -658,8 +658,10 @@ static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx, if (corrupted) { VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi; - *corrupted = pbi->common.frame_to_show->corrupted; - + if (pbi) + *corrupted = pbi->common.frame_to_show->corrupted; + else + return VPX_CODEC_ERROR; return VPX_CODEC_OK; } else { return VPX_CODEC_INVALID_PARAM; diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 9fbf100f5..b454eee02 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -64,6 +64,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c VP9_CX_SRCS-yes += encoder/vp9_variance_c.c +VP9_CX_SRCS-yes += encoder/vp9_vaq.c +VP9_CX_SRCS-yes += encoder/vp9_vaq.h ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c |
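The variance-adaptive quantization pieces added in this change (vp9_vaq.c/.h, the mb_energy plumbing in the encoder, and the per-segment RD thresholds) all revolve around one mapping: block variance is converted to an energy in [-3, 3] via 0.9 * (log(var + 1) - 10), and each energy is assigned a fixed segment from the {5, 3, 1, 0, 2, 4, 6, 7} table so that segment deltas can carry the Q offsets. A small stand-alone restatement of that mapping, with the constants copied from vp9_vaq.c above:

#include <math.h>
#include <stdio.h>

static int clamp_int(int v, int lo, int hi) {
  return v < lo ? lo : v > hi ? hi : v;
}

int main(void) {
  static const int seg_for_energy[7] = { 5, 3, 1, 0, 2, 4, 6 };  /* -3 .. +3 */
  static const double vars[] = { 10, 250, 1000, 5000, 25000, 100000 };
  size_t i;
  for (i = 0; i < sizeof(vars) / sizeof(vars[0]); ++i) {
    const int energy =
        clamp_int((int)lround(0.9 * (log(vars[i] + 1) - 10.0)), -3, 3);
    printf("variance %8.0f -> energy %+d -> segment %d\n",
           vars[i], energy, seg_for_energy[energy + 3]);
  }
  return 0;
}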