diff options
author | Kaustubh Raste <kaustubh.raste@imgtec.com> | 2017-01-09 16:35:28 +0530 |
---|---|---|
committer | Kaustubh Raste <kaustubh.raste@imgtec.com> | 2017-01-09 16:35:28 +0530 |
commit | c06991fce6377d450d466518aa8d7fce2060df68 (patch) | |
tree | 82f60e114ec1ba1800700f87f3c260fbb4716885 /vpx_dsp/mips | |
parent | c7e2bd62987ba5e4f8c74feed9175e07bb99ff22 (diff) | |
download | libvpx-c06991fce6377d450d466518aa8d7fce2060df68.tar libvpx-c06991fce6377d450d466518aa8d7fce2060df68.tar.gz libvpx-c06991fce6377d450d466518aa8d7fce2060df68.tar.bz2 libvpx-c06991fce6377d450d466518aa8d7fce2060df68.zip |
Fix mips dspr2 idct16x16 functions for large coefficient input
Change-Id: I9be3d3d040837f658c6314606e28db8c31092a1a
Diffstat (limited to 'vpx_dsp/mips')
-rw-r--r-- | vpx_dsp/mips/itrans16_dspr2.c | 41 |
1 file changed, 41 insertions, 0 deletions
```diff
diff --git a/vpx_dsp/mips/itrans16_dspr2.c b/vpx_dsp/mips/itrans16_dspr2.c
index 224181d6e..44ba65c7a 100644
--- a/vpx_dsp/mips/itrans16_dspr2.c
+++ b/vpx_dsp/mips/itrans16_dspr2.c
@@ -983,6 +983,47 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
             [dest] "+&r"(dest)
           : [stride] "r"(stride), [vector_a1] "r"(vector_a1));
     }
+  } else if (a1 > 255) {
+    int32_t a11, a12, vector_a11, vector_a12;
+
+    /* use quad-byte
+     * input and output memory are four byte aligned */
+    a11 = a1 >> 1;
+    a12 = a1 - a11;
+    __asm__ __volatile__(
+        "replv.qb %[vector_a11], %[a11] \n\t"
+        "replv.qb %[vector_a12], %[a12] \n\t"
+
+        : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
+        : [a11] "r"(a11), [a12] "r"(a12));
+
+    for (r = 16; r--;) {
+      __asm__ __volatile__(
+          "lw %[t1], 0(%[dest]) \n\t"
+          "lw %[t2], 4(%[dest]) \n\t"
+          "lw %[t3], 8(%[dest]) \n\t"
+          "lw %[t4], 12(%[dest]) \n\t"
+          "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t"
+          "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t"
+          "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t"
+          "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t"
+          "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t"
+          "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t"
+          "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t"
+          "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t"
+          "sw %[vector_1], 0(%[dest]) \n\t"
+          "sw %[vector_2], 4(%[dest]) \n\t"
+          "sw %[vector_3], 8(%[dest]) \n\t"
+          "sw %[vector_4], 12(%[dest]) \n\t"
+          "add %[dest], %[dest], %[stride] \n\t"
+
+          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
+            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
+            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
+            [dest] "+&r"(dest)
+          : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
+            [vector_a12] "r"(vector_a12));
+    }
   } else {
     /* use quad-byte
      * input and output memory are four byte aligned */
```