diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_16_neon.c | 45 | ||||
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_4_neon.c | 274 | ||||
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm | 277 | ||||
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_8_neon.c | 453 | ||||
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm (renamed from vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm) | 257 | ||||
-rw-r--r-- | vp9/common/arm/neon/vp9_loopfilter_neon.c | 732 | ||||
-rw-r--r-- | vp9/common/vp9_idct.c | 344 | ||||
-rw-r--r-- | vp9/common/vp9_idct.h | 36 | ||||
-rw-r--r-- | vp9/decoder/vp9_decoder.c | 4 | ||||
-rw-r--r-- | vp9/decoder/vp9_detokenize.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_bitstream.c | 4 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.c | 12 | ||||
-rw-r--r-- | vp9/encoder/vp9_denoiser.h | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 30 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodemb.c | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.c | 32 | ||||
-rw-r--r-- | vp9/encoder/vp9_tokenize.h | 2 | ||||
-rw-r--r-- | vp9/vp9_common.mk | 10 | ||||
-rw-r--r-- | vp9/vp9_dx_iface.c | 6 |
19 files changed, 1319 insertions, 1214 deletions
diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c index 09f470e97..c69ee1009 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c @@ -124,7 +124,6 @@ static INLINE void vp9_loop_filter_neon_16( return; } -#if !HAVE_NEON_ASM void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, const uint8_t *blimit0, const uint8_t *limit0, @@ -178,47 +177,3 @@ void vp9_lpf_horizontal_4_dual_neon(uint8_t *s, int p /* pitch */, vst1q_u8(s, q8u8); return; } -#endif // !HAVE_NEON_ASM - -void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1); - vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1); -} - -void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1); - vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); -} - -void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p, - const uint8_t *blimit0, - const uint8_t *limit0, - const uint8_t *thresh0, - const uint8_t *blimit1, - const uint8_t *limit1, - const uint8_t *thresh1) { - vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1); - vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); -} - -#if HAVE_NEON_ASM -void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p, - const uint8_t *blimit, - const uint8_t *limit, - const uint8_t *thresh) { - vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh); - vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); -} -#endif // HAVE_NEON_ASM diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon.c b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c new file mode 100644 index 000000000..fd9db6187 --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" + +static INLINE void vp9_loop_filter_neon( + uint8x8_t dblimit, // flimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p3 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d4ru8, // p1 + uint8x8_t *d5ru8, // p0 + uint8x8_t *d6ru8, // q0 + uint8x8_t *d7ru8) { // q1 + uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; + int16x8_t q12s16; + int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d3u8 = vabd_u8(d17u8, d16u8); + d4u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + d3u8 = vmax_u8(d3u8, d4u8); + d23u8 = vmax_u8(d19u8, d20u8); + + d17u8 = vabd_u8(d6u8, d7u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + d22u8 = vcgt_u8(d22u8, dthresh); + d23u8 = vmax_u8(d23u8, d3u8); + + d28u8 = vabd_u8(d5u8, d16u8); + d17u8 = vqadd_u8(d17u8, d17u8); + + d23u8 = vcge_u8(dlimit, d23u8); + + d18u8 = vdup_n_u8(0x80); + d5u8 = veor_u8(d5u8, d18u8); + d6u8 = veor_u8(d6u8, d18u8); + d7u8 = veor_u8(d7u8, d18u8); + d16u8 = veor_u8(d16u8, d18u8); + + d28u8 = vshr_n_u8(d28u8, 1); + d17u8 = vqadd_u8(d17u8, d28u8); + + d19u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), + vreinterpret_s8_u8(d6u8)); + + d17u8 = vcge_u8(dblimit, d17u8); + + d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), + vreinterpret_s8_u8(d16u8)); + + d22u8 = vorr_u8(d21u8, d22u8); + + q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); + d23u8 = vand_u8(d23u8, d17u8); + + q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); + + d17u8 = vdup_n_u8(4); + + d27s8 = vqmovn_s16(q12s16); + d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); + d27s8 = vreinterpret_s8_u8(d27u8); + + d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); + d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); + d28s8 = vshr_n_s8(d28s8, 3); + d27s8 = vshr_n_s8(d27s8, 3); + + d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); + + d27s8 = vrshr_n_s8(d27s8, 1); + d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); + + d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); + d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); + + *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); + *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); + *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); + *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); + return; +} + +void vp9_lpf_horizontal_4_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + + if (count == 0) // end_vp9_lf_h_edge + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < count; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + vp9_loop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d4u8, &d5u8, &d6u8, &d7u8); + + s -= (pitch * 5); + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + s += pitch; + vst1_u8(s, d6u8); + s += pitch; + vst1_u8(s, d7u8); + } + return; +} + +void vp9_lpf_vertical_4_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i, pitch8; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + + if (count == 0) // end_vp9_lf_h_edge + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + pitch8 = pitch * 8; + for (i = 0; i < count; i++, src += pitch8) { + s = src - (i + 1) * 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), + vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), + vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), + vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), + vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + vp9_loop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d4u8, &d5u8, &d6u8, &d7u8); + + d4Result.val[0] = d4u8; + d4Result.val[1] = d5u8; + d4Result.val[2] = d6u8; + d4Result.val[3] = d7u8; + + src -= 2; + vst4_lane_u8(src, d4Result, 0); + src += pitch; + vst4_lane_u8(src, d4Result, 1); + src += pitch; + vst4_lane_u8(src, d4Result, 2); + src += pitch; + vst4_lane_u8(src, d4Result, 3); + src += pitch; + vst4_lane_u8(src, d4Result, 4); + src += pitch; + vst4_lane_u8(src, d4Result, 5); + src += pitch; + vst4_lane_u8(src, d4Result, 6); + src += pitch; + vst4_lane_u8(src, d4Result, 7); + } + return; +} diff --git a/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm new file mode 100644 index 000000000..7738e0d3a --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfilter_4_neon_asm.asm @@ -0,0 +1,277 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_lpf_horizontal_4_neon| + EXPORT |vp9_lpf_vertical_4_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; TODO(fgalligan): See about removing the count code as this function is only +; called with a count of 1. +; +; void vp9_lpf_horizontal_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_lpf_horizontal_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + ldr r2, [sp, #4] ; load thresh + add r1, r1, r1 ; double pitch + + cmp r12, #0 + beq end_vp9_lf_h_edge + + vld1.8 {d1[]}, [r3] ; duplicate *limit + vld1.8 {d2[]}, [r2] ; duplicate *thresh + +count_lf_h_loop + sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines + add r3, r2, r1, lsr #1 ; set to 3 lines down + + vld1.u8 {d3}, [r2@64], r1 ; p3 + vld1.u8 {d4}, [r3@64], r1 ; p2 + vld1.u8 {d5}, [r2@64], r1 ; p1 + vld1.u8 {d6}, [r3@64], r1 ; p0 + vld1.u8 {d7}, [r2@64], r1 ; q0 + vld1.u8 {d16}, [r3@64], r1 ; q1 + vld1.u8 {d17}, [r2@64] ; q2 + vld1.u8 {d18}, [r3@64] ; q3 + + sub r2, r2, r1, lsl #1 + sub r3, r3, r1, lsl #1 + + bl vp9_loop_filter_neon + + vst1.u8 {d4}, [r2@64], r1 ; store op1 + vst1.u8 {d5}, [r3@64], r1 ; store op0 + vst1.u8 {d6}, [r2@64], r1 ; store oq0 + vst1.u8 {d7}, [r3@64], r1 ; store oq1 + + add r0, r0, #8 + subs r12, r12, #1 + bne count_lf_h_loop + +end_vp9_lf_h_edge + pop {pc} + ENDP ; |vp9_lpf_horizontal_4_neon| + +; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter +; works on 16 iterations at a time. +; TODO(fgalligan): See about removing the count code as this function is only +; called with a count of 1. +; +; void vp9_lpf_vertical_4_neon(uint8_t *s, +; int p /* pitch */, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh, +; int count) +; +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +; sp+4 int count +|vp9_lpf_vertical_4_neon| PROC + push {lr} + + vld1.8 {d0[]}, [r2] ; duplicate *blimit + ldr r12, [sp, #8] ; load count + vld1.8 {d1[]}, [r3] ; duplicate *limit + + ldr r3, [sp, #4] ; load thresh + sub r2, r0, #4 ; move s pointer down by 4 columns + cmp r12, #0 + beq end_vp9_lf_v_edge + + vld1.8 {d2[]}, [r3] ; duplicate *thresh + +count_lf_v_loop + vld1.u8 {d3}, [r2], r1 ; load s data + vld1.u8 {d4}, [r2], r1 + vld1.u8 {d5}, [r2], r1 + vld1.u8 {d6}, [r2], r1 + vld1.u8 {d7}, [r2], r1 + vld1.u8 {d16}, [r2], r1 + vld1.u8 {d17}, [r2], r1 + vld1.u8 {d18}, [r2] + + ;transpose to 8x16 matrix + vtrn.32 d3, d7 + vtrn.32 d4, d16 + vtrn.32 d5, d17 + vtrn.32 d6, d18 + + vtrn.16 d3, d5 + vtrn.16 d4, d6 + vtrn.16 d7, d17 + vtrn.16 d16, d18 + + vtrn.8 d3, d4 + vtrn.8 d5, d6 + vtrn.8 d7, d16 + vtrn.8 d17, d18 + + bl vp9_loop_filter_neon + + sub r0, r0, #2 + + ;store op1, op0, oq0, oq1 + vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 + vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 + vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 + vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 + vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 + vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 + vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 + vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] + + add r0, r0, r1, lsl #3 ; s += pitch * 8 + subs r12, r12, #1 + subne r2, r0, #4 ; move s pointer down by 4 columns + bne count_lf_v_loop + +end_vp9_lf_v_edge + pop {pc} + ENDP ; |vp9_lpf_vertical_4_neon| + +; void vp9_loop_filter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. The function does not use +; registers d8-d15. +; +; Inputs: +; r0-r3, r12 PRESERVE +; d0 blimit +; d1 limit +; d2 thresh +; d3 p3 +; d4 p2 +; d5 p1 +; d6 p0 +; d7 q0 +; d16 q1 +; d17 q2 +; d18 q3 +; +; Outputs: +; d4 op1 +; d5 op0 +; d6 oq0 +; d7 oq1 +|vp9_loop_filter_neon| PROC + ; filter_mask + vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) + vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) + vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) + vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) + vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) + vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) + vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) + + vabd.u8 d17, d6, d7 ; abs(p0 - q0) + + vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) + + vmov.u8 d18, #0x80 + + vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) + + ; hevmask + vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 + vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) + + vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) + vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 + + veor d7, d7, d18 ; qs0 + + vcge.u8 d23, d1, d23 ; abs(m1) > limit + + ; filter() function + ; convert to signed + + vshr.u8 d28, d28, #1 ; a = a / 2 + veor d6, d6, d18 ; ps0 + + veor d5, d5, d18 ; ps1 + vqadd.u8 d17, d17, d28 ; a = b + a + + veor d16, d16, d18 ; qs1 + + vmov.u8 d19, #3 + + vsub.s8 d28, d7, d6 ; ( qs0 - ps0) + + vcge.u8 d17, d0, d17 ; a > blimit + + vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) + vorr d22, d21, d22 ; hevmask + + vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) + + vand d27, d27, d22 ; filter &= hev + vand d23, d23, d17 ; filter_mask + + vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d17, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d27, q12 + + vand d27, d27, d23 ; filter &= mask + + vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) + vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) + vshr.s8 d28, d28, #3 ; filter2 >>= 3 + vshr.s8 d27, d27, #3 ; filter1 >>= 3 + + vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) + vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) + + ; outer tap adjustments + vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 + + veor d6, d26, d18 ; *oq0 = u^0x80 + + vbic d27, d27, d22 ; filter &= ~hev + + vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) + vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) + + veor d5, d19, d18 ; *op0 = u^0x80 + veor d4, d21, d18 ; *op1 = u^0x80 + veor d7, d20, d18 ; *oq1 = u^0x80 + + bx lr + ENDP ; |vp9_loop_filter_neon| + + END diff --git a/vp9/common/arm/neon/vp9_loopfilter_8_neon.c b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c new file mode 100644 index 000000000..33068a8a2 --- /dev/null +++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon.c @@ -0,0 +1,453 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" + +static INLINE void vp9_mbloop_filter_neon( + uint8x8_t dblimit, // mblimit + uint8x8_t dlimit, // limit + uint8x8_t dthresh, // thresh + uint8x8_t d3u8, // p2 + uint8x8_t d4u8, // p2 + uint8x8_t d5u8, // p1 + uint8x8_t d6u8, // p0 + uint8x8_t d7u8, // q0 + uint8x8_t d16u8, // q1 + uint8x8_t d17u8, // q2 + uint8x8_t d18u8, // q3 + uint8x8_t *d0ru8, // p1 + uint8x8_t *d1ru8, // p1 + uint8x8_t *d2ru8, // p0 + uint8x8_t *d3ru8, // q0 + uint8x8_t *d4ru8, // q1 + uint8x8_t *d5ru8) { // q1 + uint32_t flat; + uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; + uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; + int16x8_t q15s16; + uint16x8_t q10u16, q14u16; + int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; + + d19u8 = vabd_u8(d3u8, d4u8); + d20u8 = vabd_u8(d4u8, d5u8); + d21u8 = vabd_u8(d5u8, d6u8); + d22u8 = vabd_u8(d16u8, d7u8); + d23u8 = vabd_u8(d17u8, d16u8); + d24u8 = vabd_u8(d18u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + d20u8 = vmax_u8(d21u8, d22u8); + + d25u8 = vabd_u8(d6u8, d4u8); + + d23u8 = vmax_u8(d23u8, d24u8); + + d26u8 = vabd_u8(d7u8, d17u8); + + d19u8 = vmax_u8(d19u8, d20u8); + + d24u8 = vabd_u8(d6u8, d7u8); + d27u8 = vabd_u8(d3u8, d6u8); + d28u8 = vabd_u8(d18u8, d7u8); + + d19u8 = vmax_u8(d19u8, d23u8); + + d23u8 = vabd_u8(d5u8, d16u8); + d24u8 = vqadd_u8(d24u8, d24u8); + + + d19u8 = vcge_u8(dlimit, d19u8); + + + d25u8 = vmax_u8(d25u8, d26u8); + d26u8 = vmax_u8(d27u8, d28u8); + + d23u8 = vshr_n_u8(d23u8, 1); + + d25u8 = vmax_u8(d25u8, d26u8); + + d24u8 = vqadd_u8(d24u8, d23u8); + + d20u8 = vmax_u8(d20u8, d25u8); + + d23u8 = vdup_n_u8(1); + d24u8 = vcge_u8(dblimit, d24u8); + + d21u8 = vcgt_u8(d21u8, dthresh); + + d20u8 = vcge_u8(d23u8, d20u8); + + d19u8 = vand_u8(d19u8, d24u8); + + d23u8 = vcgt_u8(d22u8, dthresh); + + d20u8 = vand_u8(d20u8, d19u8); + + d22u8 = vdup_n_u8(0x80); + + d23u8 = vorr_u8(d21u8, d23u8); + + q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), + vreinterpret_u16_u8(d21u8)); + + d30u8 = vshrn_n_u16(q10u16, 4); + flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); + + if (flat == 0xffffffff) { // Check for all 1's, power_branch_only + d27u8 = vdup_n_u8(3); + d21u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d21u8); + q14u16 = vaddw_u8(q14u16, d5u8); + *d0ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + *d1ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + *d2ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d3ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d4ru8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + *d5ru8 = vqrshrn_n_u16(q14u16, 3); + } else { + d21u8 = veor_u8(d7u8, d22u8); + d24u8 = veor_u8(d6u8, d22u8); + d25u8 = veor_u8(d5u8, d22u8); + d26u8 = veor_u8(d16u8, d22u8); + + d27u8 = vdup_n_u8(3); + + d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); + d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); + + q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); + + d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + q15s16 = vaddw_s8(q15s16, d29s8); + + d29u8 = vdup_n_u8(4); + + d28s8 = vqmovn_s16(q15s16); + + d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); + + d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); + d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); + d30s8 = vshr_n_s8(d30s8, 3); + d29s8 = vshr_n_s8(d29s8, 3); + + d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); + d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); + + d29s8 = vrshr_n_s8(d29s8, 1); + d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); + + d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); + d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); + + if (flat == 0) { // filter_branch_only + *d0ru8 = d4u8; + *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + *d5ru8 = d17u8; + return; + } + + d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); + d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); + d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); + d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); + + d23u8 = vdup_n_u8(2); + q14u16 = vaddl_u8(d6u8, d7u8); + q14u16 = vmlal_u8(q14u16, d3u8, d27u8); + q14u16 = vmlal_u8(q14u16, d4u8, d23u8); + + d0u8 = vbsl_u8(d20u8, dblimit, d4u8); + + q14u16 = vaddw_u8(q14u16, d5u8); + + d1u8 = vbsl_u8(d20u8, dlimit, d25u8); + + d30u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vaddw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d2u8 = vbsl_u8(d20u8, dthresh, d24u8); + + d31u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vaddw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d17u8); + + *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); + + d23u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d3u8); + q14u16 = vsubw_u8(q14u16, d6u8); + q14u16 = vaddw_u8(q14u16, d7u8); + + *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); + + d22u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d4u8); + q14u16 = vsubw_u8(q14u16, d7u8); + q14u16 = vaddw_u8(q14u16, d16u8); + + d3u8 = vbsl_u8(d20u8, d3u8, d21u8); + + q14u16 = vaddw_u8(q14u16, d18u8); + + d4u8 = vbsl_u8(d20u8, d4u8, d26u8); + + d6u8 = vqrshrn_n_u16(q14u16, 3); + + q14u16 = vsubw_u8(q14u16, d5u8); + q14u16 = vsubw_u8(q14u16, d16u8); + q14u16 = vaddw_u8(q14u16, d17u8); + q14u16 = vaddw_u8(q14u16, d18u8); + + d5u8 = vbsl_u8(d20u8, d5u8, d17u8); + + d7u8 = vqrshrn_n_u16(q14u16, 3); + + *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); + *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); + *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); + } + return; +} + +void vp9_lpf_horizontal_8_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i; + uint8_t *s, *psrc; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + + if (count == 0) // end_vp9_mblf_h_edge + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + psrc = src - (pitch << 2); + for (i = 0; i < count; i++) { + s = psrc + i * 8; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); + + s -= (pitch * 6); + vst1_u8(s, d0u8); + s += pitch; + vst1_u8(s, d1u8); + s += pitch; + vst1_u8(s, d2u8); + s += pitch; + vst1_u8(s, d3u8); + s += pitch; + vst1_u8(s, d4u8); + s += pitch; + vst1_u8(s, d5u8); + } + return; +} + +void vp9_lpf_vertical_8_neon( + unsigned char *src, + int pitch, + unsigned char *blimit, + unsigned char *limit, + unsigned char *thresh, + int count) { + int i; + uint8_t *s; + uint8x8_t dblimit, dlimit, dthresh; + uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; + uint8x8_t d16u8, d17u8, d18u8; + uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; + uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; + uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; + uint8x8x4_t d4Result; + uint8x8x2_t d2Result; + + if (count == 0) + return; + + dblimit = vld1_u8(blimit); + dlimit = vld1_u8(limit); + dthresh = vld1_u8(thresh); + + for (i = 0; i < count; i++) { + s = src + (i * (pitch << 3)) - 4; + + d3u8 = vld1_u8(s); + s += pitch; + d4u8 = vld1_u8(s); + s += pitch; + d5u8 = vld1_u8(s); + s += pitch; + d6u8 = vld1_u8(s); + s += pitch; + d7u8 = vld1_u8(s); + s += pitch; + d16u8 = vld1_u8(s); + s += pitch; + d17u8 = vld1_u8(s); + s += pitch; + d18u8 = vld1_u8(s); + + d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), + vreinterpret_u32_u8(d7u8)); + d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), + vreinterpret_u32_u8(d16u8)); + d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), + vreinterpret_u32_u8(d17u8)); + d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), + vreinterpret_u32_u8(d18u8)); + + d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), + vreinterpret_u16_u32(d2tmp2.val[0])); + d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), + vreinterpret_u16_u32(d2tmp3.val[0])); + d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), + vreinterpret_u16_u32(d2tmp2.val[1])); + d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), + vreinterpret_u16_u32(d2tmp3.val[1])); + + d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), + vreinterpret_u8_u16(d2tmp5.val[0])); + d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), + vreinterpret_u8_u16(d2tmp5.val[1])); + d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), + vreinterpret_u8_u16(d2tmp7.val[0])); + d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), + vreinterpret_u8_u16(d2tmp7.val[1])); + + d3u8 = d2tmp8.val[0]; + d4u8 = d2tmp8.val[1]; + d5u8 = d2tmp9.val[0]; + d6u8 = d2tmp9.val[1]; + d7u8 = d2tmp10.val[0]; + d16u8 = d2tmp10.val[1]; + d17u8 = d2tmp11.val[0]; + d18u8 = d2tmp11.val[1]; + + vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, + d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, + &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); + + d4Result.val[0] = d0u8; + d4Result.val[1] = d1u8; + d4Result.val[2] = d2u8; + d4Result.val[3] = d3u8; + + d2Result.val[0] = d4u8; + d2Result.val[1] = d5u8; + + s = src - 3; + vst4_lane_u8(s, d4Result, 0); + s += pitch; + vst4_lane_u8(s, d4Result, 1); + s += pitch; + vst4_lane_u8(s, d4Result, 2); + s += pitch; + vst4_lane_u8(s, d4Result, 3); + s += pitch; + vst4_lane_u8(s, d4Result, 4); + s += pitch; + vst4_lane_u8(s, d4Result, 5); + s += pitch; + vst4_lane_u8(s, d4Result, 6); + s += pitch; + vst4_lane_u8(s, d4Result, 7); + + s = src + 1; + vst2_lane_u8(s, d2Result, 0); + s += pitch; + vst2_lane_u8(s, d2Result, 1); + s += pitch; + vst2_lane_u8(s, d2Result, 2); + s += pitch; + vst2_lane_u8(s, d2Result, 3); + s += pitch; + vst2_lane_u8(s, d2Result, 4); + s += pitch; + vst2_lane_u8(s, d2Result, 5); + s += pitch; + vst2_lane_u8(s, d2Result, 6); + s += pitch; + vst2_lane_u8(s, d2Result, 7); + } + return; +} diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm index 443032217..91aaec04e 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_neon_asm.asm +++ b/vp9/common/arm/neon/vp9_loopfilter_8_neon_asm.asm @@ -8,8 +8,6 @@ ; be found in the AUTHORS file in the root of the source tree. ; - EXPORT |vp9_lpf_horizontal_4_neon| - EXPORT |vp9_lpf_vertical_4_neon| EXPORT |vp9_lpf_horizontal_8_neon| EXPORT |vp9_lpf_vertical_8_neon| ARM @@ -21,261 +19,6 @@ ; TODO(fgalligan): See about removing the count code as this function is only ; called with a count of 1. ; -; void vp9_lpf_horizontal_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; sp+4 int count -|vp9_lpf_horizontal_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #8] ; load count - ldr r2, [sp, #4] ; load thresh - add r1, r1, r1 ; double pitch - - cmp r12, #0 - beq end_vp9_lf_h_edge - - vld1.8 {d1[]}, [r3] ; duplicate *limit - vld1.8 {d2[]}, [r2] ; duplicate *thresh - -count_lf_h_loop - sub r2, r0, r1, lsl #1 ; move src pointer down by 4 lines - add r3, r2, r1, lsr #1 ; set to 3 lines down - - vld1.u8 {d3}, [r2@64], r1 ; p3 - vld1.u8 {d4}, [r3@64], r1 ; p2 - vld1.u8 {d5}, [r2@64], r1 ; p1 - vld1.u8 {d6}, [r3@64], r1 ; p0 - vld1.u8 {d7}, [r2@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r2@64] ; q2 - vld1.u8 {d18}, [r3@64] ; q3 - - sub r2, r2, r1, lsl #1 - sub r3, r3, r1, lsl #1 - - bl vp9_loop_filter_neon - - vst1.u8 {d4}, [r2@64], r1 ; store op1 - vst1.u8 {d5}, [r3@64], r1 ; store op0 - vst1.u8 {d6}, [r2@64], r1 ; store oq0 - vst1.u8 {d7}, [r3@64], r1 ; store oq1 - - add r0, r0, #8 - subs r12, r12, #1 - bne count_lf_h_loop - -end_vp9_lf_h_edge - pop {pc} - ENDP ; |vp9_lpf_horizontal_4_neon| - -; Currently vp9 only works on iterations 8 at a time. The vp8 loop filter -; works on 16 iterations at a time. -; TODO(fgalligan): See about removing the count code as this function is only -; called with a count of 1. -; -; void vp9_lpf_vertical_4_neon(uint8_t *s, -; int p /* pitch */, -; const uint8_t *blimit, -; const uint8_t *limit, -; const uint8_t *thresh, -; int count) -; -; r0 uint8_t *s, -; r1 int p, /* pitch */ -; r2 const uint8_t *blimit, -; r3 const uint8_t *limit, -; sp const uint8_t *thresh, -; sp+4 int count -|vp9_lpf_vertical_4_neon| PROC - push {lr} - - vld1.8 {d0[]}, [r2] ; duplicate *blimit - ldr r12, [sp, #8] ; load count - vld1.8 {d1[]}, [r3] ; duplicate *limit - - ldr r3, [sp, #4] ; load thresh - sub r2, r0, #4 ; move s pointer down by 4 columns - cmp r12, #0 - beq end_vp9_lf_v_edge - - vld1.8 {d2[]}, [r3] ; duplicate *thresh - -count_lf_v_loop - vld1.u8 {d3}, [r2], r1 ; load s data - vld1.u8 {d4}, [r2], r1 - vld1.u8 {d5}, [r2], r1 - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d7}, [r2], r1 - vld1.u8 {d16}, [r2], r1 - vld1.u8 {d17}, [r2], r1 - vld1.u8 {d18}, [r2] - - ;transpose to 8x16 matrix - vtrn.32 d3, d7 - vtrn.32 d4, d16 - vtrn.32 d5, d17 - vtrn.32 d6, d18 - - vtrn.16 d3, d5 - vtrn.16 d4, d6 - vtrn.16 d7, d17 - vtrn.16 d16, d18 - - vtrn.8 d3, d4 - vtrn.8 d5, d6 - vtrn.8 d7, d16 - vtrn.8 d17, d18 - - bl vp9_loop_filter_neon - - sub r0, r0, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d4[0], d5[0], d6[0], d7[0]}, [r0], r1 - vst4.8 {d4[1], d5[1], d6[1], d7[1]}, [r0], r1 - vst4.8 {d4[2], d5[2], d6[2], d7[2]}, [r0], r1 - vst4.8 {d4[3], d5[3], d6[3], d7[3]}, [r0], r1 - vst4.8 {d4[4], d5[4], d6[4], d7[4]}, [r0], r1 - vst4.8 {d4[5], d5[5], d6[5], d7[5]}, [r0], r1 - vst4.8 {d4[6], d5[6], d6[6], d7[6]}, [r0], r1 - vst4.8 {d4[7], d5[7], d6[7], d7[7]}, [r0] - - add r0, r0, r1, lsl #3 ; s += pitch * 8 - subs r12, r12, #1 - subne r2, r0, #4 ; move s pointer down by 4 columns - bne count_lf_v_loop - -end_vp9_lf_v_edge - pop {pc} - ENDP ; |vp9_lpf_vertical_4_neon| - -; void vp9_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. The function does not use -; registers d8-d15. -; -; Inputs: -; r0-r3, r12 PRESERVE -; d0 blimit -; d1 limit -; d2 thresh -; d3 p3 -; d4 p2 -; d5 p1 -; d6 p0 -; d7 q0 -; d16 q1 -; d17 q2 -; d18 q3 -; -; Outputs: -; d4 op1 -; d5 op0 -; d6 oq0 -; d7 oq1 -|vp9_loop_filter_neon| PROC - ; filter_mask - vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) - vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) - vabd.u8 d21, d5, d6 ; m3 = abs(p1 - p0) - vabd.u8 d22, d16, d7 ; m4 = abs(q1 - q0) - vabd.u8 d3, d17, d16 ; m5 = abs(q2 - q1) - vabd.u8 d4, d18, d17 ; m6 = abs(q3 - q2) - - ; only compare the largest value to limit - vmax.u8 d19, d19, d20 ; m1 = max(m1, m2) - vmax.u8 d20, d21, d22 ; m2 = max(m3, m4) - - vabd.u8 d17, d6, d7 ; abs(p0 - q0) - - vmax.u8 d3, d3, d4 ; m3 = max(m5, m6) - - vmov.u8 d18, #0x80 - - vmax.u8 d23, d19, d20 ; m1 = max(m1, m2) - - ; hevmask - vcgt.u8 d21, d21, d2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 d22, d22, d2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 d23, d23, d3 ; m1 = max(m1, m3) - - vabd.u8 d28, d5, d16 ; a = abs(p1 - q1) - vqadd.u8 d17, d17, d17 ; b = abs(p0 - q0) * 2 - - veor d7, d7, d18 ; qs0 - - vcge.u8 d23, d1, d23 ; abs(m1) > limit - - ; filter() function - ; convert to signed - - vshr.u8 d28, d28, #1 ; a = a / 2 - veor d6, d6, d18 ; ps0 - - veor d5, d5, d18 ; ps1 - vqadd.u8 d17, d17, d28 ; a = b + a - - veor d16, d16, d18 ; qs1 - - vmov.u8 d19, #3 - - vsub.s8 d28, d7, d6 ; ( qs0 - ps0) - - vcge.u8 d17, d0, d17 ; a > blimit - - vqsub.s8 d27, d5, d16 ; filter = clamp(ps1-qs1) - vorr d22, d21, d22 ; hevmask - - vmull.s8 q12, d28, d19 ; 3 * ( qs0 - ps0) - - vand d27, d27, d22 ; filter &= hev - vand d23, d23, d17 ; filter_mask - - vaddw.s8 q12, q12, d27 ; filter + 3 * (qs0 - ps0) - - vmov.u8 d17, #4 - - ; filter = clamp(filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d27, q12 - - vand d27, d27, d23 ; filter &= mask - - vqadd.s8 d28, d27, d19 ; filter2 = clamp(filter+3) - vqadd.s8 d27, d27, d17 ; filter1 = clamp(filter+4) - vshr.s8 d28, d28, #3 ; filter2 >>= 3 - vshr.s8 d27, d27, #3 ; filter1 >>= 3 - - vqadd.s8 d19, d6, d28 ; u = clamp(ps0 + filter2) - vqsub.s8 d26, d7, d27 ; u = clamp(qs0 - filter1) - - ; outer tap adjustments - vrshr.s8 d27, d27, #1 ; filter = ++filter1 >> 1 - - veor d6, d26, d18 ; *oq0 = u^0x80 - - vbic d27, d27, d22 ; filter &= ~hev - - vqadd.s8 d21, d5, d27 ; u = clamp(ps1 + filter) - vqsub.s8 d20, d16, d27 ; u = clamp(qs1 - filter) - - veor d5, d19, d18 ; *op0 = u^0x80 - veor d4, d21, d18 ; *op1 = u^0x80 - veor d7, d20, d18 ; *oq1 = u^0x80 - - bx lr - ENDP ; |vp9_loop_filter_neon| - ; void vp9_lpf_horizontal_8_neon(uint8_t *s, int p, ; const uint8_t *blimit, ; const uint8_t *limit, diff --git a/vp9/common/arm/neon/vp9_loopfilter_neon.c b/vp9/common/arm/neon/vp9_loopfilter_neon.c index 079d26677..6432c6cac 100644 --- a/vp9/common/arm/neon/vp9_loopfilter_neon.c +++ b/vp9/common/arm/neon/vp9_loopfilter_neon.c @@ -10,705 +10,49 @@ #include <arm_neon.h> +#include "./vp9_rtcd.h" #include "./vpx_config.h" - -static INLINE void vp9_loop_filter_neon( - uint8x8_t dblimit, // flimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p3 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d4ru8, // p1 - uint8x8_t *d5ru8, // p0 - uint8x8_t *d6ru8, // q0 - uint8x8_t *d7ru8) { // q1 - uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8; - int16x8_t q12s16; - int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d3u8 = vabd_u8(d17u8, d16u8); - d4u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - d3u8 = vmax_u8(d3u8, d4u8); - d23u8 = vmax_u8(d19u8, d20u8); - - d17u8 = vabd_u8(d6u8, d7u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - d22u8 = vcgt_u8(d22u8, dthresh); - d23u8 = vmax_u8(d23u8, d3u8); - - d28u8 = vabd_u8(d5u8, d16u8); - d17u8 = vqadd_u8(d17u8, d17u8); - - d23u8 = vcge_u8(dlimit, d23u8); - - d18u8 = vdup_n_u8(0x80); - d5u8 = veor_u8(d5u8, d18u8); - d6u8 = veor_u8(d6u8, d18u8); - d7u8 = veor_u8(d7u8, d18u8); - d16u8 = veor_u8(d16u8, d18u8); - - d28u8 = vshr_n_u8(d28u8, 1); - d17u8 = vqadd_u8(d17u8, d28u8); - - d19u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8), - vreinterpret_s8_u8(d6u8)); - - d17u8 = vcge_u8(dblimit, d17u8); - - d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8), - vreinterpret_s8_u8(d16u8)); - - d22u8 = vorr_u8(d21u8, d22u8); - - q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8); - d23u8 = vand_u8(d23u8, d17u8); - - q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8)); - - d17u8 = vdup_n_u8(4); - - d27s8 = vqmovn_s16(q12s16); - d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8); - d27s8 = vreinterpret_s8_u8(d27u8); - - d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8)); - d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8)); - d28s8 = vshr_n_s8(d28s8, 3); - d27s8 = vshr_n_s8(d27s8, 3); - - d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8); - - d27s8 = vrshr_n_s8(d27s8, 1); - d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8)); - - d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8); - d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8); - - *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8); - *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8); - *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8); - *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8); - return; -} - -void vp9_lpf_horizontal_4_neon( - unsigned char *src, - int pitch, - unsigned char *blimit, - unsigned char *limit, - unsigned char *thresh, - int count) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - - if (count == 0) // end_vp9_lf_h_edge - return; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < count; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - vp9_loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - s -= (pitch * 5); - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - s += pitch; - vst1_u8(s, d6u8); - s += pitch; - vst1_u8(s, d7u8); - } - return; -} - -void vp9_lpf_vertical_4_neon( - unsigned char *src, - int pitch, - unsigned char *blimit, - unsigned char *limit, - unsigned char *thresh, - int count) { - int i, pitch8; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - - if (count == 0) // end_vp9_lf_h_edge - return; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - pitch8 = pitch * 8; - for (i = 0; i < count; i++, src += pitch8) { - s = src - (i + 1) * 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - vp9_loop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d4u8, &d5u8, &d6u8, &d7u8); - - d4Result.val[0] = d4u8; - d4Result.val[1] = d5u8; - d4Result.val[2] = d6u8; - d4Result.val[3] = d7u8; - - src -= 2; - vst4_lane_u8(src, d4Result, 0); - src += pitch; - vst4_lane_u8(src, d4Result, 1); - src += pitch; - vst4_lane_u8(src, d4Result, 2); - src += pitch; - vst4_lane_u8(src, d4Result, 3); - src += pitch; - vst4_lane_u8(src, d4Result, 4); - src += pitch; - vst4_lane_u8(src, d4Result, 5); - src += pitch; - vst4_lane_u8(src, d4Result, 6); - src += pitch; - vst4_lane_u8(src, d4Result, 7); - } - return; +#include "vpx/vpx_integer.h" + +void vp9_lpf_horizontal_8_dual_neon(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, 1); } -static INLINE void vp9_mbloop_filter_neon( - uint8x8_t dblimit, // mblimit - uint8x8_t dlimit, // limit - uint8x8_t dthresh, // thresh - uint8x8_t d3u8, // p2 - uint8x8_t d4u8, // p2 - uint8x8_t d5u8, // p1 - uint8x8_t d6u8, // p0 - uint8x8_t d7u8, // q0 - uint8x8_t d16u8, // q1 - uint8x8_t d17u8, // q2 - uint8x8_t d18u8, // q3 - uint8x8_t *d0ru8, // p1 - uint8x8_t *d1ru8, // p1 - uint8x8_t *d2ru8, // p0 - uint8x8_t *d3ru8, // q0 - uint8x8_t *d4ru8, // q1 - uint8x8_t *d5ru8) { // q1 - uint32_t flat; - uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8; - uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8; - int16x8_t q15s16; - uint16x8_t q10u16, q14u16; - int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8; - - d19u8 = vabd_u8(d3u8, d4u8); - d20u8 = vabd_u8(d4u8, d5u8); - d21u8 = vabd_u8(d5u8, d6u8); - d22u8 = vabd_u8(d16u8, d7u8); - d23u8 = vabd_u8(d17u8, d16u8); - d24u8 = vabd_u8(d18u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - d20u8 = vmax_u8(d21u8, d22u8); - - d25u8 = vabd_u8(d6u8, d4u8); - - d23u8 = vmax_u8(d23u8, d24u8); - - d26u8 = vabd_u8(d7u8, d17u8); - - d19u8 = vmax_u8(d19u8, d20u8); - - d24u8 = vabd_u8(d6u8, d7u8); - d27u8 = vabd_u8(d3u8, d6u8); - d28u8 = vabd_u8(d18u8, d7u8); - - d19u8 = vmax_u8(d19u8, d23u8); - - d23u8 = vabd_u8(d5u8, d16u8); - d24u8 = vqadd_u8(d24u8, d24u8); - - - d19u8 = vcge_u8(dlimit, d19u8); - - - d25u8 = vmax_u8(d25u8, d26u8); - d26u8 = vmax_u8(d27u8, d28u8); - - d23u8 = vshr_n_u8(d23u8, 1); - - d25u8 = vmax_u8(d25u8, d26u8); - - d24u8 = vqadd_u8(d24u8, d23u8); - - d20u8 = vmax_u8(d20u8, d25u8); - - d23u8 = vdup_n_u8(1); - d24u8 = vcge_u8(dblimit, d24u8); - - d21u8 = vcgt_u8(d21u8, dthresh); - - d20u8 = vcge_u8(d23u8, d20u8); - - d19u8 = vand_u8(d19u8, d24u8); - - d23u8 = vcgt_u8(d22u8, dthresh); - - d20u8 = vand_u8(d20u8, d19u8); - - d22u8 = vdup_n_u8(0x80); - - d23u8 = vorr_u8(d21u8, d23u8); - - q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), - vreinterpret_u16_u8(d21u8)); - - d30u8 = vshrn_n_u16(q10u16, 4); - flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0); - - if (flat == 0xffffffff) { // Check for all 1's, power_branch_only - d27u8 = vdup_n_u8(3); - d21u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d21u8); - q14u16 = vaddw_u8(q14u16, d5u8); - *d0ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - *d1ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - *d2ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d3ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d4ru8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - *d5ru8 = vqrshrn_n_u16(q14u16, 3); - } else { - d21u8 = veor_u8(d7u8, d22u8); - d24u8 = veor_u8(d6u8, d22u8); - d25u8 = veor_u8(d5u8, d22u8); - d26u8 = veor_u8(d16u8, d22u8); - - d27u8 = vdup_n_u8(3); - - d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8)); - d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8)); - - q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8)); - - d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - q15s16 = vaddw_s8(q15s16, d29s8); - - d29u8 = vdup_n_u8(4); - - d28s8 = vqmovn_s16(q15s16); - - d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8)); - - d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8)); - d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8)); - d30s8 = vshr_n_s8(d30s8, 3); - d29s8 = vshr_n_s8(d29s8, 3); - - d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8); - d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8); - - d29s8 = vrshr_n_s8(d29s8, 1); - d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8)); - - d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8); - d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8); - - if (flat == 0) { // filter_branch_only - *d0ru8 = d4u8; - *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - *d5ru8 = d17u8; - return; - } - - d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8); - d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8); - d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8); - d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8); - - d23u8 = vdup_n_u8(2); - q14u16 = vaddl_u8(d6u8, d7u8); - q14u16 = vmlal_u8(q14u16, d3u8, d27u8); - q14u16 = vmlal_u8(q14u16, d4u8, d23u8); - - d0u8 = vbsl_u8(d20u8, dblimit, d4u8); - - q14u16 = vaddw_u8(q14u16, d5u8); - - d1u8 = vbsl_u8(d20u8, dlimit, d25u8); - - d30u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vaddw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d2u8 = vbsl_u8(d20u8, dthresh, d24u8); - - d31u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vaddw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d17u8); - - *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8); - - d23u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d3u8); - q14u16 = vsubw_u8(q14u16, d6u8); - q14u16 = vaddw_u8(q14u16, d7u8); - - *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8); - - d22u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d4u8); - q14u16 = vsubw_u8(q14u16, d7u8); - q14u16 = vaddw_u8(q14u16, d16u8); - - d3u8 = vbsl_u8(d20u8, d3u8, d21u8); - - q14u16 = vaddw_u8(q14u16, d18u8); - - d4u8 = vbsl_u8(d20u8, d4u8, d26u8); - - d6u8 = vqrshrn_n_u16(q14u16, 3); - - q14u16 = vsubw_u8(q14u16, d5u8); - q14u16 = vsubw_u8(q14u16, d16u8); - q14u16 = vaddw_u8(q14u16, d17u8); - q14u16 = vaddw_u8(q14u16, d18u8); - - d5u8 = vbsl_u8(d20u8, d5u8, d17u8); - - d7u8 = vqrshrn_n_u16(q14u16, 3); - - *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8); - *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8); - *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8); - } - return; +void vp9_lpf_vertical_4_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); } -void vp9_lpf_horizontal_8_neon( - unsigned char *src, - int pitch, - unsigned char *blimit, - unsigned char *limit, - unsigned char *thresh, - int count) { - int i; - uint8_t *s, *psrc; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - - if (count == 0) // end_vp9_mblf_h_edge - return; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - psrc = src - (pitch << 2); - for (i = 0; i < count; i++) { - s = psrc + i * 8; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - s -= (pitch * 6); - vst1_u8(s, d0u8); - s += pitch; - vst1_u8(s, d1u8); - s += pitch; - vst1_u8(s, d2u8); - s += pitch; - vst1_u8(s, d3u8); - s += pitch; - vst1_u8(s, d4u8); - s += pitch; - vst1_u8(s, d5u8); - } - return; +void vp9_lpf_vertical_8_dual_neon(uint8_t *s, int p, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + vp9_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, 1); + vp9_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1); } -void vp9_lpf_vertical_8_neon( - unsigned char *src, - int pitch, - unsigned char *blimit, - unsigned char *limit, - unsigned char *thresh, - int count) { - int i; - uint8_t *s; - uint8x8_t dblimit, dlimit, dthresh; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; - uint8x8_t d16u8, d17u8, d18u8; - uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; - uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; - uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; - uint8x8x4_t d4Result; - uint8x8x2_t d2Result; - - if (count == 0) - return; - - dblimit = vld1_u8(blimit); - dlimit = vld1_u8(limit); - dthresh = vld1_u8(thresh); - - for (i = 0; i < count; i++) { - s = src + (i * (pitch << 3)) - 4; - - d3u8 = vld1_u8(s); - s += pitch; - d4u8 = vld1_u8(s); - s += pitch; - d5u8 = vld1_u8(s); - s += pitch; - d6u8 = vld1_u8(s); - s += pitch; - d7u8 = vld1_u8(s); - s += pitch; - d16u8 = vld1_u8(s); - s += pitch; - d17u8 = vld1_u8(s); - s += pitch; - d18u8 = vld1_u8(s); - - d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), - vreinterpret_u32_u8(d7u8)); - d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), - vreinterpret_u32_u8(d16u8)); - d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), - vreinterpret_u32_u8(d17u8)); - d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), - vreinterpret_u32_u8(d18u8)); - - d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), - vreinterpret_u16_u32(d2tmp2.val[0])); - d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), - vreinterpret_u16_u32(d2tmp3.val[0])); - d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), - vreinterpret_u16_u32(d2tmp2.val[1])); - d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), - vreinterpret_u16_u32(d2tmp3.val[1])); - - d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), - vreinterpret_u8_u16(d2tmp5.val[0])); - d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), - vreinterpret_u8_u16(d2tmp5.val[1])); - d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), - vreinterpret_u8_u16(d2tmp7.val[0])); - d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), - vreinterpret_u8_u16(d2tmp7.val[1])); - - d3u8 = d2tmp8.val[0]; - d4u8 = d2tmp8.val[1]; - d5u8 = d2tmp9.val[0]; - d6u8 = d2tmp9.val[1]; - d7u8 = d2tmp10.val[0]; - d16u8 = d2tmp10.val[1]; - d17u8 = d2tmp11.val[0]; - d18u8 = d2tmp11.val[1]; - - vp9_mbloop_filter_neon(dblimit, dlimit, dthresh, - d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, - &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); - - d4Result.val[0] = d0u8; - d4Result.val[1] = d1u8; - d4Result.val[2] = d2u8; - d4Result.val[3] = d3u8; - - d2Result.val[0] = d4u8; - d2Result.val[1] = d5u8; - - s = src - 3; - vst4_lane_u8(s, d4Result, 0); - s += pitch; - vst4_lane_u8(s, d4Result, 1); - s += pitch; - vst4_lane_u8(s, d4Result, 2); - s += pitch; - vst4_lane_u8(s, d4Result, 3); - s += pitch; - vst4_lane_u8(s, d4Result, 4); - s += pitch; - vst4_lane_u8(s, d4Result, 5); - s += pitch; - vst4_lane_u8(s, d4Result, 6); - s += pitch; - vst4_lane_u8(s, d4Result, 7); - - s = src + 1; - vst2_lane_u8(s, d2Result, 0); - s += pitch; - vst2_lane_u8(s, d2Result, 1); - s += pitch; - vst2_lane_u8(s, d2Result, 2); - s += pitch; - vst2_lane_u8(s, d2Result, 3); - s += pitch; - vst2_lane_u8(s, d2Result, 4); - s += pitch; - vst2_lane_u8(s, d2Result, 5); - s += pitch; - vst2_lane_u8(s, d2Result, 6); - s += pitch; - vst2_lane_u8(s, d2Result, 7); - } - return; +#if HAVE_NEON_ASM +void vp9_lpf_vertical_16_dual_neon(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + vp9_lpf_vertical_16_neon(s, p, blimit, limit, thresh); + vp9_lpf_vertical_16_neon(s + 8 * p, p, blimit, limit, thresh); } +#endif // HAVE_NEON_ASM diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 1a3fefc5f..b48d52230 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -1517,12 +1517,12 @@ void vp9_highbd_idct4(const tran_low_t *input, tran_low_t *output, int bd) { // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = WRAPLOW(dct_const_round_shift(temp1), bd); - step[1] = WRAPLOW(dct_const_round_shift(temp2), bd); + step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = WRAPLOW(dct_const_round_shift(temp1), bd); - step[3] = WRAPLOW(dct_const_round_shift(temp2), bd); + step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); // stage 2 output[0] = WRAPLOW(step[0] + step[3], bd); @@ -1562,10 +1562,11 @@ void vp9_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, int dest_stride, int bd) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1587,12 +1588,12 @@ void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); // stage 2 & stage 3 - even half vp9_highbd_idct4(step1, step1, bd); @@ -1607,8 +1608,8 @@ void vp9_highbd_idct8(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[7] = step2[7]; // stage 4 @@ -1653,9 +1654,10 @@ void vp9_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) @@ -1696,10 +1698,10 @@ static void highbd_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. // Hence the output bit depth is 15b. - output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), bd); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), bd); - output[2] = WRAPLOW(dct_const_round_shift(s2), bd); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); + output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd); + output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd); + output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); + output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd); } void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, @@ -1764,14 +1766,14 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_26_64 * x6 + cospi_6_64 * x7; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - x0 = WRAPLOW(dct_const_round_shift(s0 + s4), bd); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5), bd); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6), bd); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7), bd); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4), bd); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5), bd); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7), bd); + x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd); + x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd); // stage 2 s0 = x0; @@ -1787,10 +1789,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { x1 = WRAPLOW(s1 + s3, bd); x2 = WRAPLOW(s0 - s2, bd); x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); // stage 3 s2 = cospi_16_64 * (x2 + x3); @@ -1798,10 +1800,10 @@ static void highbd_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (x6 - x7); - x2 = WRAPLOW(dct_const_round_shift(s2), bd); - x3 = WRAPLOW(dct_const_round_shift(s3), bd); - x6 = WRAPLOW(dct_const_round_shift(s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s7), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); output[0] = WRAPLOW(x0, bd); output[1] = WRAPLOW(-x4, bd); @@ -1910,23 +1912,23 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); // stage 3 step1[0] = step2[0]; @@ -1936,12 +1938,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[8] = WRAPLOW(step2[8] + step2[9], bd); step1[9] = WRAPLOW(step2[8] - step2[9], bd); @@ -1955,12 +1957,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[4] = WRAPLOW(step1[4] + step1[5], bd); step2[5] = WRAPLOW(step1[4] - step1[5], bd); step2[6] = WRAPLOW(-step1[6] + step1[7], bd); @@ -1970,12 +1972,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -1987,8 +1989,8 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[7] = step2[7]; step1[8] = WRAPLOW(step2[8] + step2[11], bd); @@ -2013,12 +2015,12 @@ void vp9_highbd_idct16(const tran_low_t *input, tran_low_t *output, int bd) { step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -2115,22 +2117,22 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = WRAPLOW(dct_const_round_shift(s0 + s8), bd); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9), bd); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10), bd); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11), bd); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12), bd); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13), bd); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14), bd); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15), bd); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8), bd); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9), bd); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10), bd); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11), bd); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12), bd); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13), bd); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14), bd); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15), bd); + x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd); + x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd); + x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd); + x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd); + x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd); + x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd); + x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd); + x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd); // stage 2 s0 = x0; @@ -2158,14 +2160,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, x5 = WRAPLOW(s1 - s5, bd); x6 = WRAPLOW(s2 - s6, bd); x7 = WRAPLOW(s3 - s7, bd); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12), bd); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13), bd); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14), bd); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15), bd); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12), bd); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13), bd); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14), bd); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15), bd); + x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd); + x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd); + x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd); + x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd); + x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd); + x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd); // stage 3 s0 = x0; @@ -2189,18 +2191,18 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, x1 = WRAPLOW(s1 + s3, bd); x2 = WRAPLOW(s0 - s2, bd); x3 = WRAPLOW(s1 - s3, bd); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6), bd); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7), bd); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd); + x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd); x8 = WRAPLOW(s8 + s10, bd); x9 = WRAPLOW(s9 + s11, bd); x10 = WRAPLOW(s8 - s10, bd); x11 = WRAPLOW(s9 - s11, bd); - x12 = WRAPLOW(dct_const_round_shift(s12 + s14), bd); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15), bd); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14), bd); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15), bd); + x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd); + x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd); // stage 4 s2 = (- cospi_16_64) * (x2 + x3); @@ -2212,14 +2214,14 @@ static void highbd_iadst16(const tran_low_t *input, tran_low_t *output, s14 = (- cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = WRAPLOW(dct_const_round_shift(s2), bd); - x3 = WRAPLOW(dct_const_round_shift(s3), bd); - x6 = WRAPLOW(dct_const_round_shift(s6), bd); - x7 = WRAPLOW(dct_const_round_shift(s7), bd); - x10 = WRAPLOW(dct_const_round_shift(s10), bd); - x11 = WRAPLOW(dct_const_round_shift(s11), bd); - x14 = WRAPLOW(dct_const_round_shift(s14), bd); - x15 = WRAPLOW(dct_const_round_shift(s15), bd); + x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd); + x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd); + x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd); + x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd); + x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd); + x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd); + x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd); + x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd); output[0] = WRAPLOW(x0, bd); output[1] = WRAPLOW(-x8, bd); @@ -2306,10 +2308,11 @@ void vp9_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) @@ -2343,43 +2346,43 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[31] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); // stage 2 step2[0] = step1[0]; @@ -2393,23 +2396,23 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[16] = WRAPLOW(step1[16] + step1[17], bd); step2[17] = WRAPLOW(step1[16] - step1[17], bd); @@ -2436,12 +2439,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[8] = WRAPLOW(step2[8] + step2[9], bd); step1[9] = WRAPLOW(step2[8] - step2[9], bd); @@ -2456,22 +2459,22 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[30] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -2480,12 +2483,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[4] = WRAPLOW(step1[4] + step1[5], bd); step2[5] = WRAPLOW(step1[4] - step1[5], bd); step2[6] = WRAPLOW(-step1[6] + step1[7], bd); @@ -2495,12 +2498,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -2530,8 +2533,8 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[7] = step2[7]; step1[8] = WRAPLOW(step2[8] + step2[11], bd); @@ -2547,20 +2550,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[29] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[28] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -2581,12 +2584,12 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd); - step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd); + step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -2632,20 +2635,20 @@ static void highbd_idct32(const tran_low_t *input, tran_low_t *output, int bd) { step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[27] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[26] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[25] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = WRAPLOW(dct_const_round_shift(temp1), bd); - step1[24] = WRAPLOW(dct_const_round_shift(temp2), bd); + step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd); + step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; @@ -2759,8 +2762,9 @@ void vp9_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, int a1; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + tran_low_t out = WRAPLOW( + highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd); + out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h index 1d8836cf3..6e2551dd4 100644 --- a/vp9/common/vp9_idct.h +++ b/vp9/common/vp9_idct.h @@ -80,13 +80,7 @@ static const tran_high_t sinpi_3_9 = 13377; static const tran_high_t sinpi_4_9 = 15212; static INLINE tran_low_t check_range(tran_high_t input) { -#if CONFIG_VP9_HIGHBITDEPTH - // For valid highbitdepth VP9 streams, intermediate stage coefficients will - // stay within the ranges: - // - 8 bit: signed 16 bit integer - // - 10 bit: signed 18 bit integer - // - 12 bit: signed 20 bit integer -#elif CONFIG_COEFFICIENT_RANGE_CHECKING +#if CONFIG_COEFFICIENT_RANGE_CHECKING // For valid VP9 input streams, intermediate stage coefficients should always // stay within the range of a signed 16 bit integer. Coefficients can go out // of this range for invalid/corrupt VP9 streams. However, strictly checking @@ -95,7 +89,7 @@ static INLINE tran_low_t check_range(tran_high_t input) { // --enable-coefficient-range-checking. assert(INT16_MIN <= input); assert(input <= INT16_MAX); -#endif +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING return (tran_low_t)input; } @@ -104,6 +98,32 @@ static INLINE tran_low_t dct_const_round_shift(tran_high_t input) { return check_range(rv); } +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE tran_low_t highbd_check_range(tran_high_t input, + int bd) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + // For valid highbitdepth VP9 streams, intermediate stage coefficients will + // stay within the ranges: + // - 8 bit: signed 16 bit integer + // - 10 bit: signed 18 bit integer + // - 12 bit: signed 20 bit integer + const int32_t int_max = (1 << (7 + bd)) - 1; + const int32_t int_min = -int_max - 1; + assert(int_min <= input); + assert(input <= int_max); + (void) int_min; +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + (void) bd; + return (tran_low_t)input; +} + +static INLINE tran_low_t highbd_dct_const_round_shift(tran_high_t input, + int bd) { + tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + return highbd_check_range(rv, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + typedef void (*transform_1d)(const tran_low_t*, tran_low_t*); typedef struct { diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 1406b4034..80654b19c 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -238,7 +238,7 @@ static void swap_frame_buffers(VP9Decoder *pbi) { // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < 3; ref_index++) - cm->frame_refs[ref_index].idx = INT_MAX; + cm->frame_refs[ref_index].idx = -1; } int vp9_receive_compressed_data(VP9Decoder *pbi, @@ -258,7 +258,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, // TODO(jkoleszar): Error concealment is undefined and non-normative // at this point, but if it becomes so, [0] may not always be the correct // thing to do here. - if (cm->frame_refs[0].idx != INT_MAX) + if (cm->frame_refs[0].idx > 0) cm->frame_refs[0].buf->corrupted = 1; } diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 8704fddac..23d622d70 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -14,6 +14,9 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_entropy.h" +#if CONFIG_COEFFICIENT_RANGE_CHECKING +#include "vp9/common/vp9_idct.h" +#endif #include "vp9/decoder/vp9_detokenize.h" @@ -32,7 +35,7 @@ #define INCREMENT_COUNT(token) \ do { \ if (!cm->frame_parallel_decoding_mode) \ - ++coef_counts[band][ctx][token]; \ + ++coef_counts[band][ctx][token]; \ } while (0) static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) { @@ -191,10 +194,15 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, } v = (val * dqv) >> dq_shift; #if CONFIG_COEFFICIENT_RANGE_CHECKING +#if CONFIG_VP9_HIGHBITDEPTH + dqcoeff[scan[c]] = highbd_check_range((vp9_read_bit(r) ? -v : v), + cm->bit_depth); +#else dqcoeff[scan[c]] = check_range(vp9_read_bit(r) ? -v : v); +#endif // CONFIG_VP9_HIGHBITDEPTH #else dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v; -#endif +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING token_cache[scan[c]] = vp9_pt_energy_class[token]; ++c; ctx = get_coef_context(nb, token_cache, c); diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index 752429c8f..19bcfd2b6 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -297,7 +297,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { if (bsize >= BLOCK_8X8) { write_inter_mode(w, mode, inter_probs); - ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(mode)]; } } @@ -320,7 +319,6 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, const int j = idy * 2 + idx; const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; write_inter_mode(w, b_mode, inter_probs); - ++cpi->td.counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)]; if (b_mode == NEWMV) { for (ref = 0; ref < 1 + is_compound; ++ref) vp9_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv, @@ -1172,8 +1170,6 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { prob_diff_update(vp9_inter_mode_tree, cm->fc->inter_mode_probs[i], counts->inter_mode[i], INTER_MODES, &header_bc); - vp9_zero(counts->inter_mode); - if (cm->interp_filter == SWITCHABLE) update_switchable_interp_probs(cm, &header_bc, counts); diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 56ec6b335..2266a26bd 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -458,12 +458,14 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, make_grayscale(&denoiser->running_avg_y[i]); #endif denoiser->increase_denoising = 0; + denoiser->frame_buffer_initialized = 1; return 0; } void vp9_denoiser_free(VP9_DENOISER *denoiser) { int i; + denoiser->frame_buffer_initialized = 0; if (denoiser == NULL) { return; } @@ -483,15 +485,13 @@ static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { uint8_t *u = yuv->u_buffer; uint8_t *v = yuv->v_buffer; - // The '/2's are there because we have a 440 buffer, but we want to output - // 420. - for (r = 0; r < yuv->uv_height / 2; ++r) { - for (c = 0; c < yuv->uv_width / 2; ++c) { + for (r = 0; r < yuv->uv_height; ++r) { + for (c = 0; c < yuv->uv_width; ++c) { u[c] = UINT8_MAX / 2; v[c] = UINT8_MAX / 2; } - u += yuv->uv_stride + yuv->uv_width / 2; - v += yuv->uv_stride + yuv->uv_width / 2; + u += yuv->uv_stride; + v += yuv->uv_stride; } } #endif diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h index 421dfcd0c..8eb5da1b8 100644 --- a/vp9/encoder/vp9_denoiser.h +++ b/vp9/encoder/vp9_denoiser.h @@ -29,6 +29,7 @@ typedef struct vp9_denoiser { YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; YV12_BUFFER_CONFIG mc_running_avg_y; int increase_denoising; + int frame_buffer_initialized; } VP9_DENOISER; void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser, diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 6784d0164..756052771 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -55,9 +55,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData * td, int mi_row, int mi_col, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); -// Motion vector component magnitude threshold for defining fast motion. -#define FAST_MOTION_MV_THRESH 24 - // This is used as a reference when computing the source variance for the // purposes of activity masking. // Eventually this should be replaced by custom no-reference routines, @@ -1010,22 +1007,20 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { const MACROBLOCKD *const xd = &x->e_mbd; const MODE_INFO *const mi = xd->mi[0].src_mi; const MB_MODE_INFO *const mbmi = &mi->mbmi; + const BLOCK_SIZE bsize = mbmi->sb_type; if (!frame_is_intra_only(cm)) { + FRAME_COUNTS *const counts = td->counts; + const int inter_block = is_inter_block(mbmi); const int seg_ref_active = vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME); if (!seg_ref_active) { - FRAME_COUNTS *const counts = td->counts; - const int inter_block = is_inter_block(mbmi); - counts->intra_inter[vp9_get_intra_inter_context(xd)][inter_block]++; - // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from // the reference frame counts used to work out probabilities. if (inter_block) { const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0]; - if (cm->reference_mode == REFERENCE_MODE_SELECT) counts->comp_inter[vp9_get_reference_mode_context(cm, xd)] [has_second_ref(mbmi)]++; @@ -1042,6 +1037,25 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { } } } + if (inter_block && + !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { + const int mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]]; + if (bsize >= BLOCK_8X8) { + const PREDICTION_MODE mode = mbmi->mode; + ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)]; + } else { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int idx, idy; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + const int j = idy * 2 + idx; + const PREDICTION_MODE b_mode = mi->bmi[j].as_mode; + ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)]; + } + } + } + } } } diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index d016e9742..70b804e31 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -93,7 +93,7 @@ typedef struct vp9_token_state { int rate; int error; int next; - signed char token; + int16_t token; short qc; } vp9_token_state; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index b7bfddff1..628660b5f 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1347,17 +1347,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); #endif - -#if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) { - vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS); - } -#endif } #ifndef M_LOG2_E @@ -3408,6 +3397,20 @@ static void check_initial_width(VP9_COMP *cpi, } } +#if CONFIG_VP9_TEMPORAL_DENOISING +static void setup_denoiser_buffer(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + if (cpi->oxcf.noise_sensitivity > 0 && + !cpi->denoiser.frame_buffer_initialized) { + vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS); + } +} +#endif int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, @@ -3424,6 +3427,9 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, unsigned int frame_flags, check_initial_width(cpi, subsampling_x, subsampling_y); #endif // CONFIG_VP9_HIGHBITDEPTH +#if CONFIG_VP9_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif vpx_usec_timer_start(&timer); if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags)) @@ -3998,6 +4004,10 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, check_initial_width(cpi, 1, 1); #endif // CONFIG_VP9_HIGHBITDEPTH +#if CONFIG_VP9_TEMPORAL_DENOISING + setup_denoiser_buffer(cpi); +#endif + if (width) { cm->width = width; if (cm->width > cpi->initial_width) { diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index 5154a59ea..81cc2e13f 100644 --- a/vp9/encoder/vp9_tokenize.h +++ b/vp9/encoder/vp9_tokenize.h @@ -65,7 +65,7 @@ extern const int16_t vp9_cat6_low_cost[256]; extern const int16_t vp9_cat6_high_cost[128]; extern const int16_t vp9_cat6_high10_high_cost[512]; extern const int16_t vp9_cat6_high12_high_cost[2048]; -static INLINE int16_t vp9_get_cost(uint8_t token, EXTRABIT extrabits, +static INLINE int16_t vp9_get_cost(int16_t token, EXTRABIT extrabits, const int16_t *cat6_high_table) { if (token != CATEGORY6_TOKEN) return vp9_extra_bits[token].cost[extrabits]; diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index f5e6e3190..a7ba074ba 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -133,11 +133,13 @@ ifeq ($(ARCH_X86_64), yes) VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_ssse3_x86_64.asm endif +VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c # neon with assembly and intrinsics implementations. If both are available # prefer assembly. @@ -156,9 +158,8 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon_asm$(ASM) +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM) else ifeq ($(HAVE_NEON), yes) @@ -176,8 +177,9 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_16_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_4_neon.c +VP9_COMMON_SRCS-yes += common/arm/neon/vp9_loopfilter_8_neon.c VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index 43bf35f9c..b9177876e 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -148,7 +148,11 @@ static vpx_codec_err_t decoder_peek_si_internal(const uint8_t *data, if (frame_marker != VP9_FRAME_MARKER) return VPX_CODEC_UNSUP_BITSTREAM; - if (profile >= MAX_PROFILES) return VPX_CODEC_UNSUP_BITSTREAM; + if (profile >= MAX_PROFILES) + return VPX_CODEC_UNSUP_BITSTREAM; + + if ((profile >= 2 && data_sz <= 1) || data_sz < 1) + return VPX_CODEC_UNSUP_BITSTREAM; if (vp9_rb_read_bit(&rb)) { // show an existing frame vp9_rb_read_literal(&rb, 3); // Frame buffer to show. |