diff options
author | Johann <johannkoenig@google.com> | 2014-10-31 13:42:55 -0700 |
---|---|---|
committer | Johann <johannkoenig@google.com> | 2014-10-31 13:42:55 -0700 |
commit | 2134eb2f054baaec9a796784aeeafb0b669ff601 (patch) | |
tree | e2816daadf199792fd563257fc67c72e1df275a0 /vp8/encoder/arm/neon | |
parent | 7ae75c3d525d79b9b28652fb34082cf81a5de9ab (diff) | |
download | libvpx-2134eb2f054baaec9a796784aeeafb0b669ff601.tar libvpx-2134eb2f054baaec9a796784aeeafb0b669ff601.tar.gz libvpx-2134eb2f054baaec9a796784aeeafb0b669ff601.tar.bz2 libvpx-2134eb2f054baaec9a796784aeeafb0b669ff601.zip |
Remove pair quantization
The intrinsics version of the pair quant is slower than running it
individually.
Change-Id: I7b4ea8599d4aab04be0a5a0c59b8b29a7fc283f4
Diffstat (limited to 'vp8/encoder/arm/neon')
-rw-r--r-- | vp8/encoder/arm/neon/fastquantizeb_neon.c | 124 |
1 file changed, 4 insertions, 120 deletions
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c index 48764284f..caa763765 100644 --- a/vp8/encoder/arm/neon/fastquantizeb_neon.c +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -10,13 +10,12 @@ #include <arm_neon.h> #include "vp8/encoder/block.h" -#include "vpx_mem/vpx_mem.h" static const uint16_t inv_zig_zag[16] = { - 0x0001, 0x0002, 0x0006, 0x0007, - 0x0003, 0x0005, 0x0008, 0x000d, - 0x0004, 0x0009, 0x000c, 0x000e, - 0x000a, 0x000b, 0x000f, 0x0010 + 1, 2, 6, 7, + 3, 5, 8, 13, + 4, 9, 12, 14, + 10, 11, 15, 16 }; void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { @@ -88,118 +87,3 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); } - -void vp8_fast_quantize_b_pair_neon(BLOCK *b0, BLOCK *b1, - BLOCKD *d0, BLOCKD *d1) { - const int16x8_t one_q = vdupq_n_s16(0xff), - b0_z0 = vld1q_s16(b0->coeff), - b0_z1 = vld1q_s16(b0->coeff + 8), - b0_round0 = vld1q_s16(b0->round), - b0_round1 = vld1q_s16(b0->round + 8), - b0_quant0 = vld1q_s16(b0->quant_fast), - b0_quant1 = vld1q_s16(b0->quant_fast + 8), - d0_dequant0 = vld1q_s16(d0->dequant), - d0_dequant1 = vld1q_s16(d0->dequant + 8), - b1_z0 = vld1q_s16(b1->coeff), - b1_z1 = vld1q_s16(b1->coeff + 8), - b1_round0 = vld1q_s16(b1->round), - b1_round1 = vld1q_s16(b1->round + 8), - b1_quant0 = vld1q_s16(b1->quant_fast), - b1_quant1 = vld1q_s16(b1->quant_fast + 8), - d1_dequant0 = vld1q_s16(d1->dequant), - d1_dequant1 = vld1q_s16(d1->dequant + 8); - const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag), - zig_zag1 = vld1q_u16(inv_zig_zag + 8); - int16x8_t b0_x0, b0_x1, b0_sz0, b0_sz1, b0_y0, b0_y1, - b1_x0, b1_x1, b1_sz0, b1_sz1, b1_y0, b1_y1; - uint16x8_t b0_eob0, b0_eob1, - b1_eob0, b1_eob1; - uint16x4_t b0_eob_d16, b1_eob_d16; - uint32x2_t b0_eob_d32, b1_eob_d32; - uint32x4_t b0_eob_q32, b1_eob_q32; - - /* sign of z: z >> 15 */ - b0_sz0 = vshrq_n_s16(b0_z0, 15); - b0_sz1 = vshrq_n_s16(b0_z1, 15); - b1_sz0 = 
vshrq_n_s16(b1_z0, 15); - b1_sz1 = vshrq_n_s16(b1_z1, 15); - - /* x = abs(z) */ - b0_x0 = vabsq_s16(b0_z0); - b0_x1 = vabsq_s16(b0_z1); - b1_x0 = vabsq_s16(b1_z0); - b1_x1 = vabsq_s16(b1_z1); - - /* x += round */ - b0_x0 = vaddq_s16(b0_x0, b0_round0); - b0_x1 = vaddq_s16(b0_x1, b0_round1); - b1_x0 = vaddq_s16(b1_x0, b1_round0); - b1_x1 = vaddq_s16(b1_x1, b1_round1); - - /* y = 2 * (x * quant) >> 16 */ - b0_y0 = vqdmulhq_s16(b0_x0, b0_quant0); - b0_y1 = vqdmulhq_s16(b0_x1, b0_quant1); - b1_y0 = vqdmulhq_s16(b1_x0, b1_quant0); - b1_y1 = vqdmulhq_s16(b1_x1, b1_quant1); - - /* Compensate for doubling in vqdmulhq */ - b0_y0 = vshrq_n_s16(b0_y0, 1); - b0_y1 = vshrq_n_s16(b0_y1, 1); - b1_y0 = vshrq_n_s16(b1_y0, 1); - b1_y1 = vshrq_n_s16(b1_y1, 1); - - /* Restore sign bit */ - b0_y0 = veorq_s16(b0_y0, b0_sz0); - b0_y1 = veorq_s16(b0_y1, b0_sz1); - b0_x0 = vsubq_s16(b0_y0, b0_sz0); - b0_x1 = vsubq_s16(b0_y1, b0_sz1); - b1_y0 = veorq_s16(b1_y0, b1_sz0); - b1_y1 = veorq_s16(b1_y1, b1_sz1); - b1_x0 = vsubq_s16(b1_y0, b1_sz0); - b1_x1 = vsubq_s16(b1_y1, b1_sz1); - - /* find non-zero elements */ - b0_eob0 = vtstq_s16(b0_x0, one_q); - b0_eob1 = vtstq_s16(b0_x1, one_q); - b1_eob0 = vtstq_s16(b1_x0, one_q); - b1_eob1 = vtstq_s16(b1_x1, one_q); - - /* mask zig zag */ - b0_eob0 = vandq_u16(b0_eob0, zig_zag0); - b0_eob1 = vandq_u16(b0_eob1, zig_zag1); - b1_eob0 = vandq_u16(b1_eob0, zig_zag0); - b1_eob1 = vandq_u16(b1_eob1, zig_zag1); - - /* select the largest value */ - b0_eob0 = vmaxq_u16(b0_eob0, b0_eob1); - b0_eob_d16 = vmax_u16(vget_low_u16(b0_eob0), - vget_high_u16(b0_eob0)); - b0_eob_q32 = vmovl_u16(b0_eob_d16); - b0_eob_d32 = vmax_u32(vget_low_u32(b0_eob_q32), - vget_high_u32(b0_eob_q32)); - b0_eob_d32 = vpmax_u32(b0_eob_d32, b0_eob_d32); - - b1_eob0 = vmaxq_u16(b1_eob0, b1_eob1); - b1_eob_d16 = vmax_u16(vget_low_u16(b1_eob0), - vget_high_u16(b1_eob0)); - b1_eob_q32 = vmovl_u16(b1_eob_d16); - b1_eob_d32 = vmax_u32(vget_low_u32(b1_eob_q32), - vget_high_u32(b1_eob_q32)); - 
b1_eob_d32 = vpmax_u32(b1_eob_d32, b1_eob_d32); - - /* qcoeff = x */ - vst1q_s16(d0->qcoeff, b0_x0); - vst1q_s16(d0->qcoeff + 8, b0_x1); - vst1q_s16(d1->qcoeff, b1_x0); - vst1q_s16(d1->qcoeff + 8, b1_x1); - - /* dqcoeff = x * dequant */ - vst1q_s16(d0->dqcoeff, vmulq_s16(d0_dequant0, b0_x0)); - vst1q_s16(d0->dqcoeff + 8, vmulq_s16(d0_dequant1, b0_x1)); - vst1q_s16(d1->dqcoeff, vmulq_s16(d1_dequant0, b1_x0)); - vst1q_s16(d1->dqcoeff + 8, vmulq_s16(d1_dequant1, b1_x1)); - - vst1_lane_s8((int8_t *)d0->eob, vreinterpret_s8_u32(b0_eob_d32), 0); - vst1_lane_s8((int8_t *)d1->eob, vreinterpret_s8_u32(b1_eob_d32), 0); - return; -} |