author    Jonathan Wright <jonathan.wright@arm.com>    2023-02-25 00:43:46 +0000
committer Jonathan Wright <jonathan.wright@arm.com>    2023-02-27 09:49:02 +0000
commit    b25cca8c2edba5fbc18448007da2624a25113f4d (patch)
tree      6b4a8cb4adbcdabb409c13954db1c6e18b869e06
parent    45dc0d34d2fa1a848c282d8fc992206fa69f01b8 (diff)
Optimize transpose_neon.h helper functions
1) Use vtrn[12]q_[su]64 in the vpx_vtrnq_[su]64* helpers on AArch64 targets.
   This produces half as many TRN1/TRN2 instructions as the number of MOVs
   that result from vcombine.

2) Use the vpx_vtrnq_[su]64* helpers wherever applicable.

3) Refactor transpose_s16_4x8 to operate on 128-bit vectors.

Change-Id: I9a8b1c1fe2a98a429e0c5f39def5eb2f65759127
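For illustration, point (1) reduces to the pattern below (a minimal standalone sketch, not part of the patch; the helper name zip_64bit_lanes_s32 is invented for this example). On AArch64 each output vector is produced by a single TRN1/TRN2 on 64-bit lanes, while the Armv7 vcombine path is typically lowered to vector MOVs:

#include <arm_neon.h>

// Interleave the 64-bit halves of two 128-bit vectors:
//   a0: 00 01 02 03   a1: 10 11 12 13
//   -> val[0]: 00 01 10 11, val[1]: 02 03 12 13
static inline int32x4x2_t zip_64bit_lanes_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b;
#if defined(__aarch64__)
  // One TRN1 and one TRN2 instruction in total.
  b.val[0] = vreinterpretq_s32_s64(
      vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
  b.val[1] = vreinterpretq_s32_s64(
      vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
  // Armv7: reassemble from D-register halves; compilers typically emit
  // vector MOVs for each half.
  b.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
#endif
  return b;
}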
-rw-r--r--    vpx_dsp/arm/transpose_neon.h    108
1 file changed, 53 insertions(+), 55 deletions(-)
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 48292c693..518278f30 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -39,26 +39,45 @@ static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
int32x4x2_t b0;
+#if defined(__aarch64__)
+ b0.val[0] = vreinterpretq_s32_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s32_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+#endif
return b0;
}
static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
int64x2x2_t b0;
+#if defined(__aarch64__)
+ b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+ b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
+#else
b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
vreinterpret_s64_s32(vget_low_s32(a1)));
b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
vreinterpret_s64_s32(vget_high_s32(a1)));
+#endif
return b0;
}
static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
uint8x16x2_t b0;
+#if defined(__aarch64__)
+ b0.val[0] = vreinterpretq_u8_u64(
+ vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+ b0.val[1] = vreinterpretq_u8_u64(
+ vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
vreinterpret_u8_u32(vget_low_u32(a1)));
b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
vreinterpret_u8_u32(vget_high_u32(a1)));
+#endif
return b0;
}
@@ -155,17 +174,13 @@ static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
// c0: 00 01 20 21 02 03 22 23
// c1: 10 11 30 31 12 13 32 33
- const int32x4_t c0 =
- vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1]));
- const int32x4_t c1 =
- vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1]));
+ const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]);
// Swap 16 bit elements resulting in:
// d0.val[0]: 00 10 20 30 02 12 22 32
// d0.val[1]: 01 11 21 31 03 13 23 33
- const int16x8x2_t d0 =
- vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1));
+ const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]);
*a0 = d0.val[0];
*a1 = d0.val[1];
@@ -186,17 +201,13 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
// c0: 00 01 20 21 02 03 22 23
// c1: 10 11 30 31 12 13 32 33
- const uint32x4_t c0 =
- vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1]));
- const uint32x4_t c1 =
- vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1]));
+ const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]);
// Swap 16 bit elements resulting in:
// d0.val[0]: 00 10 20 30 02 12 22 32
// d0.val[1]: 01 11 21 31 03 13 23 33
- const uint16x8x2_t d0 =
- vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1));
+ const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]);
*a0 = d0.val[0];
*a1 = d0.val[1];
@@ -295,7 +306,7 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
const int16x4_t a6, const int16x4_t a7,
int16x8_t *const o0, int16x8_t *const o1,
int16x8_t *const o2, int16x8_t *const o3) {
- // Swap 16 bit elements. Goes from:
+ // Combine rows. Goes from:
// a0: 00 01 02 03
// a1: 10 11 12 13
// a2: 20 21 22 23
@@ -305,53 +316,40 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
// a6: 60 61 62 63
// a7: 70 71 72 73
// to:
- // b0.val[0]: 00 10 02 12
- // b0.val[1]: 01 11 03 13
- // b1.val[0]: 20 30 22 32
- // b1.val[1]: 21 31 23 33
- // b2.val[0]: 40 50 42 52
- // b2.val[1]: 41 51 43 53
- // b3.val[0]: 60 70 62 72
- // b3.val[1]: 61 71 63 73
+ // b0: 00 01 02 03 40 41 42 43
+ // b1: 10 11 12 13 50 51 52 53
+ // b2: 20 21 22 23 60 61 62 63
+ // b3: 30 31 32 33 70 71 72 73
+
+ const int16x8_t b0 = vcombine_s16(a0, a4);
+ const int16x8_t b1 = vcombine_s16(a1, a5);
+ const int16x8_t b2 = vcombine_s16(a2, a6);
+ const int16x8_t b3 = vcombine_s16(a3, a7);
- const int16x4x2_t b0 = vtrn_s16(a0, a1);
- const int16x4x2_t b1 = vtrn_s16(a2, a3);
- const int16x4x2_t b2 = vtrn_s16(a4, a5);
- const int16x4x2_t b3 = vtrn_s16(a6, a7);
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 02 12 40 50 42 52
+ // c0.val[1]: 01 11 03 13 41 51 43 53
+ // c1.val[0]: 20 30 22 32 60 70 62 72
+ // c1.val[1]: 21 31 23 33 61 71 63 73
+
+ const int16x8x2_t c0 = vtrnq_s16(b0, b1);
+ const int16x8x2_t c1 = vtrnq_s16(b2, b3);
// Swap 32 bit elements resulting in:
- // c0.val[0]: 00 10 20 30
- // c0.val[1]: 02 12 22 32
- // c1.val[0]: 01 11 21 31
- // c1.val[1]: 03 13 23 33
- // c2.val[0]: 40 50 60 70
- // c2.val[1]: 42 52 62 72
- // c3.val[0]: 41 51 61 71
- // c3.val[1]: 43 53 63 73
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 02 12 22 32 42 52 62 72
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 03 13 23 33 43 53 63 73
- const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
- vreinterpret_s32_s16(b1.val[0]));
- const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
- vreinterpret_s32_s16(b1.val[1]));
- const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
- vreinterpret_s32_s16(b3.val[0]));
- const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
- vreinterpret_s32_s16(b3.val[1]));
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
- // Swap 64 bit elements resulting in:
- // o0: 00 10 20 30 40 50 60 70
- // o1: 01 11 21 31 41 51 61 71
- // o2: 02 12 22 32 42 52 62 72
- // o3: 03 13 23 33 43 53 63 73
-
- *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
- vreinterpret_s16_s32(c2.val[0]));
- *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
- vreinterpret_s16_s32(c3.val[0]));
- *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
- vreinterpret_s16_s32(c2.val[1]));
- *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
- vreinterpret_s16_s32(c3.val[1]));
+ *o0 = vreinterpretq_s16_s32(d0.val[0]);
+ *o1 = vreinterpretq_s16_s32(d1.val[0]);
+ *o2 = vreinterpretq_s16_s32(d0.val[1]);
+ *o3 = vreinterpretq_s16_s32(d1.val[1]);
}
static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,