aboutsummaryrefslogtreecommitdiff
path: root/sysdeps/s390/s390-64/chacha20-s390x.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/s390/s390-64/chacha20-s390x.S')
-rw-r--r--sysdeps/s390/s390-64/chacha20-s390x.S573
1 files changed, 0 insertions, 573 deletions
diff --git a/sysdeps/s390/s390-64/chacha20-s390x.S b/sysdeps/s390/s390-64/chacha20-s390x.S
deleted file mode 100644
index e38504d370..0000000000
--- a/sysdeps/s390/s390-64/chacha20-s390x.S
+++ /dev/null
@@ -1,573 +0,0 @@
-/* Optimized s390x implementation of ChaCha20 cipher.
- Copyright (C) 2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-/* chacha20-s390x.S - zSeries implementation of ChaCha20 cipher
-
- Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
-
- This file is part of Libgcrypt.
-
- Libgcrypt is free software; you can redistribute it and/or modify
- it under the terms of the GNU Lesser General Public License as
- published by the Free Software Foundation; either version 2.1 of
- the License, or (at your option) any later version.
-
- Libgcrypt is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with this program; if not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <sysdep.h>
-
-#ifdef HAVE_S390_VX_ASM_SUPPORT
-
-/* CFA expressions are used for pointing CFA and registers to
- * SP relative offsets. */
-# define DW_REGNO_SP 15
-
-/* Fixed length encoding used for integers for now. */
-# define DW_SLEB128_7BIT(value) \
- 0x00|((value) & 0x7f)
-# define DW_SLEB128_28BIT(value) \
- 0x80|((value)&0x7f), \
- 0x80|(((value)>>7)&0x7f), \
- 0x80|(((value)>>14)&0x7f), \
- 0x00|(((value)>>21)&0x7f)
-
-# define cfi_cfa_on_stack(rsp_offs,cfa_depth) \
- .cfi_escape \
- 0x0f, /* DW_CFA_def_cfa_expression */ \
- DW_SLEB128_7BIT(11), /* length */ \
- 0x7f, /* DW_OP_breg15, rsp + constant */ \
- DW_SLEB128_28BIT(rsp_offs), \
- 0x06, /* DW_OP_deref */ \
- 0x23, /* DW_OP_plus_constu */ \
- DW_SLEB128_28BIT((cfa_depth)+160)
-
-.machine "z13+vx"
-.text
-
-.balign 16
-.Lconsts:
-.Lwordswap:
- .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
-.Lbswap128:
- .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.Lbswap32:
- .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
-.Lone:
- .long 0, 0, 0, 1
-.Ladd_counter_0123:
- .long 0, 1, 2, 3
-.Ladd_counter_4567:
- .long 4, 5, 6, 7
-
-/* register macros */
-#define INPUT %r2
-#define DST %r3
-#define SRC %r4
-#define NBLKS %r0
-#define ROUND %r1
-
-/* stack structure */
-
-#define STACK_FRAME_STD (8 * 16 + 8 * 4)
-#define STACK_FRAME_F8_F15 (8 * 8)
-#define STACK_FRAME_Y0_Y15 (16 * 16)
-#define STACK_FRAME_CTR (4 * 16)
-#define STACK_FRAME_PARAMS (6 * 8)
-
-#define STACK_MAX (STACK_FRAME_STD + STACK_FRAME_F8_F15 + \
- STACK_FRAME_Y0_Y15 + STACK_FRAME_CTR + \
- STACK_FRAME_PARAMS)
-
-#define STACK_F8 (STACK_MAX - STACK_FRAME_F8_F15)
-#define STACK_F9 (STACK_F8 + 8)
-#define STACK_F10 (STACK_F9 + 8)
-#define STACK_F11 (STACK_F10 + 8)
-#define STACK_F12 (STACK_F11 + 8)
-#define STACK_F13 (STACK_F12 + 8)
-#define STACK_F14 (STACK_F13 + 8)
-#define STACK_F15 (STACK_F14 + 8)
-#define STACK_Y0_Y15 (STACK_F8 - STACK_FRAME_Y0_Y15)
-#define STACK_CTR (STACK_Y0_Y15 - STACK_FRAME_CTR)
-#define STACK_INPUT (STACK_CTR - STACK_FRAME_PARAMS)
-#define STACK_DST (STACK_INPUT + 8)
-#define STACK_SRC (STACK_DST + 8)
-#define STACK_NBLKS (STACK_SRC + 8)
-#define STACK_POCTX (STACK_NBLKS + 8)
-#define STACK_POSRC (STACK_POCTX + 8)
-
-#define STACK_G0_H3 STACK_Y0_Y15
-
-/* vector registers */
-#define A0 %v0
-#define A1 %v1
-#define A2 %v2
-#define A3 %v3
-
-#define B0 %v4
-#define B1 %v5
-#define B2 %v6
-#define B3 %v7
-
-#define C0 %v8
-#define C1 %v9
-#define C2 %v10
-#define C3 %v11
-
-#define D0 %v12
-#define D1 %v13
-#define D2 %v14
-#define D3 %v15
-
-#define E0 %v16
-#define E1 %v17
-#define E2 %v18
-#define E3 %v19
-
-#define F0 %v20
-#define F1 %v21
-#define F2 %v22
-#define F3 %v23
-
-#define G0 %v24
-#define G1 %v25
-#define G2 %v26
-#define G3 %v27
-
-#define H0 %v28
-#define H1 %v29
-#define H2 %v30
-#define H3 %v31
-
-#define IO0 E0
-#define IO1 E1
-#define IO2 E2
-#define IO3 E3
-#define IO4 F0
-#define IO5 F1
-#define IO6 F2
-#define IO7 F3
-
-#define S0 G0
-#define S1 G1
-#define S2 G2
-#define S3 G3
-
-#define TMP0 H0
-#define TMP1 H1
-#define TMP2 H2
-#define TMP3 H3
-
-#define X0 A0
-#define X1 A1
-#define X2 A2
-#define X3 A3
-#define X4 B0
-#define X5 B1
-#define X6 B2
-#define X7 B3
-#define X8 C0
-#define X9 C1
-#define X10 C2
-#define X11 C3
-#define X12 D0
-#define X13 D1
-#define X14 D2
-#define X15 D3
-
-#define Y0 E0
-#define Y1 E1
-#define Y2 E2
-#define Y3 E3
-#define Y4 F0
-#define Y5 F1
-#define Y6 F2
-#define Y7 F3
-#define Y8 G0
-#define Y9 G1
-#define Y10 G2
-#define Y11 G3
-#define Y12 H0
-#define Y13 H1
-#define Y14 H2
-#define Y15 H3
-
-/**********************************************************************
- helper macros
- **********************************************************************/
-
-#define _ /*_*/
-
-#define START_STACK(last_r) \
- lgr %r0, %r15; \
- lghi %r1, ~15; \
- stmg %r6, last_r, 6 * 8(%r15); \
- aghi %r0, -STACK_MAX; \
- ngr %r0, %r1; \
- lgr %r1, %r15; \
- cfi_def_cfa_register(1); \
- lgr %r15, %r0; \
- stg %r1, 0(%r15); \
- cfi_cfa_on_stack(0, 0); \
- std %f8, STACK_F8(%r15); \
- std %f9, STACK_F9(%r15); \
- std %f10, STACK_F10(%r15); \
- std %f11, STACK_F11(%r15); \
- std %f12, STACK_F12(%r15); \
- std %f13, STACK_F13(%r15); \
- std %f14, STACK_F14(%r15); \
- std %f15, STACK_F15(%r15);
-
-#define END_STACK(last_r) \
- lg %r1, 0(%r15); \
- ld %f8, STACK_F8(%r15); \
- ld %f9, STACK_F9(%r15); \
- ld %f10, STACK_F10(%r15); \
- ld %f11, STACK_F11(%r15); \
- ld %f12, STACK_F12(%r15); \
- ld %f13, STACK_F13(%r15); \
- ld %f14, STACK_F14(%r15); \
- ld %f15, STACK_F15(%r15); \
- lmg %r6, last_r, 6 * 8(%r1); \
- lgr %r15, %r1; \
- cfi_def_cfa_register(DW_REGNO_SP);
-
-#define PLUS(dst,src) \
- vaf dst, dst, src;
-
-#define XOR(dst,src) \
- vx dst, dst, src;
-
-#define ROTATE(v1,c) \
- verllf v1, v1, (c)(0);
-
-#define WORD_ROTATE(v1,s) \
- vsldb v1, v1, v1, ((s) * 4);
-
-#define DST_8(OPER, I, J) \
- OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); \
- OPER(E##I, J); OPER(F##I, J); OPER(G##I, J); OPER(H##I, J);
-
-/**********************************************************************
- round macros
- **********************************************************************/
-
-/**********************************************************************
- 8-way chacha20 ("vertical")
- **********************************************************************/
-
-#define QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
- x8,x9,x10,x11,x12,x13,x14,x15,\
- y0,y1,y2,y3,y4,y5,y6,y7,\
- y8,y9,y10,y11,y12,y13,y14,y15,\
- op1,op2,op3,op4,op5,op6,op7,op8,\
- op9,op10,op11,op12) \
- op1; \
- PLUS(x0, x1); PLUS(x4, x5); \
- PLUS(x8, x9); PLUS(x12, x13); \
- PLUS(y0, y1); PLUS(y4, y5); \
- PLUS(y8, y9); PLUS(y12, y13); \
- op2; \
- XOR(x3, x0); XOR(x7, x4); \
- XOR(x11, x8); XOR(x15, x12); \
- XOR(y3, y0); XOR(y7, y4); \
- XOR(y11, y8); XOR(y15, y12); \
- op3; \
- ROTATE(x3, 16); ROTATE(x7, 16); \
- ROTATE(x11, 16); ROTATE(x15, 16); \
- ROTATE(y3, 16); ROTATE(y7, 16); \
- ROTATE(y11, 16); ROTATE(y15, 16); \
- op4; \
- PLUS(x2, x3); PLUS(x6, x7); \
- PLUS(x10, x11); PLUS(x14, x15); \
- PLUS(y2, y3); PLUS(y6, y7); \
- PLUS(y10, y11); PLUS(y14, y15); \
- op5; \
- XOR(x1, x2); XOR(x5, x6); \
- XOR(x9, x10); XOR(x13, x14); \
- XOR(y1, y2); XOR(y5, y6); \
- XOR(y9, y10); XOR(y13, y14); \
- op6; \
- ROTATE(x1,12); ROTATE(x5,12); \
- ROTATE(x9,12); ROTATE(x13,12); \
- ROTATE(y1,12); ROTATE(y5,12); \
- ROTATE(y9,12); ROTATE(y13,12); \
- op7; \
- PLUS(x0, x1); PLUS(x4, x5); \
- PLUS(x8, x9); PLUS(x12, x13); \
- PLUS(y0, y1); PLUS(y4, y5); \
- PLUS(y8, y9); PLUS(y12, y13); \
- op8; \
- XOR(x3, x0); XOR(x7, x4); \
- XOR(x11, x8); XOR(x15, x12); \
- XOR(y3, y0); XOR(y7, y4); \
- XOR(y11, y8); XOR(y15, y12); \
- op9; \
- ROTATE(x3,8); ROTATE(x7,8); \
- ROTATE(x11,8); ROTATE(x15,8); \
- ROTATE(y3,8); ROTATE(y7,8); \
- ROTATE(y11,8); ROTATE(y15,8); \
- op10; \
- PLUS(x2, x3); PLUS(x6, x7); \
- PLUS(x10, x11); PLUS(x14, x15); \
- PLUS(y2, y3); PLUS(y6, y7); \
- PLUS(y10, y11); PLUS(y14, y15); \
- op11; \
- XOR(x1, x2); XOR(x5, x6); \
- XOR(x9, x10); XOR(x13, x14); \
- XOR(y1, y2); XOR(y5, y6); \
- XOR(y9, y10); XOR(y13, y14); \
- op12; \
- ROTATE(x1,7); ROTATE(x5,7); \
- ROTATE(x9,7); ROTATE(x13,7); \
- ROTATE(y1,7); ROTATE(y5,7); \
- ROTATE(y9,7); ROTATE(y13,7);
-
-#define QUARTERROUND4_V8(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,\
- y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15) \
- QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
- x8,x9,x10,x11,x12,x13,x14,x15,\
- y0,y1,y2,y3,y4,y5,y6,y7,\
- y8,y9,y10,y11,y12,y13,y14,y15,\
- ,,,,,,,,,,,)
-
-#define TRANSPOSE_4X4_2(v0,v1,v2,v3,va,vb,vc,vd,tmp0,tmp1,tmp2,tmpa,tmpb,tmpc) \
- vmrhf tmp0, v0, v1; \
- vmrhf tmp1, v2, v3; \
- vmrlf tmp2, v0, v1; \
- vmrlf v3, v2, v3; \
- vmrhf tmpa, va, vb; \
- vmrhf tmpb, vc, vd; \
- vmrlf tmpc, va, vb; \
- vmrlf vd, vc, vd; \
- vpdi v0, tmp0, tmp1, 0; \
- vpdi v1, tmp0, tmp1, 5; \
- vpdi v2, tmp2, v3, 0; \
- vpdi v3, tmp2, v3, 5; \
- vpdi va, tmpa, tmpb, 0; \
- vpdi vb, tmpa, tmpb, 5; \
- vpdi vc, tmpc, vd, 0; \
- vpdi vd, tmpc, vd, 5;
-
-.balign 8
-.globl __chacha20_s390x_vx_blocks8
-ENTRY (__chacha20_s390x_vx_blocks8)
- /* input:
- * %r2: input
- * %r3: dst
- * %r4: src
- * %r5: nblks (multiple of 8)
- */
-
- START_STACK(%r8);
- lgr NBLKS, %r5;
-
- larl %r7, .Lconsts;
-
- /* Load counter. */
- lg %r8, (12 * 4)(INPUT);
- rllg %r8, %r8, 32;
-
-.balign 4
- /* Process eight chacha20 blocks per loop. */
-.Lloop8:
- vlm Y0, Y3, 0(INPUT);
-
- slgfi NBLKS, 8;
- lghi ROUND, (20 / 2);
-
- /* Construct counter vectors X12/X13 & Y12/Y13. */
- vl X4, (.Ladd_counter_0123 - .Lconsts)(%r7);
- vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r7);
- vrepf Y12, Y3, 0;
- vrepf Y13, Y3, 1;
- vaccf X5, Y12, X4;
- vaccf Y5, Y12, Y4;
- vaf X12, Y12, X4;
- vaf Y12, Y12, Y4;
- vaf X13, Y13, X5;
- vaf Y13, Y13, Y5;
-
- vrepf X0, Y0, 0;
- vrepf X1, Y0, 1;
- vrepf X2, Y0, 2;
- vrepf X3, Y0, 3;
- vrepf X4, Y1, 0;
- vrepf X5, Y1, 1;
- vrepf X6, Y1, 2;
- vrepf X7, Y1, 3;
- vrepf X8, Y2, 0;
- vrepf X9, Y2, 1;
- vrepf X10, Y2, 2;
- vrepf X11, Y2, 3;
- vrepf X14, Y3, 2;
- vrepf X15, Y3, 3;
-
- /* Store counters for blocks 0-7. */
- vstm X12, X13, (STACK_CTR + 0 * 16)(%r15);
- vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15);
-
- vlr Y0, X0;
- vlr Y1, X1;
- vlr Y2, X2;
- vlr Y3, X3;
- vlr Y4, X4;
- vlr Y5, X5;
- vlr Y6, X6;
- vlr Y7, X7;
- vlr Y8, X8;
- vlr Y9, X9;
- vlr Y10, X10;
- vlr Y11, X11;
- vlr Y14, X14;
- vlr Y15, X15;
-
- /* Update and store counter. */
- agfi %r8, 8;
- rllg %r5, %r8, 32;
- stg %r5, (12 * 4)(INPUT);
-
-.balign 4
-.Lround2_8:
- QUARTERROUND4_V8(X0, X4, X8, X12, X1, X5, X9, X13,
- X2, X6, X10, X14, X3, X7, X11, X15,
- Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13,
- Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15);
- QUARTERROUND4_V8(X0, X5, X10, X15, X1, X6, X11, X12,
- X2, X7, X8, X13, X3, X4, X9, X14,
- Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12,
- Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14);
- brctg ROUND, .Lround2_8;
-
- /* Store blocks 4-7. */
- vstm Y0, Y15, STACK_Y0_Y15(%r15);
-
- /* Load counters for blocks 0-3. */
- vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15);
-
- lghi ROUND, 1;
- j .Lfirst_output_4blks_8;
-
-.balign 4
-.Lsecond_output_4blks_8:
- /* Load blocks 4-7. */
- vlm X0, X15, STACK_Y0_Y15(%r15);
-
- /* Load counters for blocks 4-7. */
- vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15);
-
- lghi ROUND, 0;
-
-.balign 4
- /* Output four chacha20 blocks per loop. */
-.Lfirst_output_4blks_8:
- vlm Y12, Y15, 0(INPUT);
- PLUS(X12, Y0);
- PLUS(X13, Y1);
- vrepf Y0, Y12, 0;
- vrepf Y1, Y12, 1;
- vrepf Y2, Y12, 2;
- vrepf Y3, Y12, 3;
- vrepf Y4, Y13, 0;
- vrepf Y5, Y13, 1;
- vrepf Y6, Y13, 2;
- vrepf Y7, Y13, 3;
- vrepf Y8, Y14, 0;
- vrepf Y9, Y14, 1;
- vrepf Y10, Y14, 2;
- vrepf Y11, Y14, 3;
- vrepf Y14, Y15, 2;
- vrepf Y15, Y15, 3;
- PLUS(X0, Y0);
- PLUS(X1, Y1);
- PLUS(X2, Y2);
- PLUS(X3, Y3);
- PLUS(X4, Y4);
- PLUS(X5, Y5);
- PLUS(X6, Y6);
- PLUS(X7, Y7);
- PLUS(X8, Y8);
- PLUS(X9, Y9);
- PLUS(X10, Y10);
- PLUS(X11, Y11);
- PLUS(X14, Y14);
- PLUS(X15, Y15);
-
- vl Y15, (.Lbswap32 - .Lconsts)(%r7);
- TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7,
- Y9, Y10, Y11, Y12, Y13, Y14);
- TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15,
- Y9, Y10, Y11, Y12, Y13, Y14);
-
- vlm Y0, Y14, 0(SRC);
- vperm X0, X0, X0, Y15;
- vperm X1, X1, X1, Y15;
- vperm X2, X2, X2, Y15;
- vperm X3, X3, X3, Y15;
- vperm X4, X4, X4, Y15;
- vperm X5, X5, X5, Y15;
- vperm X6, X6, X6, Y15;
- vperm X7, X7, X7, Y15;
- vperm X8, X8, X8, Y15;
- vperm X9, X9, X9, Y15;
- vperm X10, X10, X10, Y15;
- vperm X11, X11, X11, Y15;
- vperm X12, X12, X12, Y15;
- vperm X13, X13, X13, Y15;
- vperm X14, X14, X14, Y15;
- vperm X15, X15, X15, Y15;
- vl Y15, (15 * 16)(SRC);
-
- XOR(Y0, X0);
- XOR(Y1, X4);
- XOR(Y2, X8);
- XOR(Y3, X12);
- XOR(Y4, X1);
- XOR(Y5, X5);
- XOR(Y6, X9);
- XOR(Y7, X13);
- XOR(Y8, X2);
- XOR(Y9, X6);
- XOR(Y10, X10);
- XOR(Y11, X14);
- XOR(Y12, X3);
- XOR(Y13, X7);
- XOR(Y14, X11);
- XOR(Y15, X15);
- vstm Y0, Y15, 0(DST);
-
- aghi SRC, 256;
- aghi DST, 256;
-
- clgije ROUND, 1, .Lsecond_output_4blks_8;
-
- clgijhe NBLKS, 8, .Lloop8;
-
-
- END_STACK(%r8);
- xgr %r2, %r2;
- br %r14;
-END (__chacha20_s390x_vx_blocks8)
-
-#endif /* HAVE_S390_VX_ASM_SUPPORT */