2 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
7 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
26 #include <linux/linkage.h>
27 #include <asm/frame.h>
28 #include "glue_helper-asm-avx.S"
30 .file "cast6-avx-x86_64-asm_64.S"
37 /* structure of crypto context */
47 /**********************************************************************
49 **********************************************************************/
/*
 * lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg):
 *   Scalar s-box phase of the CAST-256 round function. Extracts bytes of
 *   the 64-bit GP register `src` two at a time (through the bh/bl views
 *   feeding RID1d/RID2d) and folds the four s-box table entries s1..s4
 *   into the 32-bit `dst`, combining them with the caller-chosen ops
 *   op1..op3 (xorl/subl/addl — selected per round type by F1_2/F2_2/F3_2).
 *   `interleave_op(il_reg)` is a hook (dummy or shr_next) run between the
 *   second byte extraction and the last two table loads, letting a shift
 *   of another live register overlap the memory accesses.
 * NOTE(review): this excerpt does not show a shift of `src` between the
 * two movzbl pairs; the full macro presumably advances `src` so the second
 * pair reads the next two bytes — confirm against the complete file.
 */
100 #define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
101 movzbl src ## bh, RID1d; \
102 movzbl src ## bl, RID2d; \
104 movl s1(, RID1, 4), dst ## d; \
105 op1 s2(, RID2, 4), dst ## d; \
106 movzbl src ## bh, RID1d; \
107 movzbl src ## bl, RID2d; \
108 interleave_op(il_reg); \
109 op2 s3(, RID1, 4), dst ## d; \
110 op3 s4(, RID2, 4), dst ## d;
/* dummy(d): no-op interleave hook for lookup_32bit (used when no further
 * shift of the source register is needed). */
112 #define dummy(d) /* do nothing */

/*
 * shr_next(reg): interleave hook that advances `reg` to its next byte
 * pair for the following lookup_32bit invocation.
 * NOTE(review): the macro body (presumably a shift of `reg`) is elided in
 * this excerpt — confirm against the complete file.
 */
114 #define shr_next(reg) \
/*
 * F_head(a, x, gi1, gi2, op0): vector head of the round function.
 * op0 is one of vpaddd/vpxor/vpsubd (see F1_2/F2_2/F3_2 below), applied
 * presumably with the broadcast masking key RKM; the visible vpslld uses
 * the per-round rotation amount RKRF (left half of a variable rotate —
 * the matching right shift by RKRR is not visible here).
 * NOTE(review): all but one line of this macro body is elided in this
 * excerpt — do not modify without the complete source.
 */
117 #define F_head(a, x, gi1, gi2, op0) \
119 vpslld RKRF, x, RTMP; \
/*
 * F_tail(a, x, gi1, gi2, op1, op2, op3): scalar tail of the round
 * function. Runs the s-box lookup (lookup_32bit) over both 64-bit halves
 * held in gi1/gi2, producing 32-bit partial results in RFS1/RFS2/RFS3,
 * and reassembles them into the vector register x (the visible vpinsrq
 * writes the high 64-bit lane of x from RFS3).
 * NOTE(review): several combining/move lines between the lookups are
 * elided in this excerpt (orig. lines 129, 131-137) — the exact packing
 * of RFS1/RFS2 into the low lane is not visible here.
 */
126 #define F_tail(a, x, gi1, gi2, op1, op2, op3) \
127 lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
128 lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
130 lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
133 lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
138 vpinsrq $1, RFS3, x, x;
/*
 * F_2(a1, b1, a2, b2, op0, op1, op2, op3): apply the round function to
 * two block groups in parallel — vector head (F_head) for both b1 and b2,
 * then the scalar s-box tail (F_tail) for both, using RGI1/RGI2 and
 * RGI3/RGI4 as the respective GP staging registers.
 * NOTE(review): the lines moving halves between vector and GP registers
 * and folding the result into a1/a2 (orig. lines 143, 146-148) are elided
 * in this excerpt.
 */
140 #define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
141 F_head(b1, RX, RGI1, RGI2, op0); \
142 F_head(b2, RX, RGI3, RGI4, op0); \
144 F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
145 F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
/*
 * The three CAST-256 round-function types (RFC 2612, f1/f2/f3): they are
 * the same skeleton (F_2) with the +, ^, - operations rotated — op0 is
 * the vector key-combining op, op1..op3 are the scalar s-box combiners.
 */
/* f1: key add (vpaddd), then s-boxes combined with xor, sub, add. */
150 #define F1_2(a1, b1, a2, b2) \
151 F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
/* f2: key xor (vpxor), then s-boxes combined with sub, add, xor. */
152 #define F2_2(a1, b1, a2, b2) \
153 F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
/* f3: key sub (vpsubd), then s-boxes combined with add, xor, sub. */
154 #define F3_2(a1, b1, a2, b2) \
155 F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
/*
 * qop(in, out, f): one quad-round step — expands (by token pasting) to
 * F<f>_2, applying round-function type f (1, 2 or 3) from the `in`
 * register pair into the `out` register pair, for both parallel block
 * groups (suffixes 1 and 2).
 */
157 #define qop(in, out, f) \
158 F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
/*
 * get_round_keys(nn): load round key nn from the crypto context.
 *   - broadcast the 32-bit masking key km[nn] into all lanes of RKM;
 *   - RKRF = low byte of the rotation-key byte queue RKR (isolated by
 *     the R1ST mask);
 *   - RKRR = R32 - RKRF (R32 is loaded from .L32_mask by the callers;
 *     presumably the constant 32, giving the complementary shift count
 *     for a variable 32-bit rotate — confirm against .L32_mask);
 *   - pop the consumed byte off the RKR queue (vpsrldq $1).
 */
160 #define get_round_keys(nn) \
161 vbroadcastss (km+(4*(nn)))(CTX), RKM; \
162 vpand R1ST, RKR, RKRF; \
163 vpsubq RKRF, R32, RKRR; \
164 vpsrldq $1, RKR, RKR;
167 get_round_keys(4*n+0); \
170 get_round_keys(4*n+1); \
173 get_round_keys(4*n+2); \
176 get_round_keys(4*n+3); \
180 get_round_keys(4*n+3); \
183 get_round_keys(4*n+2); \
186 get_round_keys(4*n+1); \
189 get_round_keys(4*n+0); \
/* shuffle(mask): byte-permute the rotation-key queue RKR according to
 * `mask` (used by preload_rkr to reorder kr[] bytes into consumption
 * order for the coming rounds). */
192 #define shuffle(mask) \
193 vpshufb mask, RKR, RKR;
/*
 * preload_rkr(n, do_mask, mask): load the n-th group of 16 rotation-key
 * bytes, kr[n*16 .. n*16+15], from the context into RKR, XORed with the
 * broadcast .L16_mask (adding a 16-bit rotation to each key rotation,
 * mod 32, per the inline comment). `do_mask`/`mask` select an optional
 * reordering of the bytes (dummy, or shuffle with one of the .Lrkr_*
 * masks below).
 * NOTE(review): the do_mask(mask) invocation line (orig. line 199) is
 * elided in this excerpt.
 */
195 #define preload_rkr(n, do_mask, mask) \
196 vbroadcastss .L16_mask, RKR; \
197 /* add 16-bit rotation to key rotations (mod 32) */ \
198 vpxor (kr+n*16)(CTX), RKR, RKR; \
/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): transpose a 4x4 matrix of
 * 32-bit words held across the four xmm registers x0..x3, using t0..t2
 * as scratch. Standard two-stage scheme: interleave dwords, then
 * interleave qwords. After the macro, lane i of each register holds what
 * was register i's lane — i.e. four cipher blocks are converted between
 * block-per-register and word-column-per-register layouts.
 */
201 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
202 vpunpckldq x1, x0, t0; \
203 vpunpckhdq x1, x0, t2; \
204 vpunpckldq x3, x2, t1; \
205 vpunpckhdq x3, x2, x3; \
207 vpunpcklqdq t1, t0, x0; \
208 vpunpckhqdq t1, t0, x1; \
209 vpunpcklqdq x3, t2, x2; \
210 vpunpckhqdq x3, t2, x3;
/*
 * inpack_blocks(x0..x3, t0..t2, rmask): prepare four loaded blocks for
 * the cipher rounds — byte-shuffle each 128-bit block by rmask (the call
 * sites pass RKM loaded from .Lbswap_mask, i.e. an endianness swap of
 * each 32-bit word), then transpose so each register holds one 32-bit
 * word column of all four blocks.
 */
212 #define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
213 vpshufb rmask, x0, x0; \
214 vpshufb rmask, x1, x1; \
215 vpshufb rmask, x2, x2; \
216 vpshufb rmask, x3, x3; \
218 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
/*
 * outunpack_blocks(x0..x3, t0..t2, rmask): inverse of inpack_blocks —
 * transpose the word-column layout back to one block per register, then
 * byte-shuffle each block by rmask to restore the external byte order.
 */
220 #define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
221 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
223 vpshufb rmask, x0, x0; \
224 vpshufb rmask, x1, x1; \
225 vpshufb rmask, x2, x2; \
226 vpshufb rmask, x3, x3;
/* XTS tweak update: GF(2^128) multiply-by-alpha constant (0x87 reduction
 * polynomial byte plus shift-carry selector), consumed by load_xts_8way. */
231 .Lxts_gf128mul_and_shl1_mask:
232 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
/* NOTE(review): the .Lbswap_mask label (referenced by the enc/dec bodies
 * below) appears to be elided before this line — byte order swaps each
 * 32-bit word within the 128-bit lane. */
234 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
/* NOTE(review): the .Lbswap128_mask label (referenced by cast6_ctr_8way)
 * appears to be elided before this line — full 128-bit byte reversal. */
236 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* vpshufb masks reordering the 16 kr[] rotation bytes into the order the
 * rounds consume them; names encode the Q/QBAR round pattern served. */
237 .Lrkr_enc_Q_Q_QBAR_QBAR:
238 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
239 .Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
240 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
/* NOTE(review): the .Lrkr_dec_Q_Q_Q_Q label (referenced by
 * __cast6_dec_blk8) appears to be elided before this line. */
242 .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
243 .Lrkr_dec_Q_Q_QBAR_QBAR:
244 .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
245 .Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
246 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
260 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
262 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
/* Core 8-block encryption: operates entirely on the register file above;
 * the entry label, register saves and the Q/QBAR round invocations are
 * elided in this excerpt. */
/* Load constants: per-word byteswap mask, plus the first-byte and 32
 * masks consumed by get_round_keys (R1ST, R32). */
268 vmovdqa .Lbswap_mask, RKM;
269 vmovd .Lfirst_mask, R1ST;
270 vmovd .L32_mask, R32;
/* Byteswap and transpose both 4-block groups into word-column layout. */
272 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
273 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
/* Stage the three 16-byte rotation-key groups in encryption order; the
 * round macros that consume each group (elided here) run in between. */
275 preload_rkr(0, dummy, none);
280 preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
285 preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
/* RKM was clobbered as the round masking key — reload the byteswap mask
 * and convert the state back to block-per-register external layout. */
294 vmovdqa .Lbswap_mask, RKM;
296 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
297 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
300 ENDPROC(__cast6_enc_blk8)
306 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
308 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
/* Core 8-block decryption: mirror of __cast6_enc_blk8 with the round-key
 * groups consumed in reverse order (2, 1, 0) using the _dec_ shuffle
 * masks. Entry label, register saves and the round invocations are
 * elided in this excerpt. */
314 vmovdqa .Lbswap_mask, RKM;
315 vmovd .Lfirst_mask, R1ST;
316 vmovd .L32_mask, R32;
/* Byteswap and transpose both 4-block groups into word-column layout. */
318 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
319 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
/* Rotation-key groups staged in decryption (reverse) order; the rounds
 * consuming each group are elided here. */
321 preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
326 preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
331 preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
/* Reload byteswap mask (RKM was reused for round keys) and restore
 * block-per-register external layout. */
340 vmovdqa .Lbswap_mask, RKM;
341 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
342 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
345 ENDPROC(__cast6_dec_blk8)
/*
 * cast6_ecb_enc_8way(ctx %rdi, dst %rsi, src %rdx): ECB-encrypt eight
 * 128-bit blocks. NOTE(review): the frame setup, the moves staging ctx
 * and dst (store_8way targets %r11, presumably dst copied from %rsi),
 * and the ret are elided in this excerpt.
 */
347 ENTRY(cast6_ecb_enc_8way)
/* Load 8 plaintext blocks from src into RA1..RD2. */
357 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
359 call __cast6_enc_blk8;
/* Write the 8 encrypted blocks to dst. */
361 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
365 ENDPROC(cast6_ecb_enc_8way)
/*
 * cast6_ecb_dec_8way(ctx %rdi, dst %rsi, src %rdx): ECB-decrypt eight
 * 128-bit blocks. NOTE(review): frame setup, ctx/dst staging moves and
 * the ret are elided in this excerpt (store_8way targets %r11).
 */
367 ENTRY(cast6_ecb_dec_8way)
/* Load 8 ciphertext blocks from src into RA1..RD2. */
377 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
379 call __cast6_dec_blk8;
/* Write the 8 decrypted blocks to dst. */
381 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
385 ENDPROC(cast6_ecb_dec_8way)
/*
 * cast6_cbc_dec_8way(ctx %rdi, dst %rsi, src %rdx): CBC-decrypt eight
 * blocks — block-decrypt, then store_cbc_8way XORs each result with the
 * previous ciphertext block taken from the source pointer (%r12).
 * NOTE(review): frame setup, the %r12 save/staging and the ret are
 * elided in this excerpt.
 */
387 ENTRY(cast6_cbc_dec_8way)
/* Load 8 ciphertext blocks from src. */
400 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
402 call __cast6_dec_blk8;
/* XOR with previous ciphertext blocks (base %r12) and store to dst (%r11). */
404 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
410 ENDPROC(cast6_cbc_dec_8way)
/*
 * cast6_ctr_8way(ctx %rdi, dst %rsi, src %rdx, iv %rcx): CTR mode for
 * eight blocks — builds 8 big-endian counter blocks from the IV
 * (.Lbswap128_mask converts the little-endian 128-bit counter),
 * encrypts them, and XORs against src while storing to dst.
 * NOTE(review): frame setup, register staging (%r11/%r12) and the ret
 * are elided in this excerpt.
 */
412 ENTRY(cast6_ctr_8way)
417 * %rcx: iv (little endian, 128bit)
/* Generate 8 consecutive counter blocks into RA1..RD2, updating the IV. */
426 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
429 call __cast6_enc_blk8;
/* XOR keystream with src (base %r12) and store to dst (%r11). */
431 store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
437 ENDPROC(cast6_ctr_8way)
/*
 * cast6_xts_enc_8way(ctx %rdi, dst %rsi, src %rdx, iv %rcx): XTS-encrypt
 * eight blocks. load_xts_8way derives the 8 per-block tweaks from the
 * incoming tweak via GF(2^128) multiply-by-alpha (mask constant below),
 * XORs them into the loaded blocks and parks them in dst; store_xts_8way
 * re-XORs after encryption. NOTE(review): frame setup, dst staging into
 * %r11 and the ret are elided in this excerpt.
 */
439 ENTRY(cast6_xts_enc_8way)
444 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
450 /* regs <= src, dst <= IVs, regs <= regs xor IVs */
451 load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
452 RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
454 call __cast6_enc_blk8;
456 /* dst <= regs xor IVs(in dst) */
457 store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
461 ENDPROC(cast6_xts_enc_8way)
/*
 * cast6_xts_dec_8way(ctx %rdi, dst %rsi, src %rdx, iv %rcx): XTS-decrypt
 * eight blocks — identical tweak handling to cast6_xts_enc_8way, with
 * the block decryption core in the middle. NOTE(review): frame setup,
 * dst staging into %r11 and the ret are elided in this excerpt.
 */
463 ENTRY(cast6_xts_dec_8way)
468 * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
474 /* regs <= src, dst <= IVs, regs <= regs xor IVs */
475 load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
476 RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);
478 call __cast6_dec_blk8;
480 /* dst <= regs xor IVs(in dst) */
481 store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
485 ENDPROC(cast6_xts_dec_8way)