2 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
4 * This is AES128/192/256 CTR mode optimization implementation. It requires
5 * the support of Intel(R) AESNI and AVX instructions.
7 * This work was inspired by the AES CTR mode optimization published
8 * in Intel Optimized IPSEC Cryptographic library.
9 * Additional information on it can be found at:
10 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
12 * This file is provided under a dual BSD/GPLv2 license. When using or
13 * redistributing this file, you may do so under either license.
17 * Copyright(c) 2014 Intel Corporation.
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of version 2 of the GNU General Public License as
21 * published by the Free Software Foundation.
23 * This program is distributed in the hope that it will be useful, but
24 * WITHOUT ANY WARRANTY; without even the implied warranty of
25 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26 * General Public License for more details.
28 * Contact Information:
29 * James Guilford <james.guilford@intel.com>
30 * Sean Gulley <sean.m.gulley@intel.com>
31 * Chandramouli Narayanan <mouli@linux.intel.com>
35 * Copyright(c) 2014 Intel Corporation.
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
41 * Redistributions of source code must retain the above copyright
42 * notice, this list of conditions and the following disclaimer.
43 * Redistributions in binary form must reproduce the above copyright
44 * notice, this list of conditions and the following disclaimer in
45 * the documentation and/or other materials provided with the
47 * Neither the name of Intel Corporation nor the names of its
48 * contributors may be used to endorse or promote products derived
49 * from this software without specific prior written permission.
51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
65 #include <linux/linkage.h>
/* CONCAT pastes its two arguments into a single assembler token */
68 #define CONCAT(a,b) a##b
/* unaligned SIMD load/store — in/out buffers carry no alignment guarantee */
69 #define VMOVDQ vmovdqu
/* xmm8: the running CTR counter block (big-endian form after byteswap) */
79 #define xcounter %xmm8
/* xmm9: shuffle mask used to byte-reverse each 128-bit lane */
80 #define xbyteswap %xmm9
/* name builders: DDQ(i) -> ddq_add_<i> constant; XMM(i) -> %xmm<i> register */
95 #define DDQ(i) CONCAT(ddq_add_,i)
96 #define XMM(i) CONCAT(%xmm, i)
/* byte-reversal shuffle mask for vpshufb (presumably byteswap_const —
 * label lines are elided in this excerpt; TODO confirm against full file) */
107 .octa 0x000102030405060708090A0B0C0D0E0F
/* mask selecting the low 64 bits of the 128-bit counter (ddq_low_msk use) */
109 .octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* adds 1 to the high 64-bit half when the low half wraps (ddq_high_add_1 use) */
111 .octa 0x00000000000000010000000000000000
/* per-block counter increments 1..8 for the by8 scheme: block i of a batch
 * derives its counter by adding <i> to the base counter */
113 .octa 0x00000000000000000000000000000001
115 .octa 0x00000000000000000000000000000002
117 .octa 0x00000000000000000000000000000003
119 .octa 0x00000000000000000000000000000004
121 .octa 0x00000000000000000000000000000005
123 .octa 0x00000000000000000000000000000006
125 .octa 0x00000000000000000000000000000007
127 .octa 0x00000000000000000000000000000008
131 /* generate a unique variable for ddq_add_x */
134 var_ddq_add = DDQ(\n)
137 /* generate a unique variable for xmm register */
142 /* club the numeric 'id' to the symbol 'name' */
146 .if \name == DDQ_DATA
148 .elseif \name == XDATA
155 * do_aes num_in_par load_keys key_len
156 * This increments p_in, but not p_out
/*
 * do_aes b, k, key_len — encrypt/decrypt <b> CTR blocks in parallel.
 *   b       : number of blocks handled per expansion (1..8)
 *   k       : 1 = load round keys from p_keys, 0 = reuse preloaded xkey regs
 *   key_len : KEY_128 / KEY_192 / KEY_256 selects the round count
 * Increments p_in but not p_out (caller advances p_out).
 * NOTE(review): this excerpt elides the .else/.endif/.endm directives and
 * the per-register repetition lines; do not assemble it as-is.
 */
158 .macro do_aes b, k, key_len
/* round-0 key */
164 vmovdqa 0*16(p_keys), xkey0
/* block 0 counter: convert running counter to big-endian lane order */
167 vpshufb xbyteswap, xcounter, xdata0
/* blocks 1..b-1: base counter + per-block increment (ddq_add_<i>) */
173 vpaddq var_ddq_add(%rip), xcounter, var_xdata
/* detect low-64-bit wraparound; on carry, propagate into the high half */
174 vptest ddq_low_msk(%rip), var_xdata
176 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata
177 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
/* convert the derived counters to big-endian lane order */
179 vpshufb xbyteswap, var_xdata, var_xdata
183 vmovdqa 1*16(p_keys), xkeyA
/* AddRoundKey (round 0) for block 0 */
185 vpxor xkey0, xdata0, xdata0
/* advance the running counter by b; same wrap handling as above */
187 vpaddq var_ddq_add(%rip), xcounter, xcounter
188 vptest ddq_low_msk(%rip), xcounter
190 vpaddq ddq_high_add_1(%rip), xcounter, xcounter
/* AddRoundKey (round 0) for the remaining blocks */
196 vpxor xkey0, var_xdata, var_xdata
200 vmovdqa 2*16(p_keys), xkeyB
205 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */
/* xkeyA/xkeyB are transient key slots; xkey4/xkey8/xkey12 are kept live
 * across iterations when k==0 (preloaded by the caller) */
209 .if (klen == KEY_128)
211 vmovdqa 3*16(p_keys), xkeyA
214 vmovdqa 3*16(p_keys), xkeyA
220 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */
226 .if (klen == KEY_128)
227 vmovdqa 4*16(p_keys), xkey4
230 vmovdqa 4*16(p_keys), xkey4
237 vaesenc xkeyA, var_xdata, var_xdata /* key 3 */
241 vmovdqa 5*16(p_keys), xkeyA
246 vaesenc xkey4, var_xdata, var_xdata /* key 4 */
250 .if (klen == KEY_128)
252 vmovdqa 6*16(p_keys), xkeyB
255 vmovdqa 6*16(p_keys), xkeyB
261 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */
265 vmovdqa 7*16(p_keys), xkeyA
270 vaesenc xkeyB, var_xdata, var_xdata /* key 6 */
274 .if (klen == KEY_128)
275 vmovdqa 8*16(p_keys), xkey8
278 vmovdqa 8*16(p_keys), xkey8
285 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */
289 .if (klen == KEY_128)
291 vmovdqa 9*16(p_keys), xkeyA
294 vmovdqa 9*16(p_keys), xkeyA
300 vaesenc xkey8, var_xdata, var_xdata /* key 8 */
304 vmovdqa 10*16(p_keys), xkeyB
309 vaesenc xkeyA, var_xdata, var_xdata /* key 9 */
/* AES-192/256 continue beyond 10 rounds; AES-128 finishes at key 10 */
313 .if (klen != KEY_128)
314 vmovdqa 11*16(p_keys), xkeyA
321 .if (klen == KEY_128)
/* final round for AES-128 (key 10) */
322 vaesenclast xkeyB, var_xdata, var_xdata
324 vaesenc xkeyB, var_xdata, var_xdata
329 .if (klen != KEY_128)
331 vmovdqa 12*16(p_keys), xkey12
337 vaesenc xkeyA, var_xdata, var_xdata /* key 11 */
341 .if (klen == KEY_256)
342 vmovdqa 13*16(p_keys), xkeyA
348 .if (klen == KEY_256)
350 vaesenc xkey12, var_xdata, var_xdata
/* final round for AES-192 (key 12) */
352 vaesenclast xkey12, var_xdata, var_xdata
357 .if (klen == KEY_256)
358 vmovdqa 14*16(p_keys), xkeyB
364 vaesenc xkeyA, var_xdata, var_xdata
/* final round for AES-256 (key 14) */
372 vaesenclast xkeyB, var_xdata, var_xdata
/* CTR combine: load input blocks (pairwise, reusing xkeyA/xkeyB as
 * scratch) and xor with the encrypted counters */
381 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
382 VMOVDQ (j*16 - 16*by)(p_in), xkeyB
384 vpxor xkeyA, var_xdata, var_xdata
386 vpxor xkeyB, var_xdata, var_xdata
/* odd trailing block when b is odd */
391 VMOVDQ (i*16 - 16*by)(p_in), xkeyA
393 vpxor xkeyA, var_xdata, var_xdata
/* store the b output blocks; p_out is advanced by the caller */
399 VMOVDQ var_xdata, i*16(p_out)
/* do_aes with round-key loading enabled (k=1): used for remainder blocks
 * before the preloaded 8-block main loop.
 * NOTE(review): the closing .endm lines are elided in this excerpt. */
404 .macro do_aes_load val, key_len
405 do_aes \val, 1, \key_len
/* do_aes with round-key loading disabled (k=0): xkey0/4/8/12 must already
 * hold the relevant round keys (see .Lmult_of_8_blks preload) */
408 .macro do_aes_noload val, key_len
409 do_aes \val, 0, \key_len
412 /* main body of aes ctr load */
/*
 * do_aes_ctrmain key_len — full CTR driver expanded per key size.
 * Flow: bail out if num_bytes < 16; load IV and byteswap it into xcounter;
 * handle the (num_bytes mod 8 blocks) remainder with one do_aes_load of
 * size 1..7; then process the remaining multiple-of-8-blocks bulk in
 * .Lmain_loop2 with preloaded keys; finally write the updated IV back.
 * NOTE(review): the cmp/and dispatch lines and jump table between the
 * remainder cases are elided in this excerpt.
 */
414 .macro do_aes_ctrmain key_len
/* less than one full block: nothing to do */
417 jb .Ldo_return2\key_len
419 vmovdqa byteswap_const(%rip), xbyteswap
/* load IV and convert to little-endian counter representation */
420 vmovdqu (p_iv), xcounter
421 vpshufb xbyteswap, xcounter, xcounter
/* no remainder blocks: go straight to the bulk loop */
425 jz .Lmult_of_8_blks\key_len
/* remainder cases: encrypt 1..7 blocks, round num_bytes down to a
 * multiple of 8 blocks (128 bytes), then fall into the bulk loop */
438 do_aes_load 1, \key_len
440 and $(~7*16), num_bytes
441 jz .Ldo_return2\key_len
442 jmp .Lmain_loop2\key_len
445 do_aes_load 2, \key_len
447 and $(~7*16), num_bytes
448 jz .Ldo_return2\key_len
449 jmp .Lmain_loop2\key_len
453 do_aes_load 3, \key_len
455 and $(~7*16), num_bytes
456 jz .Ldo_return2\key_len
457 jmp .Lmain_loop2\key_len
460 do_aes_load 4, \key_len
462 and $(~7*16), num_bytes
463 jz .Ldo_return2\key_len
464 jmp .Lmain_loop2\key_len
472 do_aes_load 5, \key_len
474 and $(~7*16), num_bytes
475 jz .Ldo_return2\key_len
476 jmp .Lmain_loop2\key_len
479 do_aes_load 6, \key_len
481 and $(~7*16), num_bytes
482 jz .Ldo_return2\key_len
483 jmp .Lmain_loop2\key_len
486 do_aes_load 7, \key_len
488 and $(~7*16), num_bytes
489 jz .Ldo_return2\key_len
490 jmp .Lmain_loop2\key_len
492 .Lmult_of_8_blks\key_len:
/* preload the round keys that do_aes_noload keeps resident; the slots
 * differ by key size because round counts differ (10/12/14 rounds) */
493 .if (\key_len != KEY_128)
494 vmovdqa 0*16(p_keys), xkey0
495 vmovdqa 4*16(p_keys), xkey4
496 vmovdqa 8*16(p_keys), xkey8
497 vmovdqa 12*16(p_keys), xkey12
499 vmovdqa 0*16(p_keys), xkey0
500 vmovdqa 3*16(p_keys), xkey4
501 vmovdqa 6*16(p_keys), xkey8
502 vmovdqa 9*16(p_keys), xkey12
505 .Lmain_loop2\key_len:
506 /* num_bytes is a multiple of 8*16 bytes (8 blocks) and > 0 here */
507 do_aes_noload 8, \key_len
509 sub $(8*16), num_bytes
510 jne .Lmain_loop2\key_len
512 .Ldo_return2\key_len:
513 /* return updated IV: convert back to big-endian and store to *p_iv */
514 vpshufb xbyteswap, xcounter, xcounter
515 vmovdqu xcounter, (p_iv)
520 * routine to do AES128 CTR enc/decrypt "by8"
521 * XMM registers are clobbered.
522 * Saving/restoring must be done at a higher level
523 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
524 * unsigned int num_bytes)
526 ENTRY(aes_ctr_enc_128_avx_by8)
/* AES-128 (10 rounds): expand the shared CTR driver with KEY_128 */
527 /* call the aes main loop */
528 do_aes_ctrmain KEY_128
530 ENDPROC(aes_ctr_enc_128_avx_by8)
533 * routine to do AES192 CTR enc/decrypt "by8"
534 * XMM registers are clobbered.
535 * Saving/restoring must be done at a higher level
536 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
537 * unsigned int num_bytes)
539 ENTRY(aes_ctr_enc_192_avx_by8)
/* AES-192 (12 rounds): expand the shared CTR driver with KEY_192 */
540 /* call the aes main loop */
541 do_aes_ctrmain KEY_192
543 ENDPROC(aes_ctr_enc_192_avx_by8)
546 * routine to do AES256 CTR enc/decrypt "by8"
547 * XMM registers are clobbered.
548 * Saving/restoring must be done at a higher level
549 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
550 * unsigned int num_bytes)
552 ENTRY(aes_ctr_enc_256_avx_by8)
/* AES-256 (14 rounds): expand the shared CTR driver with KEY_256 */
553 /* call the aes main loop */
554 do_aes_ctrmain KEY_256
556 ENDPROC(aes_ctr_enc_256_avx_by8)