crypto: twofish/avx - avoid using temporary stack buffers

author Jussi Kivilinna <jussi.kivilinna@mbnet.fi>

Sat, 20 Oct 2012 12:06:46 +0000 (15:06 +0300)

committer Herbert Xu <herbert@gondor.apana.org.au>

Wed, 24 Oct 2012 13:10:55 +0000 (21:10 +0800)
author Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Sat, 20 Oct 2012 12:06:46 +0000 (15:06 +0300)
committer Herbert Xu <herbert@gondor.apana.org.au>
Wed, 24 Oct 2012 13:10:55 +0000 (21:10 +0800)
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S

index 1585abb13ddec3482c07bcd8fa09d470513814a7..ebac16bfa8302331d50bca8dc760965845a4428f 100644 (file)
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -23,7 +23,16 @@
   *
   */
  
+#include "glue_helper-asm-avx.S"
+
  .file "twofish-avx-x86_64-asm_64.S"
+
+.data
+.align 16
+
+.Lbswap128_mask:
+       .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
  .text
  
  /* structure of crypto context */
@@ -217,69 +226,45 @@
         vpunpcklqdq             x3, t2, x2; \
         vpunpckhqdq             x3, t2, x3;
  
-#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
-       vpxor (0*4*4)(in),      wkey, x0; \
-       vpxor (1*4*4)(in),      wkey, x1; \
-       vpxor (2*4*4)(in),      wkey, x2; \
-       vpxor (3*4*4)(in),      wkey, x3; \
+#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
+       vpxor           x0, wkey, x0; \
+       vpxor           x1, wkey, x1; \
+       vpxor           x2, wkey, x2; \
+       vpxor           x3, wkey, x3; \
         \
         transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
  
-#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
-       transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
-       \
-       vpxor           x0, wkey, x0;     \
-       vmovdqu         x0, (0*4*4)(out); \
-       vpxor           x1, wkey, x1;     \
-       vmovdqu         x1, (1*4*4)(out); \
-       vpxor           x2, wkey, x2;     \
-       vmovdqu         x2, (2*4*4)(out); \
-       vpxor           x3, wkey, x3;     \
-       vmovdqu         x3, (3*4*4)(out);
-
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
+#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
         transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
         \
-       vpxor           x0, wkey, x0;         \
-       vpxor           (0*4*4)(out), x0, x0; \
-       vmovdqu         x0, (0*4*4)(out);     \
-       vpxor           x1, wkey, x1;         \
-       vpxor           (1*4*4)(out), x1, x1; \
-       vmovdqu         x1, (1*4*4)(out);     \
-       vpxor           x2, wkey, x2;         \
-       vpxor           (2*4*4)(out), x2, x2; \
-       vmovdqu         x2, (2*4*4)(out);     \
-       vpxor           x3, wkey, x3;         \
-       vpxor           (3*4*4)(out), x3, x3; \
-       vmovdqu         x3, (3*4*4)(out);
+       vpxor           x0, wkey, x0; \
+       vpxor           x1, wkey, x1; \
+       vpxor           x2, wkey, x2; \
+       vpxor           x3, wkey, x3;
  
  .align 8
-.global __twofish_enc_blk_8way
-.type   __twofish_enc_blk_8way,@function;
+.type  __twofish_enc_blk8,@function;
  
-__twofish_enc_blk_8way:
+__twofish_enc_blk8:
         /* input:
          *      %rdi: ctx, CTX
-        *      %rsi: dst
-        *      %rdx: src
-        *      %rcx: bool, if true: xor output
+        *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
+        * output:
+        *      RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
          */
  
+       vmovdqu w(CTX), RK1;
+
         pushq %rbp;
         pushq %rbx;
         pushq %rcx;
  
-       vmovdqu w(CTX), RK1;
-
-       leaq (4*4*4)(%rdx), %rax;
-       inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+       inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
         preload_rgi(RA1);
         rotate_1l(RD1);
-       inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+       inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
         rotate_1l(RD2);
  
-       movq %rsi, %r11;
-
         encrypt_cycle(0);
         encrypt_cycle(1);
         encrypt_cycle(2);
@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
         popq %rbx;
         popq %rbp;
  
-       leaq (4*4*4)(%r11), %rax;
-
-       testb %cl, %cl;
-       jnz __enc_xor8;
-
-       outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
-       outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
-
-       ret;
-
-__enc_xor8:
-       outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
-       outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+       outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+       outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
  
         ret;
  
  .align 8
-.global twofish_dec_blk_8way
-.type   twofish_dec_blk_8way,@function;
+.type  __twofish_dec_blk8,@function;
  
-twofish_dec_blk_8way:
+__twofish_dec_blk8:
         /* input:
          *      %rdi: ctx, CTX
-        *      %rsi: dst
-        *      %rdx: src
+        *      RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
+        * output:
+        *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
          */
  
+       vmovdqu (w+4*4)(CTX), RK1;
+
         pushq %rbp;
         pushq %rbx;
  
-       vmovdqu (w+4*4)(CTX), RK1;
-
-       leaq (4*4*4)(%rdx), %rax;
-       inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+       inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
         preload_rgi(RC1);
         rotate_1l(RA1);
-       inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
+       inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
         rotate_1l(RA2);
  
-       movq %rsi, %r11;
-
         decrypt_cycle(7);
         decrypt_cycle(6);
         decrypt_cycle(5);
@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
         popq %rbx;
         popq %rbp;
  
-       leaq (4*4*4)(%r11), %rax;
-       outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
-       outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+       outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+       outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
+
+       ret;
+
+.align 8
+.global twofish_ecb_enc_8way
+.type   twofish_ecb_enc_8way,@function;
+
+twofish_ecb_enc_8way:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *
+        * ECB encryption entry point for eight blocks.  The blocks are
+        * fetched from src with load_8way into RA1, RB1, RC1, RD1, RA2, RB2,
+        * RC2, RD2, processed in registers by __twofish_enc_blk8, and then
+        * written to dst with store_8way.
+        *
+        * dst is copied from %rsi to %r11 before the call and store_8way is
+        * given %r11 afterwards.  The registers are handed to store_8way in
+        * the order RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2, which is the
+        * output order documented for __twofish_enc_blk8.
+        *
+        * NOTE(review): load_8way and store_8way are not defined in this
+        * file (presumably glue_helper-asm-avx.S); confirm their exact
+        * load/store behaviour there.
+        */
+
+       movq %rsi, %r11;
+
+       load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+       call __twofish_enc_blk8;
+
+       store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+       ret;
+
+.align 8
+.global twofish_ecb_dec_8way
+.type   twofish_ecb_dec_8way,@function;
+
+twofish_ecb_dec_8way:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *
+        * ECB decryption entry point for eight blocks.  The blocks are
+        * fetched from src with load_8way into RC1, RD1, RA1, RB1, RC2, RD2,
+        * RA2, RB2, which is the input order documented for
+        * __twofish_dec_blk8, processed in registers by that routine, and
+        * then written to dst with store_8way from RA1, RB1, RC1, RD1, RA2,
+        * RB2, RC2, RD2 (its documented output order).
+        *
+        * dst is copied from %rsi to %r11 before the call and store_8way is
+        * given %r11 afterwards.
+        *
+        * NOTE(review): load_8way and store_8way are not defined in this
+        * file (presumably glue_helper-asm-avx.S); confirm their exact
+        * load/store behaviour there.
+        */
+
+       movq %rsi, %r11;
+
+       load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+       call __twofish_dec_blk8;
+
+       store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+       ret;
+
+.align 8
+.global twofish_cbc_dec_8way
+.type   twofish_cbc_dec_8way,@function;
+
+twofish_cbc_dec_8way:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *
+        * CBC decryption entry point for eight blocks.  The blocks are
+        * fetched from src with load_8way in the input order documented for
+        * __twofish_dec_blk8, decrypted in registers by that routine, and
+        * then passed to store_cbc_8way together with both the src and dst
+        * pointers.
+        *
+        * dst is copied to %r11 and src to %r12 before the call; the store
+        * macro is given those copies.  %r12 is pushed on entry and popped
+        * before returning, so its incoming value is restored for the caller.
+        *
+        * NOTE(review): store_cbc_8way is not defined in this file.  It
+        * presumably XORs each decrypted block with the preceding ciphertext
+        * block read from src before storing to dst, replacing the C helper
+        * that kept copies of src blocks in a temporary array; confirm in
+        * glue_helper-asm-avx.S, including behaviour when dst == src.
+        */
+
+       pushq %r12;
+
+       movq %rsi, %r11;
+       movq %rdx, %r12;
+
+       load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+       call __twofish_dec_blk8;
+
+       store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+
+       popq %r12;
+
+       ret;
+
+.align 8
+.global twofish_ctr_8way
+.type   twofish_ctr_8way,@function;
+
+twofish_ctr_8way:
+       /* input:
+        *      %rdi: ctx, CTX
+        *      %rsi: dst
+        *      %rdx: src
+        *      %rcx: iv (little endian, 128bit)
+        *
+        * CTR mode entry point for eight blocks.  load_ctr_8way is given the
+        * iv pointer and .Lbswap128_mask and fills RA1, RB1, RC1, RD1, RA2,
+        * RB2, RC2, RD2, with RX0, RX1 and RY0 passed as additional
+        * registers.  Those eight registers are then run through
+        * __twofish_enc_blk8, and store_ctr_8way is called with the src and
+        * dst pointers and the result registers in the output order
+        * documented for __twofish_enc_blk8.
+        *
+        * dst is copied to %r11 and src to %r12 before the call; the store
+        * macro is given those copies.  %r12 is pushed on entry and popped
+        * before returning, so its incoming value is restored for the caller.
+        *
+        * NOTE(review): load_ctr_8way and store_ctr_8way are not defined in
+        * this file.  Presumably the former builds eight byte-swapped counter
+        * blocks and advances the iv in memory, and the latter XORs the
+        * encrypted counters with src before storing to dst, as the removed
+        * C helper did; confirm in glue_helper-asm-avx.S.
+        */
+
+       pushq %r12;
+
+       movq %rsi, %r11;
+       movq %rdx, %r12;
+
+       load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
+                     RD2, RX0, RX1, RY0);
+
+       call __twofish_enc_blk8;
+
+       store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+
+       popq %r12;
  
         ret;
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c

index 810e45d5118695b70e52dee7a61bfb0599c976e2..94ac91d26e47e3b4ef415df06115d1c490b9f6a9 100644 (file)
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -45,66 +45,23 @@
  
  #define TWOFISH_PARALLEL_BLOCKS 8
  
-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
-                                       const u8 *src)
-{
-       __twofish_enc_blk_3way(ctx, dst, src, false);
-}
-
  /* 8-way parallel cipher functions */
-asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst,
-                                      const u8 *src, bool xor);
-asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst,
+asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
+                                    const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
                                      const u8 *src);
  
-static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst,
-                                       const u8 *src)
-{
-       __twofish_enc_blk_8way(ctx, dst, src, false);
-}
-
-static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
-                                           const u8 *src)
-{
-       __twofish_enc_blk_8way(ctx, dst, src, true);
-}
+asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
+                                    const u8 *src);
+asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
+                                const u8 *src, le128 *iv);
  
-static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst,
+static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
                                         const u8 *src)
  {
-       twofish_dec_blk_8way(ctx, dst, src);
-}
-
-static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
-{
-       u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
-       unsigned int j;
-
-       for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
-               ivs[j] = src[j];
-
-       twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
-
-       for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
-               u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
+       __twofish_enc_blk_3way(ctx, dst, src, false);
  }
  
-static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
-                                    le128 *iv)
-{
-       be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
-       unsigned int i;
-
-       for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
-               if (dst != src)
-                       dst[i] = src[i];
-
-               le128_to_be128(&ctrblks[i], iv);
-               le128_inc(iv);
-       }
-
-       twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
-}
  
  static const struct common_glue_ctx twofish_enc = {
         .num_funcs = 3,
@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {
  
         .funcs = { {
                 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
-               .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) }
+               .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
         }, {
                 .num_blocks = 3,
                 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {
  
         .funcs = { {
                 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
-               .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) }
+               .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
         }, {
                 .num_blocks = 3,
                 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {
  
         .funcs = { {
                 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
-               .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) }
+               .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
         }, {
                 .num_blocks = 3,
                 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {
  
         .funcs = { {
                 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
-               .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) }
+               .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
         }, {
                 .num_blocks = 3,
                 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
         ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
  
         if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
-               twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst);
+               twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
                 return;
         }
  
@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
         ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
  
         if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
-               twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst);
+               twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
                 return;
         }
author	Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
	Sat, 20 Oct 2012 12:06:46 +0000 (15:06 +0300)
committer	Herbert Xu <herbert@gondor.apana.org.au>
	Wed, 24 Oct 2012 13:10:55 +0000 (21:10 +0800)
arch/x86/crypto/twofish-avx-x86_64-asm_64.S		patch \| blob \| history
arch/x86/crypto/twofish_avx_glue.c		patch \| blob \| history