From 077e719dfbf9bf2582bed80026251cc0d108c16e Mon Sep 17 00:00:00 2001 From: cinap_lenrek Date: Mon, 20 Nov 2017 00:10:35 +0100 Subject: libsec: write optimized _chachablock() function for amd64 / sse2 doing 4 quarterrounds in parallel using 128-bit vector registers. for second round shuffle the columns and then shuffle back. code is rather obvious. only trick here is for the first quarterround PSHUFLW/PSHUFHW is used to swap the halfwords for the <<<16 rotation. --- sys/src/ape/lib/sec/amd64/mkfile | 1 + sys/src/ape/lib/sec/port/mkfile | 2 +- sys/src/libsec/amd64/chachablock.s | 74 ++++++++++++++++++++++++++++++++++++++ sys/src/libsec/amd64/mkfile | 1 + sys/src/libsec/port/chacha.c | 39 +++----------------- sys/src/libsec/port/chachablock.c | 29 +++++++++++++++ sys/src/libsec/port/mkfile | 2 +- 7 files changed, 112 insertions(+), 36 deletions(-) create mode 100644 sys/src/libsec/amd64/chachablock.s create mode 100644 sys/src/libsec/port/chachablock.c diff --git a/sys/src/ape/lib/sec/amd64/mkfile b/sys/src/ape/lib/sec/amd64/mkfile index f11c532a6..a2b3e867a 100644 --- a/sys/src/ape/lib/sec/amd64/mkfile +++ b/sys/src/ape/lib/sec/amd64/mkfile @@ -3,6 +3,7 @@ APE=/sys/src/ape LIB=/$objtype/lib/ape/libsec.a FILES=\ + chachablock\ md5block\ sha1block\ aesni\ diff --git a/sys/src/ape/lib/sec/port/mkfile b/sys/src/ape/lib/sec/port/mkfile index b25156cba..54d96bb8a 100644 --- a/sys/src/ape/lib/sec/port/mkfile +++ b/sys/src/ape/lib/sec/port/mkfile @@ -11,7 +11,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c des3CBC.c\ sha1pickle.c md5pickle.c\ poly1305.c\ rc4.c\ - chacha.c\ + chacha.c chachablock.c\ salsa.c\ genrandom.c prng.c fastrand.c nfastrand.c\ probably_prime.c smallprimetest.c genprime.c dsaprimes.c\ diff --git a/sys/src/libsec/amd64/chachablock.s b/sys/src/libsec/amd64/chachablock.s new file mode 100644 index 000000000..d098c4425 --- /dev/null +++ b/sys/src/libsec/amd64/chachablock.s @@ -0,0 +1,74 @@ +#define ROTATE(n, v1, v2) \ + MOVO v1, v2; \ + PSLLL 
$(n), v1; \ + PSRLL $(32-n), v2; \ + POR v1, v2 + +TEXT _chachablock(SB), 0, $0 + MOVOU 0(RARG), X0 + MOVOU 16(RARG), X1 + MOVOU 32(RARG), X2 + MOVOU 48(RARG), X3 + + MOVL rounds+8(FP), CX + SHRL $1, CX + +_loop: + PADDL X1, X0 + PXOR X0, X3 + /* ROTATE(16, X3, X3) */ + PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3 + PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3 + + PADDL X3, X2 + MOVO X1, X4 + PXOR X2, X4 + ROTATE(12, X4, X1) + + PADDL X1, X0 + MOVO X0, X4 + PXOR X3, X4 + ROTATE(8, X4, X3) + + PADDL X3, X2 + MOVO X1, X4 + PXOR X2, X4 + ROTATE(7, X4, X1) + + PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X1, X1 + PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2 + PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X3, X3 + + PADDL X1, X0 + PXOR X0, X3 + /* ROTATE(16, X3, X3) */ + PSHUFLW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3 + PSHUFHW $(1<<0 | 0<<2 | 3<<4 | 2<<6), X3, X3 + + PADDL X3, X2 + MOVO X1, X4 + PXOR X2, X4 + ROTATE(12, X4, X1) + + PADDL X1, X0 + MOVO X0, X4 + PXOR X3, X4 + ROTATE(8, X4, X3) + + PADDL X3, X2 + MOVO X1, X4 + PXOR X2, X4 + ROTATE(7, X4, X1) + + PSHUFL $(3<<0 | 0<<2 | 1<<4 | 2<<6), X1, X1 + PSHUFL $(2<<0 | 3<<2 | 0<<4 | 1<<6), X2, X2 + PSHUFL $(1<<0 | 2<<2 | 3<<4 | 0<<6), X3, X3 + + DECL CX + JNE _loop + + MOVOU X0, 0(RARG) + MOVOU X1, 16(RARG) + MOVOU X2, 32(RARG) + MOVOU X3, 48(RARG) + RET diff --git a/sys/src/libsec/amd64/mkfile b/sys/src/libsec/amd64/mkfile index 990d35aa4..633fde1fe 100644 --- a/sys/src/libsec/amd64/mkfile +++ b/sys/src/libsec/amd64/mkfile @@ -3,6 +3,7 @@ objtype=amd64 LIB=/$objtype/lib/libsec.a FILES=\ + chachablock\ md5block\ sha1block\ aesni\ diff --git a/sys/src/libsec/port/chacha.c b/sys/src/libsec/port/chacha.c index b885b8b92..9431ca69a 100644 --- a/sys/src/libsec/port/chacha.c +++ b/sys/src/libsec/port/chacha.c @@ -10,26 +10,13 @@ and including the changes to block number and nonce defined in RFC7539 #include "os.h" #include -enum{ - Blockwords= ChachaBsize/sizeof(u32int) -}; +/* from chachablock.$O */ +extern void _chachablock(u32int x[16], int 
rounds); /* little-endian data order */ #define GET4(p) ((p)[0]|((p)[1]<<8)|((p)[2]<<16)|((p)[3]<<24)) #define PUT4(p,v) (p)[0]=(v);(p)[1]=(v)>>8;(p)[2]=(v)>>16;(p)[3]=(v)>>24 -#define ROTATE(v,c) ((u32int)((v) << (c)) | ((v) >> (32 - (c)))) - -#define QUARTERROUND(ia,ib,ic,id) { \ - u32int a, b, c, d, t; \ - a = x[ia]; b = x[ib]; c = x[ic]; d = x[id]; \ - a += b; t = d^a; d = ROTATE(t,16); \ - c += d; t = b^c; b = ROTATE(t,12); \ - a += b; t = d^a; d = ROTATE(t, 8); \ - c += d; t = b^c; b = ROTATE(t, 7); \ - x[ia] = a; x[ib] = b; x[ic] = c; x[id] = d; \ -} - #define ENCRYPT(s, x, y, d) {\ u32int v; \ v = GET4(s); \ @@ -87,22 +74,6 @@ setupChachastate(Chachastate *s, uchar *key, ulong keylen, uchar *iv, ulong ivle chacha_setiv(s, iv); } -static void -dorounds(u32int x[Blockwords], int rounds) -{ - for(; rounds > 0; rounds -= 2) { - QUARTERROUND(0, 4, 8,12) - QUARTERROUND(1, 5, 9,13) - QUARTERROUND(2, 6,10,14) - QUARTERROUND(3, 7,11,15) - - QUARTERROUND(0, 5,10,15) - QUARTERROUND(1, 6,11,12) - QUARTERROUND(2, 7, 8,13) - QUARTERROUND(3, 4, 9,14) - } -} - static void hchachablock(uchar h[32], Chachastate *s) { @@ -125,7 +96,7 @@ hchachablock(uchar h[32], Chachastate *s) x[14] = s->input[14]; x[15] = s->input[15]; - dorounds(x, s->rounds); + _chachablock(x, s->rounds); PUT4(h+0*4, x[0]); PUT4(h+1*4, x[1]); @@ -183,7 +154,7 @@ chacha_setblock(Chachastate *s, u64int blockno) static void encryptblock(Chachastate *s, uchar *src, uchar *dst) { - u32int x[Blockwords]; + u32int x[16]; int i; x[0] = s->input[0]; @@ -202,7 +173,7 @@ encryptblock(Chachastate *s, uchar *src, uchar *dst) x[13] = s->input[13]; x[14] = s->input[14]; x[15] = s->input[15]; - dorounds(x, s->rounds); + _chachablock(x, s->rounds); for(i=0; iinput[i], dst); diff --git a/sys/src/libsec/port/chachablock.c b/sys/src/libsec/port/chachablock.c new file mode 100644 index 000000000..955b48107 --- /dev/null +++ b/sys/src/libsec/port/chachablock.c @@ -0,0 +1,29 @@ +#include "os.h" + +#define ROTATE(v,c) 
((u32int)((v) << (c)) | ((v) >> (32 - (c)))) + +#define QUARTERROUND(ia,ib,ic,id) { \ + u32int a, b, c, d, t; \ + a = x[ia]; b = x[ib]; c = x[ic]; d = x[id]; \ + a += b; t = d^a; d = ROTATE(t,16); \ + c += d; t = b^c; b = ROTATE(t,12); \ + a += b; t = d^a; d = ROTATE(t, 8); \ + c += d; t = b^c; b = ROTATE(t, 7); \ + x[ia] = a; x[ib] = b; x[ic] = c; x[id] = d; \ +} + +void +_chachablock(u32int x[16], int rounds) +{ + for(; rounds > 0; rounds -= 2) { + QUARTERROUND(0, 4, 8,12) + QUARTERROUND(1, 5, 9,13) + QUARTERROUND(2, 6,10,14) + QUARTERROUND(3, 7,11,15) + + QUARTERROUND(0, 5,10,15) + QUARTERROUND(1, 6,11,12) + QUARTERROUND(2, 7, 8,13) + QUARTERROUND(3, 4, 9,14) + } +} diff --git a/sys/src/libsec/port/mkfile b/sys/src/libsec/port/mkfile index e0bc8becb..b6490b7bc 100644 --- a/sys/src/libsec/port/mkfile +++ b/sys/src/libsec/port/mkfile @@ -10,7 +10,7 @@ CFILES = des.c desmodes.c desECB.c desCBC.c des3ECB.c des3CBC.c\ sha1pickle.c md5pickle.c\ poly1305.c\ rc4.c\ - chacha.c\ + chacha.c chachablock.c\ salsa.c\ genrandom.c prng.c fastrand.c nfastrand.c\ probably_prime.c smallprimetest.c genprime.c dsaprimes.c\ -- cgit v1.2.3