| 1 | /* KECCAK permutation and sponge construction |
| 2 | * J. Welsh, May 2019 |
| 3 | * |
| 4 | * Based on Guido Bertoni, Joan Daemen, Michael Peeters and Gilles van Assche |
| 5 | * (2011): The KECCAK reference, Version 3.0. |
| 6 | */ |
| 7 | |
| 8 | #include <stdint.h> |
| 9 | #include "io.h" |
| 10 | |
| 11 | /* State is arranged in a 5 x 5 x W bit array, indexed as (x,y,z). |
| 12 | * The 1D section for a given (x,y) is called a lane and packed in a lane_t. |
| 13 | * The 1D section for a given (x,z) is called a column. |
| 14 | * The 2D section for a given z is called a slice. |
| 15 | */ |
| 16 | |
| 17 | typedef uint64_t lane_t; |
| 18 | #define L 6 |
| 19 | #define W (1 << L) /* lane size = 64 bits */ |
| 20 | #define B 25*W /* permutation width = 1600 bits */ |
| 21 | /* we require L be at least 3 (W=8, B=200) to avoid sub-byte lane size */ |
| 22 | |
| 23 | #define ROUNDS (12 + 2*L) |
| 24 | |
| 25 | #define mask(x) (((unsigned) (x)) & (W-1)) |
| 26 | #define rot(v,r) (v = v<<mask(r) | v>>mask(W-mask(r))) |
| 27 | |
| 28 | static lane_t state[5][5]; |
| 29 | /* physical storage order can be swapped as a performance tweak */ |
| 30 | #define a(x,y) (state[x][y]) |
| 31 | |
| 32 | /* Galois linear feedback shift register, output in low bit: |
| 33 | * (x^t mod x^8 + x^6 + x^5 + x^4 + 1) mod x in GF(2)(x) |
| 34 | * (I'd forgotten my grade-school algebra but the Handbook of Applied |
| 35 | * Cryptography plus http://archive.is/hhvKKJ (nayuki.io) and some written |
| 36 | * exercises cleared this up for me.) */ |
| 37 | static uint8_t lfsr_step(uint8_t rstate) { |
| 38 | /* polynomial xor'd in when high (overflowing) bit is set */ |
| 39 | uint8_t gate = ((int8_t) rstate) >> 7; |
| 40 | return (rstate << 1) ^ (gate & 0x71); /* bits 6,5,4,0 */ |
| 41 | } |
| 42 | |
| 43 | /* Column parity diffusion */ |
| 44 | static void theta(void) { |
| 45 | /* LUT for x+1 mod 5 (not used on secret bits!) */ |
| 46 | uint8_t const inc_mod5[8] = {1,2,3,4,0,1,2,3}; |
| 47 | unsigned x, y; |
| 48 | lane_t c[5], d; |
| 49 | |
| 50 | for (x=0; x<5; ++x) |
| 51 | c[x] = a(x,0) ^ a(x,1) ^ a(x,2) ^ a(x,3) ^ a(x,4); |
| 52 | |
| 53 | for (x=0; x<5; ++x) { |
| 54 | d = c[inc_mod5[x]]; |
| 55 | rot(d,1); |
| 56 | d ^= c[inc_mod5[x+3]]; /* x+3+1 == x-1 mod 5 */ |
| 57 | for (y=0; y<5; ++y) |
| 58 | a(x,y) ^= d; |
| 59 | } |
| 60 | } |
| 61 | |
| 62 | /* Inter-slice dispersion */ |
| 63 | static void rho(void) { |
| 64 | /* precomputed offsets from Table 2.1 */ |
| 65 | rot(a(0,1), 36); |
| 66 | rot(a(0,2), 3); |
| 67 | rot(a(0,3), 105); |
| 68 | rot(a(0,4), 210); |
| 69 | |
| 70 | rot(a(1,0), 1); |
| 71 | rot(a(1,1), 300); |
| 72 | rot(a(1,2), 10); |
| 73 | rot(a(1,3), 45); |
| 74 | rot(a(1,4), 66); |
| 75 | |
| 76 | rot(a(2,0), 190); |
| 77 | rot(a(2,1), 6); |
| 78 | rot(a(2,2), 171); |
| 79 | rot(a(2,3), 15); |
| 80 | rot(a(2,4), 253); |
| 81 | |
| 82 | rot(a(3,0), 28); |
| 83 | rot(a(3,1), 55); |
| 84 | rot(a(3,2), 153); |
| 85 | rot(a(3,3), 21); |
| 86 | rot(a(3,4), 120); |
| 87 | |
| 88 | rot(a(4,0), 91); |
| 89 | rot(a(4,1), 276); |
| 90 | rot(a(4,2), 231); |
| 91 | rot(a(4,3), 136); |
| 92 | rot(a(4,4), 78); |
| 93 | } |
| 94 | |
| 95 | /* Slicewise transposition by matrix |
| 96 | * [ 0 1 ] |
| 97 | * [ 2 3 ] |
| 98 | * For all indices x, y, z: |
| 99 | * a'(y, 2x+3y mod 5, z) = a(x, y, z) |
| 100 | */ |
| 101 | static void pi(void) { |
| 102 | /* shuffle in-place by swapping alternating temporaries with successive |
| 103 | * destinations */ |
| 104 | lane_t c, d; |
| 105 | #define even(x,y) (c=a(x,y), a(x,y)=d) |
| 106 | #define odd(x,y) (d=a(x,y), a(x,y)=c) |
| 107 | /* a[0][0] doesn't move */ |
| 108 | d=a(0,1); /* 2*0 + 3*1 = 3 -> 3 */ |
| 109 | even(1,3); /* 2+9=11 -> 1 */ |
| 110 | odd (3,1); /* 6+3=9 -> 4 */ |
| 111 | even(1,4); /* 2+12=14 */ |
| 112 | odd (4,4); /* 8+12=20 */ |
| 113 | even(4,0); /* 8+0=8 */ |
| 114 | odd (0,3); /* 0+9=9 */ |
| 115 | even(3,4); /* 6+12=18 */ |
| 116 | odd (4,3); /* 8+9=17 */ |
| 117 | even(3,2); /* 6+6=12 */ |
| 118 | odd (2,2); /* 4+6=10 */ |
| 119 | even(2,0); /* 4+0=4 */ |
| 120 | odd (0,4); /* 0+12=12 */ |
| 121 | even(4,2); /* 8+6=14 */ |
| 122 | odd (2,4); /* 4+12=16 */ |
| 123 | even(4,1); /* 8+3=11 */ |
| 124 | odd (1,1); /* 2+3=5 */ |
| 125 | even(1,0); /* 2+0=2 */ |
| 126 | odd (0,2); /* 0+6=6 */ |
| 127 | even(2,1); /* 4+3=7 */ |
| 128 | odd (1,2); /* 2+6=8 */ |
| 129 | even(2,3); /* 4+9=13 */ |
| 130 | odd (3,3); /* 6+9=15 */ |
| 131 | even(3,0); /* 6+0=6 */ |
| 132 | a(0,1)=c; /* And we're back home in 24 steps, lovely! */ |
| 133 | } |
| 134 | |
| 135 | /* Nonlinear row mapping */ |
| 136 | static void chi(void) { |
| 137 | unsigned y; |
| 138 | for (y=0; y<5; ++y) { |
| 139 | lane_t c0=a(0,y), c1=a(1,y), c2=a(2,y), c3=a(3,y), c4=a(4,y); |
| 140 | |
| 141 | a(0,y) = c0 ^ (~c1 & c2); |
| 142 | a(1,y) = c1 ^ (~c2 & c3); |
| 143 | a(2,y) = c2 ^ (~c3 & c4); |
| 144 | a(3,y) = c3 ^ (~c4 & c0); |
| 145 | a(4,y) = c4 ^ (~c0 & c1); |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | /* Round constants for disrupting z symmetry |
| 150 | * |
| 151 | * Mix LFSR bitstream sequentially into first lane with z index given by |
| 152 | * 2^j - 1 |
| 153 | * LFSR advances 7 steps per round regardless of how many "j" are needed to |
| 154 | * fill the lane, corresponding to the maximum 64-bit lane size. |
| 155 | */ |
| 156 | static uint8_t iota(uint8_t rstate) { |
| 157 | unsigned j; |
| 158 | for (j=0; j<=L; ++j) { |
| 159 | a(0,0) ^= ((lane_t) (rstate & 1)) << ((1 << j) - 1); |
| 160 | rstate = lfsr_step(rstate); |
| 161 | } |
| 162 | for (; j<7; ++j) |
| 163 | rstate = lfsr_step(rstate); |
| 164 | return rstate; |
| 165 | } |
| 166 | |
| 167 | #ifdef TEST |
| 168 | |
| 169 | #include <stdio.h> |
| 170 | #include "testvectors.h" |
| 171 | |
| 172 | static int test_fail; |
| 173 | static unsigned test, test_step; |
| 174 | |
| 175 | static void checkpoint(unsigned round, char const *step) { |
| 176 | int fail = 0; |
| 177 | unsigned x, y, i=0; |
| 178 | printf("test=%u round=%u %s ", test, round, step); |
| 179 | for (y=0; y<5; ++y) |
| 180 | for (x=0; x<5; ++x, ++i) { |
| 181 | assert(i<25); |
| 182 | assert(test_step<TEST_STEPS); |
| 183 | if (a(x,y) != tests[test][test_step][i]) { |
| 184 | test_fail = fail = 1; |
| 185 | printf("FAIL x=%u y=%u (0x%lx != 0x%lx)\n", |
| 186 | x, y, a(x,y), |
| 187 | tests[test][test_step][i]); |
| 188 | } |
| 189 | } |
| 190 | if (!fail) puts("PASS"); |
| 191 | ++test_step; |
| 192 | } |
| 193 | #else |
| 194 | #define checkpoint(r,s) |
| 195 | #endif |
| 196 | |
| 197 | static void keccakf(void) { |
| 198 | unsigned round; |
| 199 | uint8_t rstate=1; |
| 200 | for (round=0; round<ROUNDS; ++round) { |
| 201 | theta(); checkpoint(round,"theta"); |
| 202 | rho(); checkpoint(round,"rho"); |
| 203 | pi(); checkpoint(round,"pi"); |
| 204 | chi(); checkpoint(round,"chi"); |
| 205 | rstate = iota(rstate); checkpoint(round,"iota"); |
| 206 | } |
| 207 | } |
| 208 | |
| 209 | static void reset(void) { |
| 210 | unsigned x, y; |
| 211 | for (x=0; x<5; ++x) |
| 212 | for (y=0; y<5; ++y) |
| 213 | a(x,y) = 0; |
| 214 | } |
| 215 | |
| 216 | static void load(uint8_t const *block, unsigned len) { |
| 217 | /* Keeping bytes in lane little-endian, independent of machine. This |
| 218 | * harmonizes with (rumored) Keccak convention for interpretation of |
| 219 | * octet stream as bitstream. For a big-endian interpretation we could |
| 220 | * flip the byte load/store order and the direction of rotations. |
| 221 | * |
| 222 | * Ordering of lanes is x-major due to the specified mapping of |
| 223 | * bitstring to state coordinates: s[w(5y+x)+z] = a[x][y][z] */ |
| 224 | unsigned x, y, z, i=0; |
| 225 | for (y=0; y<5; ++y) |
| 226 | for (x=0; x<5; ++x) |
| 227 | for (z=0; z!=W; z+=8, ++i) |
| 228 | if (i==len) return; |
| 229 | else a(x,y) ^= ((lane_t) block[i]) << z; |
| 230 | } |
| 231 | |
| 232 | static void store_hex(char *buf, unsigned block_len) { |
| 233 | static char const hex[] = "0123456789abcdef"; |
| 234 | unsigned x, y, z, len = 2*block_len, i = 0; |
| 235 | for (y=0; y<5; ++y) |
| 236 | for (x=0; x<5; ++x) |
| 237 | for (z=0; z!=W; z+=8) { |
| 238 | uint8_t byte; |
| 239 | if (i==len) return; |
| 240 | byte = a(x,y) >> z; |
| 241 | /* Digits within the byte rendered big-endian, |
| 242 | * matching common practice but unfortunately |
| 243 | * not bitstream order. */ |
| 244 | buf[i++] = hex[byte>>4]; |
| 245 | buf[i++] = hex[byte&15]; |
| 246 | } |
| 247 | } |
| 248 | |
| 249 | static void extract(unsigned len) { |
| 250 | static char buf[2*B/8]; |
| 251 | assert(2*len <= sizeof buf); |
| 252 | store_hex(buf,len); |
| 253 | write_all(1,buf,2*len); |
| 254 | } |
| 255 | |
| 256 | /* Stream octets from stdin until EOF, then write hex to stdout. */ |
| 257 | void sponge(unsigned capacity, size_t out_bits) { |
| 258 | static uint8_t buf[B/8]; |
| 259 | unsigned rate = B - capacity, |
| 260 | len = rate/8, |
| 261 | n; |
| 262 | size_t out_len = out_bits/8; |
| 263 | assert(W == 8*sizeof(lane_t)); |
| 264 | assert(0 < rate && rate < B); |
| 265 | assert(!(rate%8)); |
| 266 | assert(len <= sizeof buf); |
| 267 | |
| 268 | /* absorb */ |
| 269 | reset(); |
| 270 | while ((n = read_all(0,buf,len)) == len) { |
| 271 | load(buf,len); |
| 272 | keccakf(); |
| 273 | } |
| 274 | |
| 275 | /* pad (1000...1) */ |
| 276 | if (n < len-1) { |
| 277 | buf[n] = 0x01; /* first bit (little-endian) */ |
| 278 | for (++n; n < len-1; ++n) |
| 279 | buf[n] = 0; |
| 280 | buf[n] = 0x80; /* last bit (little-endian) */ |
| 281 | } |
| 282 | else { |
| 283 | assert(n == len-1); |
| 284 | buf[n] = 0x81; /* first and last bits */ |
| 285 | } |
| 286 | load(buf,len); |
| 287 | keccakf(); |
| 288 | |
| 289 | /* squeeze */ |
| 290 | for (; out_len > len; out_len -= len) { |
| 291 | extract(len); |
| 292 | keccakf(); |
| 293 | } |
| 294 | extract(out_len); |
| 295 | } |
| 296 | |
| 297 | #ifdef TEST |
| 298 | int main() { |
| 299 | for (test=0; test<NTESTS; ++test) { |
| 300 | unsigned x, y, i=0; |
| 301 | /* initial state */ |
| 302 | for (y=0; y<5; ++y) |
| 303 | for (x=0; x<5; ++x, ++i) { |
| 304 | assert(i<25); |
| 305 | a(x,y) = tests[test][0][i]; |
| 306 | } |
| 307 | test_step = 1; |
| 308 | keccakf(); |
| 309 | } |
| 310 | return test_fail; |
| 311 | } |
| 312 | #endif |