1 | /* KECCAK permutation and sponge construction |
2 | * J. Welsh, May 2019 |
3 | * |
4 | * Based on Guido Bertoni, Joan Daemen, Michael Peeters and Gilles van Assche |
5 | * (2011): The KECCAK reference, Version 3.0. |
6 | */ |
7 | |
8 | #include <stdint.h> |
9 | #include "io.h" |
10 | |
11 | /* State is arranged in a 5 x 5 x W bit array, indexed as (x,y,z). |
12 | * The 1D section for a given (x,y) is called a lane and packed in a lane_t. |
13 | * The 1D section for a given (x,z) is called a column. |
14 | * The 2D section for a given z is called a slice. |
15 | */ |
16 | |
17 | typedef uint64_t lane_t; |
18 | #define L 6 |
19 | #define W (1 << L) /* lane size = 64 bits */ |
20 | #define B 25*W /* permutation width = 1600 bits */ |
21 | /* we require L be at least 3 (W=8, B=200) to avoid sub-byte lane size */ |
22 | |
23 | #define ROUNDS (12 + 2*L) |
24 | |
25 | #define mask(x) (((unsigned) (x)) & (W-1)) |
26 | #define rot(v,r) (v = v<<mask(r) | v>>mask(W-mask(r))) |
27 | |
28 | static lane_t state[5][5]; |
29 | /* physical storage order can be swapped as a performance tweak */ |
30 | #define a(x,y) (state[x][y]) |
31 | |
32 | /* Galois linear feedback shift register, output in low bit: |
33 | * (x^t mod x^8 + x^6 + x^5 + x^4 + 1) mod x in GF(2)(x) |
34 | * (I'd forgotten my grade-school algebra but the Handbook of Applied |
35 | * Cryptography plus http://archive.is/hhvKKJ (nayuki.io) and some written |
36 | * exercises cleared this up for me.) */ |
37 | static uint8_t lfsr_step(uint8_t rstate) { |
38 | /* polynomial xor'd in when high (overflowing) bit is set */ |
39 | uint8_t gate = ((int8_t) rstate) >> 7; |
40 | return (rstate << 1) ^ (gate & 0x71); /* bits 6,5,4,0 */ |
41 | } |
42 | |
43 | /* Column parity diffusion */ |
44 | static void theta(void) { |
45 | /* LUT for x+1 mod 5 (not used on secret bits!) */ |
46 | uint8_t const inc_mod5[8] = {1,2,3,4,0,1,2,3}; |
47 | unsigned x, y; |
48 | lane_t c[5], d; |
49 | |
50 | for (x=0; x<5; ++x) |
51 | c[x] = a(x,0) ^ a(x,1) ^ a(x,2) ^ a(x,3) ^ a(x,4); |
52 | |
53 | for (x=0; x<5; ++x) { |
54 | d = c[inc_mod5[x]]; |
55 | rot(d,1); |
56 | d ^= c[inc_mod5[x+3]]; /* x+3+1 == x-1 mod 5 */ |
57 | for (y=0; y<5; ++y) |
58 | a(x,y) ^= d; |
59 | } |
60 | } |
61 | |
62 | /* Inter-slice dispersion */ |
63 | static void rho(void) { |
64 | /* precomputed offsets from Table 2.1 */ |
65 | rot(a(0,1), 36); |
66 | rot(a(0,2), 3); |
67 | rot(a(0,3), 105); |
68 | rot(a(0,4), 210); |
69 | |
70 | rot(a(1,0), 1); |
71 | rot(a(1,1), 300); |
72 | rot(a(1,2), 10); |
73 | rot(a(1,3), 45); |
74 | rot(a(1,4), 66); |
75 | |
76 | rot(a(2,0), 190); |
77 | rot(a(2,1), 6); |
78 | rot(a(2,2), 171); |
79 | rot(a(2,3), 15); |
80 | rot(a(2,4), 253); |
81 | |
82 | rot(a(3,0), 28); |
83 | rot(a(3,1), 55); |
84 | rot(a(3,2), 153); |
85 | rot(a(3,3), 21); |
86 | rot(a(3,4), 120); |
87 | |
88 | rot(a(4,0), 91); |
89 | rot(a(4,1), 276); |
90 | rot(a(4,2), 231); |
91 | rot(a(4,3), 136); |
92 | rot(a(4,4), 78); |
93 | } |
94 | |
95 | /* Slicewise transposition by matrix |
96 | * [ 0 1 ] |
97 | * [ 2 3 ] |
98 | * For all indices x, y, z: |
99 | * a'(y, 2x+3y mod 5, z) = a(x, y, z) |
100 | */ |
101 | static void pi(void) { |
102 | /* shuffle in-place by swapping alternating temporaries with successive |
103 | * destinations */ |
104 | lane_t c, d; |
105 | #define even(x,y) (c=a(x,y), a(x,y)=d) |
106 | #define odd(x,y) (d=a(x,y), a(x,y)=c) |
107 | /* a[0][0] doesn't move */ |
108 | d=a(0,1); /* 2*0 + 3*1 = 3 -> 3 */ |
109 | even(1,3); /* 2+9=11 -> 1 */ |
110 | odd (3,1); /* 6+3=9 -> 4 */ |
111 | even(1,4); /* 2+12=14 */ |
112 | odd (4,4); /* 8+12=20 */ |
113 | even(4,0); /* 8+0=8 */ |
114 | odd (0,3); /* 0+9=9 */ |
115 | even(3,4); /* 6+12=18 */ |
116 | odd (4,3); /* 8+9=17 */ |
117 | even(3,2); /* 6+6=12 */ |
118 | odd (2,2); /* 4+6=10 */ |
119 | even(2,0); /* 4+0=4 */ |
120 | odd (0,4); /* 0+12=12 */ |
121 | even(4,2); /* 8+6=14 */ |
122 | odd (2,4); /* 4+12=16 */ |
123 | even(4,1); /* 8+3=11 */ |
124 | odd (1,1); /* 2+3=5 */ |
125 | even(1,0); /* 2+0=2 */ |
126 | odd (0,2); /* 0+6=6 */ |
127 | even(2,1); /* 4+3=7 */ |
128 | odd (1,2); /* 2+6=8 */ |
129 | even(2,3); /* 4+9=13 */ |
130 | odd (3,3); /* 6+9=15 */ |
131 | even(3,0); /* 6+0=6 */ |
132 | a(0,1)=c; /* And we're back home in 24 steps, lovely! */ |
133 | } |
134 | |
135 | /* Nonlinear row mapping */ |
136 | static void chi(void) { |
137 | unsigned y; |
138 | for (y=0; y<5; ++y) { |
139 | lane_t c0=a(0,y), c1=a(1,y), c2=a(2,y), c3=a(3,y), c4=a(4,y); |
140 | |
141 | a(0,y) = c0 ^ (~c1 & c2); |
142 | a(1,y) = c1 ^ (~c2 & c3); |
143 | a(2,y) = c2 ^ (~c3 & c4); |
144 | a(3,y) = c3 ^ (~c4 & c0); |
145 | a(4,y) = c4 ^ (~c0 & c1); |
146 | } |
147 | } |
148 | |
149 | /* Round constants for disrupting z symmetry |
150 | * |
151 | * Mix LFSR bitstream sequentially into first lane with z index given by |
152 | * 2^j - 1 |
153 | * LFSR advances 7 steps per round regardless of how many "j" are needed to |
154 | * fill the lane, corresponding to the maximum 64-bit lane size. |
155 | */ |
156 | static uint8_t iota(uint8_t rstate) { |
157 | unsigned j; |
158 | for (j=0; j<=L; ++j) { |
159 | a(0,0) ^= ((lane_t) (rstate & 1)) << ((1 << j) - 1); |
160 | rstate = lfsr_step(rstate); |
161 | } |
162 | for (; j<7; ++j) |
163 | rstate = lfsr_step(rstate); |
164 | return rstate; |
165 | } |
166 | |
167 | #ifdef TEST |
168 | |
169 | #include <stdio.h> |
170 | #include "testvectors.h" |
171 | |
172 | static int test_fail; |
173 | static unsigned test, test_step; |
174 | |
175 | static void checkpoint(unsigned round, char const *step) { |
176 | int fail = 0; |
177 | unsigned x, y, i=0; |
178 | printf("test=%u round=%u %s ", test, round, step); |
179 | for (y=0; y<5; ++y) |
180 | for (x=0; x<5; ++x, ++i) { |
181 | assert(i<25); |
182 | assert(test_step<TEST_STEPS); |
183 | if (a(x,y) != tests[test][test_step][i]) { |
184 | test_fail = fail = 1; |
185 | printf("FAIL x=%u y=%u (0x%lx != 0x%lx)\n", |
186 | x, y, a(x,y), |
187 | tests[test][test_step][i]); |
188 | } |
189 | } |
190 | if (!fail) puts("PASS"); |
191 | ++test_step; |
192 | } |
193 | #else |
194 | #define checkpoint(r,s) |
195 | #endif |
196 | |
197 | static void keccakf(void) { |
198 | unsigned round; |
199 | uint8_t rstate=1; |
200 | for (round=0; round<ROUNDS; ++round) { |
201 | theta(); checkpoint(round,"theta"); |
202 | rho(); checkpoint(round,"rho"); |
203 | pi(); checkpoint(round,"pi"); |
204 | chi(); checkpoint(round,"chi"); |
205 | rstate = iota(rstate); checkpoint(round,"iota"); |
206 | } |
207 | } |
208 | |
209 | static void reset(void) { |
210 | unsigned x, y; |
211 | for (x=0; x<5; ++x) |
212 | for (y=0; y<5; ++y) |
213 | a(x,y) = 0; |
214 | } |
215 | |
216 | static void load(uint8_t const *block, unsigned len) { |
217 | /* Keeping bytes in lane little-endian, independent of machine. This |
218 | * harmonizes with (rumored) Keccak convention for interpretation of |
219 | * octet stream as bitstream. For a big-endian interpretation we could |
220 | * flip the byte load/store order and the direction of rotations. |
221 | * |
222 | * Ordering of lanes is x-major due to the specified mapping of |
223 | * bitstring to state coordinates: s[w(5y+x)+z] = a[x][y][z] */ |
224 | unsigned x, y, z, i=0; |
225 | for (y=0; y<5; ++y) |
226 | for (x=0; x<5; ++x) |
227 | for (z=0; z!=W; z+=8, ++i) |
228 | if (i==len) return; |
229 | else a(x,y) ^= ((lane_t) block[i]) << z; |
230 | } |
231 | |
232 | static void store_hex(char *buf, unsigned block_len) { |
233 | static char const hex[] = "0123456789abcdef"; |
234 | unsigned x, y, z, len = 2*block_len, i = 0; |
235 | for (y=0; y<5; ++y) |
236 | for (x=0; x<5; ++x) |
237 | for (z=0; z!=W; z+=8) { |
238 | uint8_t byte; |
239 | if (i==len) return; |
240 | byte = a(x,y) >> z; |
241 | /* Digits within the byte rendered big-endian, |
242 | * matching common practice but unfortunately |
243 | * not bitstream order. */ |
244 | buf[i++] = hex[byte>>4]; |
245 | buf[i++] = hex[byte&15]; |
246 | } |
247 | } |
248 | |
249 | static void extract(unsigned len) { |
250 | static char buf[2*B/8]; |
251 | assert(2*len <= sizeof buf); |
252 | store_hex(buf,len); |
253 | write_all(1,buf,2*len); |
254 | } |
255 | |
256 | /* Stream octets from stdin until EOF, then write hex to stdout. */ |
257 | void sponge(unsigned capacity, size_t out_bits) { |
258 | static uint8_t buf[B/8]; |
259 | unsigned rate = B - capacity, |
260 | len = rate/8, |
261 | n; |
262 | size_t out_len = out_bits/8; |
263 | assert(W == 8*sizeof(lane_t)); |
264 | assert(0 < rate && rate < B); |
265 | assert(!(rate%8)); |
266 | assert(len <= sizeof buf); |
267 | |
268 | /* absorb */ |
269 | reset(); |
270 | while ((n = read_all(0,buf,len)) == len) { |
271 | load(buf,len); |
272 | keccakf(); |
273 | } |
274 | |
275 | /* pad (1000...1) */ |
276 | if (n < len-1) { |
277 | buf[n] = 0x01; /* first bit (little-endian) */ |
278 | for (++n; n < len-1; ++n) |
279 | buf[n] = 0; |
280 | buf[n] = 0x80; /* last bit (little-endian) */ |
281 | } |
282 | else { |
283 | assert(n == len-1); |
284 | buf[n] = 0x81; /* first and last bits */ |
285 | } |
286 | load(buf,len); |
287 | keccakf(); |
288 | |
289 | /* squeeze */ |
290 | for (; out_len > len; out_len -= len) { |
291 | extract(len); |
292 | keccakf(); |
293 | } |
294 | extract(out_len); |
295 | } |
296 | |
297 | #ifdef TEST |
298 | int main() { |
299 | for (test=0; test<NTESTS; ++test) { |
300 | unsigned x, y, i=0; |
301 | /* initial state */ |
302 | for (y=0; y<5; ++y) |
303 | for (x=0; x<5; ++x, ++i) { |
304 | assert(i<25); |
305 | a(x,y) = tests[test][0][i]; |
306 | } |
307 | test_step = 1; |
308 | keccakf(); |
309 | } |
310 | return test_fail; |
311 | } |
312 | #endif |