Projects : gscm : gscm_usrbin
1 | #include <stdint.h> |
2 | |
3 | #if __SIZEOF_POINTER__ == 4 |
4 | typedef uint32_t full; |
5 | typedef int32_t sfull; |
6 | typedef uint16_t half; |
7 | |
8 | #elif __SIZEOF_POINTER__ == 8 |
9 | typedef uint64_t full; |
10 | typedef int64_t sfull; |
11 | typedef uint32_t half; |
12 | |
13 | #else |
14 | #error Unsupported word size |
15 | #endif |
16 | |
17 | #define HALF_BITS (8*sizeof(half)) |
18 | #define FULL_BITS (8*sizeof(full)) |
19 | |
20 | /* Low half of word, unsigned */ |
21 | #define LO(n) ((full)((half)(n))) |
22 | /* High half of word, unsigned */ |
23 | #define HI(n) ((n)>>HALF_BITS) |
24 | /* High half of word, sign extended (casting back to unsigned because overflow |
25 | * on signed types is Undefined Behavior) */ |
26 | #define S_HI(n) ((full)(((sfull)(n))>>HALF_BITS)) |
27 | |
28 | /** sc_wide_mul |
29 | * |
30 | * Computes the full (double-word) product of two unsigned words, returning the |
31 | * low word through the first argument and the high word through the second. |
32 | * |
33 | * The algorithm works by decomposition into half-word products. To verify, |
34 | * consider the variables arranged much as in schoolbook multiplication, each |
35 | * "digit" here being a half-word: |
36 | * |
37 | * ah al |
38 | * x bh bl |
39 | * --------------- |
40 | * c |
41 | * p1h p1l |
42 | * p2h p2l |
43 | * p3h p3l |
44 | * p4h p4l |
45 | * \_____/ \_____/ |
46 | * high / low outputs |
47 | */ |
48 | |
49 | void sc_wide_mul(full *a, full *b) { |
50 | full ah=HI(*a), al=LO(*a), bh=HI(*b), bl=LO(*b), |
51 | p1=al*bl, p2=ah*bl, p3=al*bh, p4=ah*bh, /* no overflow */ |
52 | t=LO(p2) + LO(p3), /* no overflow */ |
53 | c=HI(HI(p1) + t); /* no overflow */ |
54 | *a = p1 + (t<<HALF_BITS); /* overflow captured by c */ |
55 | *b = c + HI(p2) + HI(p3) + p4; |
56 | /* That the overall multiplication can't overflow should be proof enough |
57 | * that the last sum doesn't, but just in case: |
58 | * Let r = 2^HALF_BITS; the sum must be less than r^2. |
59 | * The upper bound of each partial product is |
60 | * (r - 1)^2 = r^2 - 2r + 1 |
61 | * The upper bound of p2h or p3h is |
62 | * floor((r^2 - 2r + 1) / r) = r - 2 |
63 | * And the upper bound of c is 2, so the sum is bounded by |
64 | * 2 + 2(r - 2) + (r^2 - 2r + 1) |
65 | * = r^2 + 2 - 4 + 1 |
66 | * = r^2 - 1 |
67 | */ |
68 | } |
69 | |
70 | /** sc_wide_mul_signed |
71 | * |
72 | * Computes the full (double-word) product of two signed words in two's |
73 | * complement, returning the low word through the first argument and the high |
74 | * word through the second. |
75 | * |
76 | * A trivial modification of sc_wide_mult, but the proof may not be obvious. |
77 | * |
78 | * Lemma 1: AB mod r = (jr + A)(kr + B) mod r for any integers j and k |
79 | * since r divides (jkr^2 + jrB + krA + AB) - AB. |
80 | * |
81 | * By Lemma 1, the single word product (AB mod r) generalizes to negative |
82 | * numbers in two's complement representation, since the representation differs |
83 | * from the number by a multiple of r. |
84 | * |
85 | * The full product (AB) can thus be obtained by sign-extending A and B to |
86 | * double-words and taking the unsigned product mod r^2, that is, discarding |
87 | * overflow. The trick is skipping computations that don't affect the result. |
88 | * |
89 | * ax ax ah al |
90 | * x bx bx bh bl |
91 | * ------------------ |
92 | * c |
93 | * p1h p1l <- al*bl |
94 | * p2h p2l <- ah*bl |
95 | * p5h p5l <- ax*bl |
96 | * p5l <- ax*bl |
97 | * p3h p3l <- al*bh |
98 | * p4h p4l <- ah*bh |
99 | * p6l <- ax*bh |
100 | * p7h p7l <- al*bx |
101 | * p8l <- ah*bx |
102 | * p7l <- al*bx |
103 | * |
104 | * XXX unfinished |
105 | * |
106 | * Sign extension of a from r to r^2: |
107 | * a if 0 <= a < r/2 |
108 | * r^2 - r + a if r/2 <= a < r |
109 | * |
110 | * Looking for sx(a) * sx(b) mod r^2 |
111 | * |
112 | * ab if 0 <= a < r/2 and 0 <= b < r/2 |
113 | * a(r^2 - r + b) if 0 <= a < r/2 and r/2 <= b < r |
114 | * (r^2 - r + a)b if r/2 <= a < r and 0 <= b < r/2 |
115 | * (r^2 - r + a)(r^2 - r + a) if r/2 <= a < r and r/2 <= b < r |
116 | * |
117 | * a or r(r - 1) + a === a - r (mod r^2) |
118 | * |
119 | * Let A = (r*ah + al) |
120 | * A === (r^2 + r*ah + al) mod r^2 |
121 | * = r(r + ah) + al |
122 | * B = r(r + bh) + bl |
123 | * |
124 | * AB = (r(r + ah) + al)(r(r + bh) + bl) |
125 | * AB = (r*ah + al)(r*bh + bl) |
126 | * = r^2*ah*bh + r(ah*bl + bh*al) + al*bl |
127 | * === r(ah*bl + bh*al) + al*bl (mod r^2) |
128 | * === r((ah + r)*bl + (bh + r)*al) + al*bl (mod r^2) |
129 | * |
130 | * === r^2*(ah + r)*(bh + r) + r*(ah + r)*bl + r*(bh + r)*al + al*bl |
131 | */ |
132 | |
133 | void sc_wide_mul_signed(full *a, full *b) { |
134 | /* |
135 | full ah=S_HI(*a), al=LO(*a), bh=S_HI(*b), bl=LO(*b), |
136 | p1=al*bl, p2=ah*bl, p3=al*bh, p4=ah*bh, |
137 | t=LO(p2) + LO(p3), |
138 | c=HI(HI(p1) + t); |
139 | *a = p1 + (t<<HALF_BITS); |
140 | *b = c + S_HI(p2) + S_HI(p3) + p4; |
141 | |
142 | Since my proof crashed and burned, a longer but more obvious alternative: |
143 | multiply unsigned magnitudes and negate if input signs differ. Still |
144 | branch-free. */ |
145 | full a_sign_bit = (*a) >> (FULL_BITS-1), |
146 | a_sign_ext = ((sfull)(*a)) >> (FULL_BITS-1), |
147 | b_sign_bit = (*b) >> (FULL_BITS-1), |
148 | b_sign_ext = ((sfull)(*b)) >> (FULL_BITS-1), |
149 | p_sign_bit = a_sign_bit ^ b_sign_bit, |
150 | p_sign_ext = a_sign_ext ^ b_sign_ext, |
151 | tmp; |
152 | /* Conditional two's complement based on sign */ |
153 | *a = (*a ^ a_sign_ext) + a_sign_bit; |
154 | *b = (*b ^ b_sign_ext) + b_sign_bit; |
155 | sc_wide_mul(a, b); |
156 | /* Likewise for the product, but extending to two words. C being |
157 | * brain-dead, we have to re-derive the carry bit; this trick from FFA |
158 | * (http://www.loper-os.org/?p=1913). */ |
159 | *a = *a ^ p_sign_ext; |
160 | tmp = *a + p_sign_bit; |
161 | *b = (*b ^ p_sign_ext) + ((*a & ~tmp) >> (FULL_BITS-1)); |
162 | *a = tmp; |
163 | } |
164 | |
165 | /* Count leading zeros in a nonzero word, returning an integer from zero to |
166 | * BITS-1. (This implementation happens to treat zero the same as one, but this |
167 | * can't be assumed, especially for assembly versions.) |
168 | * |
169 | * 1. Let A be a 2^N-bit word rounded down to a power of two, isolating the |
170 | * most significant 1-bit. |
171 | * 2. Let D be a de Bruijn bit sequence of order N: a cyclic sequence of 2^N |
172 | * bits containing all possible N-bit subsequences. |
173 | * 3. Multiply D by A, thereby up-shifting it. |
174 | * 4. If at least the N-1 high bits of D are zero, the shift is equivalent to a |
175 | * rotation within the N high bits, so they form a perfect hash function |
176 | * over the possible values of A. |
177 | * |
178 | * Note that the table so constructed will be complete only if the hash |
179 | * function is indeed perfect; one need not understand why it works to verify |
180 | * that it does. |
181 | * |
182 | * See Leiserson, Prokop and Randall, "Using de Bruijn Sequences to Index a 1 |
183 | * in a Computer Word", MIT Laboratory for Computer Science, 1998. |
184 | * |
185 | * As a further hack, some subset of de Bruijn sequences also work when |
186 | * rounding UP to the next power of two minus one, avoiding the need to clear |
187 | * the low bits. I'm not aware of a better way to find these than brute force. |
188 | * |
189 | * A program to generate the numbers is included at the end of this file. */ |
190 | static int count_leading_zeros(full a) { |
191 | #if __SIZEOF_POINTER__ == 8 |
192 | static const uint8_t tbl[64] = { |
193 | 63, 52, 62, 51, 47, 34, 61, 50, 41, 46, 22, 38, 33, 15, 60, 2, 49, 43, |
194 | 40, 45, 29, 27, 21, 37, 25, 32, 10, 19, 14, 7, 59, 1, 53, 48, 35, 42, |
195 | 23, 39, 16, 3, 44, 30, 28, 26, 11, 20, 8, 54, 36, 24, 17, 4, 31, 12, 9, |
196 | 55, 18, 5, 13, 56, 6, 57, 58, 0 |
197 | }; |
198 | a |= a >> 1; |
199 | a |= a >> 2; |
200 | a |= a >> 4; |
201 | a |= a >> 8; |
202 | a |= a >> 16; |
203 | a |= a >> 32; |
204 | return tbl[((full)0x03f08a4c6acb9dbd * a) >> 58]; |
205 | #elif __SIZEOF_POINTER__ == 4 |
206 | static const uint8_t tbl[32] = { |
207 | 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, 23, 19, 11, |
208 | 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 |
209 | }; |
210 | a |= a >> 1; |
211 | a |= a >> 2; |
212 | a |= a >> 4; |
213 | a |= a >> 8; |
214 | a |= a >> 16; |
215 | return tbl[((full)0x07c4acdd * a) >> 27]; |
216 | #else |
217 | #error Unsupported word size |
218 | #endif |
219 | } |
220 | |
221 | /* Bit length of unsigned integer, aka 1-based index of most significant 1-bit, |
222 | * aka ceil(log2(a+1)). This formulation is well defined for all possible |
223 | * inputs. */ |
224 | int sc_bit_length(full a) { |
225 | return a ? 8*sizeof a - count_leading_zeros(a) : 0; |
226 | } |
227 | |
228 | void sc_div_extended(full *a_lo, full *a_hi, full b) { |
229 | /* Input: two-word dividend a, one-word divisor b |
230 | * Precondition: divisor greater than high word of dividend (ensuring |
231 | * quotient fits in one word) |
232 | * Output: quotient in a_hi, remainder in a_lo */ |
233 | full q0, q1, a0 = LO(*a_lo), a1 = HI(*a_lo), b0 = LO(b), b1 = HI(b); |
234 | if (b1 == 0) { |
235 | q1 = *a_hi / b0; |
236 | a1 = ((*a_hi << HALF_BITS) | a1) - q1*b0; |
237 | *a_lo = (a1 << HALF_BITS) | a0; |
238 | q0 = *a_lo / b0; |
239 | *a_lo -= q0*b0; |
240 | } |
241 | else { |
242 | full a2 = LO(*a_hi), a3 = HI(*a_hi); |
243 | q1 = (a3 == b1) ? (half) -1 : *a_hi / b1; |
244 | /* TODO |
245 | while (q1*(b) > (a3 a2 a1)) --q1; |
246 | subtract q1*(b) from (a3 a2 a1) */ |
247 | q0 = (a2 == b1) ? (half) -1 : ((a2 << HALF_BITS) | a1) / b1; |
248 | /* |
249 | while (q0*(b) > (a2 a1 a0) --q0; |
250 | subtract q0*(b) from (a2 a1 a0) */ |
251 | *a_lo = (a1 << HALF_BITS) | a0; |
252 | } |
253 | *a_hi = q1 << HALF_BITS | q0; |
254 | } |
255 | |
256 | #ifdef SEARCH_PHF |
257 | /* Search for perfect hash functions based on de Bruijn sequences */ |
258 | |
259 | #include <stdio.h> |
260 | #include <stdint.h> |
261 | |
262 | static void tab32(uint32_t d) { |
263 | int8_t tbl[32], i; |
264 | for (i=0; i<32; ++i) tbl[i] = -1; |
265 | for (i=0; i<32; ++i) { |
266 | int hash = (d * (((uint32_t)-1) >> i)) >> (32-5); |
267 | tbl[hash] = i; |
268 | } |
269 | printf("{"); |
270 | for (i=0; i<32; ++i) printf("%d, ", tbl[i]); |
271 | printf("}\n"); |
272 | } |
273 | |
274 | static int test32(uint32_t d) { |
275 | uint32_t slots = 0; |
276 | int i; |
277 | /* Check for perfect hash function on powers of two minus one */ |
278 | for (i=0; i<32; ++i) { |
279 | int hash = (d * (((uint32_t)-1) >> i)) >> (32-5); |
280 | uint32_t bit = ((uint32_t)1) << hash; |
281 | if (slots & bit) return 0; /* collision */ |
282 | slots |= bit; |
283 | } |
284 | /* For good measure, make sure it also works on powers of two (is a de |
285 | * Bruijn sequence) */ |
286 | slots = 0; |
287 | for (i=0; i<32; ++i) { |
288 | int hash = (d * (((uint32_t)1) << i)) >> (32-5); |
289 | uint32_t bit = ((uint32_t)1) << hash; |
290 | if (slots & bit) return 0; |
291 | slots |= bit; |
292 | } |
293 | printf("%x ", d); |
294 | tab32(d); |
295 | return 1; |
296 | } |
297 | |
298 | static void tab64(uint64_t d) { |
299 | int8_t tbl[64], i; |
300 | for (i=0; i<64; ++i) tbl[i] = -1; |
301 | for (i=0; i<64; ++i) { |
302 | int hash = (d * (-1UL >> i)) >> (64-6); |
303 | tbl[hash] = i; |
304 | } |
305 | printf("{"); |
306 | for (i=0; i<64; ++i) printf("%d, ", tbl[i]); |
307 | printf("}\n"); |
308 | } |
309 | |
310 | static int test64(uint64_t d) { |
311 | uint64_t slots = 0; |
312 | int i; |
313 | for (i=0; i<64; ++i) { |
314 | int hash = (d * (-1UL >> i)) >> (64-6); |
315 | uint64_t bit = 1UL << hash; |
316 | if (slots & bit) return 0; |
317 | slots |= bit; |
318 | } |
319 | slots = 0; |
320 | for (i=0; i<64; ++i) { |
321 | int hash = (d * (1UL << i)) >> (64-6); |
322 | uint64_t bit = 1UL << hash; |
323 | if (slots & bit) return 0; |
324 | slots |= bit; |
325 | } |
326 | printf("%lx ", d); |
327 | tab64(d); |
328 | return 1; |
329 | } |
330 | |
331 | int main() { |
332 | uint64_t d; |
333 | for (d = 0; !test32(d); ++d) ; |
334 | /* XXX 19 magic internet bits remain in this prefix to make the brute force |
335 | * approach tractable. Smarter algorithm needed. */ |
336 | for (d = 0x03f08a4000000000; !test64(d); ++d) ; |
337 | return 0; |
338 | } |
339 | |
340 | #endif |