Projects : gscm : gscm_usrbin

gscm/src/asm-generic.c

Dir - Raw

1#include <stdint.h>
2
3#if __SIZEOF_POINTER__ == 4
4typedef uint32_t full;
5typedef int32_t sfull;
6typedef uint16_t half;
7
8#elif __SIZEOF_POINTER__ == 8
9typedef uint64_t full;
10typedef int64_t sfull;
11typedef uint32_t half;
12
13#else
14#error Unsupported word size
15#endif
16
17#define HALF_BITS (8*sizeof(half))
18#define FULL_BITS (8*sizeof(full))
19
20/* Low half of word, unsigned */
21#define LO(n) ((full)((half)(n)))
22/* High half of word, unsigned */
23#define HI(n) ((n)>>HALF_BITS)
24/* High half of word, sign extended (casting back to unsigned because overflow
25 * on signed types is Undefined Behavior) */
26#define S_HI(n) ((full)(((sfull)(n))>>HALF_BITS))
27
28/** sc_wide_mul
29 *
30 * Computes the full (double-word) product of two unsigned words, returning the
31 * low word through the first argument and the high word through the second.
32 *
33 * The algorithm works by decomposition into half-word products. To verify,
34 * consider the variables arranged much as in schoolbook multiplication, each
35 * "digit" here being a half-word:
36 *
37 * ah al
38 * x bh bl
39 * ---------------
40 * c
41 * p1h p1l
42 * p2h p2l
43 * p3h p3l
44 * p4h p4l
45 * \_____/ \_____/
46 * high / low outputs
47 */
48
49void sc_wide_mul(full *a, full *b) {
50 full ah=HI(*a), al=LO(*a), bh=HI(*b), bl=LO(*b),
51 p1=al*bl, p2=ah*bl, p3=al*bh, p4=ah*bh, /* no overflow */
52 t=LO(p2) + LO(p3), /* no overflow */
53 c=HI(HI(p1) + t); /* no overflow */
54 *a = p1 + (t<<HALF_BITS); /* overflow captured by c */
55 *b = c + HI(p2) + HI(p3) + p4;
56 /* That the overall multiplication can't overflow should be proof enough
57 * that the last sum doesn't, but just in case:
58 * Let r = 2^HALF_BITS; the sum must be less than r^2.
59 * The upper bound of each partial product is
60 * (r - 1)^2 = r^2 - 2r + 1
61 * The upper bound of p2h or p3h is
62 * floor((r^2 - 2r + 1) / r) = r - 2
63 * And the upper bound of c is 2, so the sum is bounded by
64 * 2 + 2(r - 2) + (r^2 - 2r + 1)
65 * = r^2 + 2 - 4 + 1
66 * = r^2 - 1
67 */
68}
69
70/** sc_wide_mul_signed
71 *
72 * Computes the full (double-word) product of two signed words in two's
73 * complement, returning the low word through the first argument and the high
74 * word through the second.
75 *
76 * A trivial modification of sc_wide_mult, but the proof may not be obvious.
77 *
78 * Lemma 1: AB mod r = (jr + A)(kr + B) mod r for any integers j and k
79 * since r divides (jkr^2 + jrB + krA + AB) - AB.
80 *
81 * By Lemma 1, the single word product (AB mod r) generalizes to negative
82 * numbers in two's complement representation, since the representation differs
83 * from the number by a multiple of r.
84 *
85 * The full product (AB) can thus be obtained by sign-extending A and B to
86 * double-words and taking the unsigned product mod r^2, that is, discarding
87 * overflow. The trick is skipping computations that don't affect the result.
88 *
89 * ax ax ah al
90 * x bx bx bh bl
91 * ------------------
92 * c
93 * p1h p1l <- al*bl
94 * p2h p2l <- ah*bl
95 * p5h p5l <- ax*bl
96 * p5l <- ax*bl
97 * p3h p3l <- al*bh
98 * p4h p4l <- ah*bh
99 * p6l <- ax*bh
100 * p7h p7l <- al*bx
101 * p8l <- ah*bx
102 * p7l <- al*bx
103 *
104 * XXX unfinished
105 *
106 * Sign extension of a from r to r^2:
107 * a if 0 <= a < r/2
108 * r^2 - r + a if r/2 <= a < r
109 *
110 * Looking for sx(a) * sx(b) mod r^2
111 *
112 * ab if 0 <= a < r/2 and 0 <= b < r/2
113 * a(r^2 - r + b) if 0 <= a < r/2 and r/2 <= b < r
114 * (r^2 - r + a)b if r/2 <= a < r and 0 <= b < r/2
115 * (r^2 - r + a)(r^2 - r + a) if r/2 <= a < r and r/2 <= b < r
116 *
117 * a or r(r - 1) + a === a - r (mod r^2)
118 *
119 * Let A = (r*ah + al)
120 * A === (r^2 + r*ah + al) mod r^2
121 * = r(r + ah) + al
122 * B = r(r + bh) + bl
123 *
124 * AB = (r(r + ah) + al)(r(r + bh) + bl)
125 * AB = (r*ah + al)(r*bh + bl)
126 * = r^2*ah*bh + r(ah*bl + bh*al) + al*bl
127 * === r(ah*bl + bh*al) + al*bl (mod r^2)
128 * === r((ah + r)*bl + (bh + r)*al) + al*bl (mod r^2)
129 *
130 * === r^2*(ah + r)*(bh + r) + r*(ah + r)*bl + r*(bh + r)*al + al*bl
131 */
132
133void sc_wide_mul_signed(full *a, full *b) {
134 /*
135 full ah=S_HI(*a), al=LO(*a), bh=S_HI(*b), bl=LO(*b),
136 p1=al*bl, p2=ah*bl, p3=al*bh, p4=ah*bh,
137 t=LO(p2) + LO(p3),
138 c=HI(HI(p1) + t);
139 *a = p1 + (t<<HALF_BITS);
140 *b = c + S_HI(p2) + S_HI(p3) + p4;
141
142 Since my proof crashed and burned, a longer but more obvious alternative:
143 multiply unsigned magnitudes and negate if input signs differ. Still
144 branch-free. */
145 full a_sign_bit = (*a) >> (FULL_BITS-1),
146 a_sign_ext = ((sfull)(*a)) >> (FULL_BITS-1),
147 b_sign_bit = (*b) >> (FULL_BITS-1),
148 b_sign_ext = ((sfull)(*b)) >> (FULL_BITS-1),
149 p_sign_bit = a_sign_bit ^ b_sign_bit,
150 p_sign_ext = a_sign_ext ^ b_sign_ext,
151 tmp;
152 /* Conditional two's complement based on sign */
153 *a = (*a ^ a_sign_ext) + a_sign_bit;
154 *b = (*b ^ b_sign_ext) + b_sign_bit;
155 sc_wide_mul(a, b);
156 /* Likewise for the product, but extending to two words. C being
157 * brain-dead, we have to re-derive the carry bit; this trick from FFA
158 * (http://www.loper-os.org/?p=1913). */
159 *a = *a ^ p_sign_ext;
160 tmp = *a + p_sign_bit;
161 *b = (*b ^ p_sign_ext) + ((*a & ~tmp) >> (FULL_BITS-1));
162 *a = tmp;
163}
164
165/* Count leading zeros in a nonzero word, returning an integer from zero to
166 * BITS-1. (This implementation happens to treat zero the same as one, but this
167 * can't be assumed, especially for assembly versions.)
168 *
169 * 1. Let A be a 2^N-bit word rounded down to a power of two, isolating the
170 * most significant 1-bit.
171 * 2. Let D be a de Bruijn bit sequence of order N: a cyclic sequence of 2^N
172 * bits containing all possible N-bit subsequences.
173 * 3. Multiply D by A, thereby up-shifting it.
174 * 4. If at least the N-1 high bits of D are zero, the shift is equivalent to a
175 * rotation within the N high bits, so they form a perfect hash function
176 * over the possible values of A.
177 *
178 * Note that the table so constructed will be complete only if the hash
179 * function is indeed perfect; one need not understand why it works to verify
180 * that it does.
181 *
182 * See Leiserson, Prokop and Randall, "Using de Bruijn Sequences to Index a 1
183 * in a Computer Word", MIT Laboratory for Computer Science, 1998.
184 *
185 * As a further hack, some subset of de Bruijn sequences also work when
186 * rounding UP to the next power of two minus one, avoiding the need to clear
187 * the low bits. I'm not aware of a better way to find these than brute force.
188 *
189 * A program to generate the numbers is included at the end of this file. */
190static int count_leading_zeros(full a) {
191#if __SIZEOF_POINTER__ == 8
192 static const uint8_t tbl[64] = {
193 63, 52, 62, 51, 47, 34, 61, 50, 41, 46, 22, 38, 33, 15, 60, 2, 49, 43,
194 40, 45, 29, 27, 21, 37, 25, 32, 10, 19, 14, 7, 59, 1, 53, 48, 35, 42,
195 23, 39, 16, 3, 44, 30, 28, 26, 11, 20, 8, 54, 36, 24, 17, 4, 31, 12, 9,
196 55, 18, 5, 13, 56, 6, 57, 58, 0
197 };
198 a |= a >> 1;
199 a |= a >> 2;
200 a |= a >> 4;
201 a |= a >> 8;
202 a |= a >> 16;
203 a |= a >> 32;
204 return tbl[((full)0x03f08a4c6acb9dbd * a) >> 58];
205#elif __SIZEOF_POINTER__ == 4
206 static const uint8_t tbl[32] = {
207 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, 23, 19, 11,
208 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
209 };
210 a |= a >> 1;
211 a |= a >> 2;
212 a |= a >> 4;
213 a |= a >> 8;
214 a |= a >> 16;
215 return tbl[((full)0x07c4acdd * a) >> 27];
216#else
217#error Unsupported word size
218#endif
219}
220
221/* Bit length of unsigned integer, aka 1-based index of most significant 1-bit,
222 * aka ceil(log2(a+1)). This formulation is well defined for all possible
223 * inputs. */
224int sc_bit_length(full a) {
225 return a ? 8*sizeof a - count_leading_zeros(a) : 0;
226}
227
228void sc_div_extended(full *a_lo, full *a_hi, full b) {
229 /* Input: two-word dividend a, one-word divisor b
230 * Precondition: divisor greater than high word of dividend (ensuring
231 * quotient fits in one word)
232 * Output: quotient in a_hi, remainder in a_lo */
233 full q0, q1, a0 = LO(*a_lo), a1 = HI(*a_lo), b0 = LO(b), b1 = HI(b);
234 if (b1 == 0) {
235 q1 = *a_hi / b0;
236 a1 = ((*a_hi << HALF_BITS) | a1) - q1*b0;
237 *a_lo = (a1 << HALF_BITS) | a0;
238 q0 = *a_lo / b0;
239 *a_lo -= q0*b0;
240 }
241 else {
242 full a2 = LO(*a_hi), a3 = HI(*a_hi);
243 q1 = (a3 == b1) ? (half) -1 : *a_hi / b1;
244 /* TODO
245 while (q1*(b) > (a3 a2 a1)) --q1;
246 subtract q1*(b) from (a3 a2 a1) */
247 q0 = (a2 == b1) ? (half) -1 : ((a2 << HALF_BITS) | a1) / b1;
248 /*
249 while (q0*(b) > (a2 a1 a0) --q0;
250 subtract q0*(b) from (a2 a1 a0) */
251 *a_lo = (a1 << HALF_BITS) | a0;
252 }
253 *a_hi = q1 << HALF_BITS | q0;
254}
255
256#ifdef SEARCH_PHF
257/* Search for perfect hash functions based on de Bruijn sequences */
258
259#include <stdio.h>
260#include <stdint.h>
261
262static void tab32(uint32_t d) {
263 int8_t tbl[32], i;
264 for (i=0; i<32; ++i) tbl[i] = -1;
265 for (i=0; i<32; ++i) {
266 int hash = (d * (((uint32_t)-1) >> i)) >> (32-5);
267 tbl[hash] = i;
268 }
269 printf("{");
270 for (i=0; i<32; ++i) printf("%d, ", tbl[i]);
271 printf("}\n");
272}
273
274static int test32(uint32_t d) {
275 uint32_t slots = 0;
276 int i;
277 /* Check for perfect hash function on powers of two minus one */
278 for (i=0; i<32; ++i) {
279 int hash = (d * (((uint32_t)-1) >> i)) >> (32-5);
280 uint32_t bit = ((uint32_t)1) << hash;
281 if (slots & bit) return 0; /* collision */
282 slots |= bit;
283 }
284 /* For good measure, make sure it also works on powers of two (is a de
285 * Bruijn sequence) */
286 slots = 0;
287 for (i=0; i<32; ++i) {
288 int hash = (d * (((uint32_t)1) << i)) >> (32-5);
289 uint32_t bit = ((uint32_t)1) << hash;
290 if (slots & bit) return 0;
291 slots |= bit;
292 }
293 printf("%x ", d);
294 tab32(d);
295 return 1;
296}
297
298static void tab64(uint64_t d) {
299 int8_t tbl[64], i;
300 for (i=0; i<64; ++i) tbl[i] = -1;
301 for (i=0; i<64; ++i) {
302 int hash = (d * (-1UL >> i)) >> (64-6);
303 tbl[hash] = i;
304 }
305 printf("{");
306 for (i=0; i<64; ++i) printf("%d, ", tbl[i]);
307 printf("}\n");
308}
309
310static int test64(uint64_t d) {
311 uint64_t slots = 0;
312 int i;
313 for (i=0; i<64; ++i) {
314 int hash = (d * (-1UL >> i)) >> (64-6);
315 uint64_t bit = 1UL << hash;
316 if (slots & bit) return 0;
317 slots |= bit;
318 }
319 slots = 0;
320 for (i=0; i<64; ++i) {
321 int hash = (d * (1UL << i)) >> (64-6);
322 uint64_t bit = 1UL << hash;
323 if (slots & bit) return 0;
324 slots |= bit;
325 }
326 printf("%lx ", d);
327 tab64(d);
328 return 1;
329}
330
331int main() {
332 uint64_t d;
333 for (d = 0; !test32(d); ++d) ;
334 /* XXX 19 magic internet bits remain in this prefix to make the brute force
335 * approach tractable. Smarter algorithm needed. */
336 for (d = 0x03f08a4000000000; !test64(d); ++d) ;
337 return 0;
338}
339
340#endif