gscm/src/asm-generic.c [gscm

gscm/src/asm-generic.c

1	#include <stdint.h>
2
3	#if __SIZEOF_POINTER__ == 4
4	typedef uint32_t full;
5	typedef int32_t sfull;
6	typedef uint16_t half;
7
8	#elif __SIZEOF_POINTER__ == 8
9	typedef uint64_t full;
10	typedef int64_t sfull;
11	typedef uint32_t half;
12
13	#else
14	#error Unsupported word size
15	#endif
16
17	#define HALF_BITS (8*sizeof(half))
18	#define FULL_BITS (8*sizeof(full))
19
20	/* Low half of word, unsigned */
21	#define LO(n) ((full)((half)(n)))
22	/* High half of word, unsigned */
23	#define HI(n) ((n)>>HALF_BITS)
24	/* High half of word, sign extended (casting back to unsigned because overflow
25	* on signed types is Undefined Behavior) */
26	#define S_HI(n) ((full)(((sfull)(n))>>HALF_BITS))
27
28	/** sc_wide_mul
29	*
30	* Computes the full (double-word) product of two unsigned words, returning the
31	* low word through the first argument and the high word through the second.
32	*
33	* The algorithm works by decomposition into half-word products. To verify,
34	* consider the variables arranged much as in schoolbook multiplication, each
35	* "digit" here being a half-word:
36	*
37	* ah al
38	* x bh bl
39	* ---------------
40	* c
41	* p1h p1l
42	* p2h p2l
43	* p3h p3l
44	* p4h p4l
45	* \_____/ \_____/
46	* high / low outputs
47	*/
48
49	void sc_wide_mul(full a, full b) {
50	full ah=HI(a), al=LO(a), bh=HI(b), bl=LO(b),
51	p1=albl, p2=ahbl, p3=albh, p4=ahbh, /* no overflow */
52	t=LO(p2) + LO(p3), /* no overflow */
53	c=HI(HI(p1) + t); /* no overflow */
54	a = p1 + (t<<HALF_BITS); / overflow captured by c */
55	*b = c + HI(p2) + HI(p3) + p4;
56	/* That the overall multiplication can't overflow should be proof enough
57	* that the last sum doesn't, but just in case:
58	* Let r = 2^HALF_BITS; the sum must be less than r^2.
59	* The upper bound of each partial product is
60	* (r - 1)^2 = r^2 - 2r + 1
61	* The upper bound of p2h or p3h is
62	* floor((r^2 - 2r + 1) / r) = r - 2
63	* And the upper bound of c is 2, so the sum is bounded by
64	* 2 + 2(r - 2) + (r^2 - 2r + 1)
65	* = r^2 + 2 - 4 + 1
66	* = r^2 - 1
67	*/
68	}
69
70	/** sc_wide_mul_signed
71	*
72	* Computes the full (double-word) product of two signed words in two's
73	* complement, returning the low word through the first argument and the high
74	* word through the second.
75	*
76	* A trivial modification of sc_wide_mult, but the proof may not be obvious.
77	*
78	* Lemma 1: AB mod r = (jr + A)(kr + B) mod r for any integers j and k
79	* since r divides (jkr^2 + jrB + krA + AB) - AB.
80	*
81	* By Lemma 1, the single word product (AB mod r) generalizes to negative
82	* numbers in two's complement representation, since the representation differs
83	* from the number by a multiple of r.
84	*
85	* The full product (AB) can thus be obtained by sign-extending A and B to
86	* double-words and taking the unsigned product mod r^2, that is, discarding
87	* overflow. The trick is skipping computations that don't affect the result.
88	*
89	* ax ax ah al
90	* x bx bx bh bl
91	* ------------------
92	* c
93	* p1h p1l <- al*bl
94	* p2h p2l <- ah*bl
95	* p5h p5l <- ax*bl
96	* p5l <- ax*bl
97	* p3h p3l <- al*bh
98	* p4h p4l <- ah*bh
99	* p6l <- ax*bh
100	* p7h p7l <- al*bx
101	* p8l <- ah*bx
102	* p7l <- al*bx
103	*
104	* XXX unfinished
105	*
106	* Sign extension of a from r to r^2:
107	* a if 0 <= a < r/2
108	* r^2 - r + a if r/2 <= a < r
109	*
110	* Looking for sx(a) * sx(b) mod r^2
111	*
112	* ab if 0 <= a < r/2 and 0 <= b < r/2
113	* a(r^2 - r + b) if 0 <= a < r/2 and r/2 <= b < r
114	* (r^2 - r + a)b if r/2 <= a < r and 0 <= b < r/2
115	* (r^2 - r + a)(r^2 - r + a) if r/2 <= a < r and r/2 <= b < r
116	*
117	* a or r(r - 1) + a === a - r (mod r^2)
118	*
119	* Let A = (r*ah + al)
120	* A === (r^2 + r*ah + al) mod r^2
121	* = r(r + ah) + al
122	* B = r(r + bh) + bl
123	*
124	* AB = (r(r + ah) + al)(r(r + bh) + bl)
125	* AB = (rah + al)(rbh + bl)
126	* = r^2ahbh + r(ahbl + bhal) + al*bl
127	* === r(ahbl + bhal) + al*bl (mod r^2)
128	* === r((ah + r)bl + (bh + r)al) + al*bl (mod r^2)
129	*
130	* === r^2(ah + r)(bh + r) + r(ah + r)bl + r(bh + r)al + al*bl
131	*/
132
133	void sc_wide_mul_signed(full a, full b) {
134	/*
135	full ah=S_HI(a), al=LO(a), bh=S_HI(b), bl=LO(b),
136	p1=albl, p2=ahbl, p3=albh, p4=ahbh,
137	t=LO(p2) + LO(p3),
138	c=HI(HI(p1) + t);
139	*a = p1 + (t<<HALF_BITS);
140	*b = c + S_HI(p2) + S_HI(p3) + p4;
141
142	Since my proof crashed and burned, a longer but more obvious alternative:
143	multiply unsigned magnitudes and negate if input signs differ. Still
144	branch-free. */
145	full a_sign_bit = (*a) >> (FULL_BITS-1),
146	a_sign_ext = ((sfull)(*a)) >> (FULL_BITS-1),
147	b_sign_bit = (*b) >> (FULL_BITS-1),
148	b_sign_ext = ((sfull)(*b)) >> (FULL_BITS-1),
149	p_sign_bit = a_sign_bit ^ b_sign_bit,
150	p_sign_ext = a_sign_ext ^ b_sign_ext,
151	tmp;
152	/* Conditional two's complement based on sign */
153	a = (a ^ a_sign_ext) + a_sign_bit;
154	b = (b ^ b_sign_ext) + b_sign_bit;
155	sc_wide_mul(a, b);
156	/* Likewise for the product, but extending to two words. C being
157	* brain-dead, we have to re-derive the carry bit; this trick from FFA
158	* (http://www.loper-os.org/?p=1913). */
159	a = a ^ p_sign_ext;
160	tmp = *a + p_sign_bit;
161	b = (b ^ p_sign_ext) + ((*a & ~tmp) >> (FULL_BITS-1));
162	*a = tmp;
163	}
164
165	/* Count leading zeros in a nonzero word, returning an integer from zero to
166	* BITS-1. (This implementation happens to treat zero the same as one, but this
167	* can't be assumed, especially for assembly versions.)
168	*
169	* 1. Let A be a 2^N-bit word rounded down to a power of two, isolating the
170	* most significant 1-bit.
171	* 2. Let D be a de Bruijn bit sequence of order N: a cyclic sequence of 2^N
172	* bits containing all possible N-bit subsequences.
173	* 3. Multiply D by A, thereby up-shifting it.
174	* 4. If at least the N-1 high bits of D are zero, the shift is equivalent to a
175	* rotation within the N high bits, so they form a perfect hash function
176	* over the possible values of A.
177	*
178	* Note that the table so constructed will be complete only if the hash
179	* function is indeed perfect; one need not understand why it works to verify
180	* that it does.
181	*
182	* See Leiserson, Prokop and Randall, "Using de Bruijn Sequences to Index a 1
183	* in a Computer Word", MIT Laboratory for Computer Science, 1998.
184	*
185	* As a further hack, some subset of de Bruijn sequences also work when
186	* rounding UP to the next power of two minus one, avoiding the need to clear
187	* the low bits. I'm not aware of a better way to find these than brute force.
188	*
189	* A program to generate the numbers is included at the end of this file. */
190	static int count_leading_zeros(full a) {
191	#if __SIZEOF_POINTER__ == 8
192	static const uint8_t tbl[64] = {
193	63, 52, 62, 51, 47, 34, 61, 50, 41, 46, 22, 38, 33, 15, 60, 2, 49, 43,
194	40, 45, 29, 27, 21, 37, 25, 32, 10, 19, 14, 7, 59, 1, 53, 48, 35, 42,
195	23, 39, 16, 3, 44, 30, 28, 26, 11, 20, 8, 54, 36, 24, 17, 4, 31, 12, 9,
196	55, 18, 5, 13, 56, 6, 57, 58, 0
197	};
198	a \|= a >> 1;
199	a \|= a >> 2;
200	a \|= a >> 4;
201	a \|= a >> 8;
202	a \|= a >> 16;
203	a \|= a >> 32;
204	return tbl[((full)0x03f08a4c6acb9dbd * a) >> 58];
205	#elif __SIZEOF_POINTER__ == 4
206	static const uint8_t tbl[32] = {
207	31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, 23, 19, 11,
208	3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
209	};
210	a \|= a >> 1;
211	a \|= a >> 2;
212	a \|= a >> 4;
213	a \|= a >> 8;
214	a \|= a >> 16;
215	return tbl[((full)0x07c4acdd * a) >> 27];
216	#else
217	#error Unsupported word size
218	#endif
219	}
220
221	/* Bit length of unsigned integer, aka 1-based index of most significant 1-bit,
222	* aka ceil(log2(a+1)). This formulation is well defined for all possible
223	* inputs. */
224	int sc_bit_length(full a) {
225	return a ? 8*sizeof a - count_leading_zeros(a) : 0;
226	}
227
228	void sc_div_extended(full a_lo, full a_hi, full b) {
229	/* Input: two-word dividend a, one-word divisor b
230	* Precondition: divisor greater than high word of dividend (ensuring
231	* quotient fits in one word)
232	* Output: quotient in a_hi, remainder in a_lo */
233	full q0, q1, a0 = LO(a_lo), a1 = HI(a_lo), b0 = LO(b), b1 = HI(b);
234	if (b1 == 0) {
235	q1 = *a_hi / b0;
236	a1 = ((a_hi << HALF_BITS) \| a1) - q1b0;
237	*a_lo = (a1 << HALF_BITS) \| a0;
238	q0 = *a_lo / b0;
239	a_lo -= q0b0;
240	}
241	else {
242	full a2 = LO(a_hi), a3 = HI(a_hi);
243	q1 = (a3 == b1) ? (half) -1 : *a_hi / b1;
244	/* TODO
245	while (q1*(b) > (a3 a2 a1)) --q1;
246	subtract q1(b) from (a3 a2 a1) /
247	q0 = (a2 == b1) ? (half) -1 : ((a2 << HALF_BITS) \| a1) / b1;
248	/*
249	while (q0*(b) > (a2 a1 a0) --q0;
250	subtract q0(b) from (a2 a1 a0) /
251	*a_lo = (a1 << HALF_BITS) \| a0;
252	}
253	*a_hi = q1 << HALF_BITS \| q0;
254	}
255
256	#ifdef SEARCH_PHF
257	/* Search for perfect hash functions based on de Bruijn sequences */
258
259	#include <stdio.h>
260	#include <stdint.h>
261
262	static void tab32(uint32_t d) {
263	int8_t tbl[32], i;
264	for (i=0; i<32; ++i) tbl[i] = -1;
265	for (i=0; i<32; ++i) {
266	int hash = (d * (((uint32_t)-1) >> i)) >> (32-5);
267	tbl[hash] = i;
268	}
269	printf("{");
270	for (i=0; i<32; ++i) printf("%d, ", tbl[i]);
271	printf("}\n");
272	}
273
274	static int test32(uint32_t d) {
275	uint32_t slots = 0;
276	int i;
277	/* Check for perfect hash function on powers of two minus one */
278	for (i=0; i<32; ++i) {
279	int hash = (d * (((uint32_t)-1) >> i)) >> (32-5);
280	uint32_t bit = ((uint32_t)1) << hash;
281	if (slots & bit) return 0; /* collision */
282	slots \|= bit;
283	}
284	/* For good measure, make sure it also works on powers of two (is a de
285	* Bruijn sequence) */
286	slots = 0;
287	for (i=0; i<32; ++i) {
288	int hash = (d * (((uint32_t)1) << i)) >> (32-5);
289	uint32_t bit = ((uint32_t)1) << hash;
290	if (slots & bit) return 0;
291	slots \|= bit;
292	}
293	printf("%x ", d);
294	tab32(d);
295	return 1;
296	}
297
298	static void tab64(uint64_t d) {
299	int8_t tbl[64], i;
300	for (i=0; i<64; ++i) tbl[i] = -1;
301	for (i=0; i<64; ++i) {
302	int hash = (d * (-1UL >> i)) >> (64-6);
303	tbl[hash] = i;
304	}
305	printf("{");
306	for (i=0; i<64; ++i) printf("%d, ", tbl[i]);
307	printf("}\n");
308	}
309
310	static int test64(uint64_t d) {
311	uint64_t slots = 0;
312	int i;
313	for (i=0; i<64; ++i) {
314	int hash = (d * (-1UL >> i)) >> (64-6);
315	uint64_t bit = 1UL << hash;
316	if (slots & bit) return 0;
317	slots \|= bit;
318	}
319	slots = 0;
320	for (i=0; i<64; ++i) {
321	int hash = (d * (1UL << i)) >> (64-6);
322	uint64_t bit = 1UL << hash;
323	if (slots & bit) return 0;
324	slots \|= bit;
325	}
326	printf("%lx ", d);
327	tab64(d);
328	return 1;
329	}
330
331	int main() {
332	uint64_t d;
333	for (d = 0; !test32(d); ++d) ;
334	/* XXX 19 magic internet bits remain in this prefix to make the brute force
335	* approach tractable. Smarter algorithm needed. */
336	for (d = 0x03f08a4000000000; !test64(d); ++d) ;
337	return 0;
338	}
339
340	#endif