#include #if __SIZEOF_POINTER__ == 4 typedef uint32_t full; typedef int32_t sfull; typedef uint16_t half; #elif __SIZEOF_POINTER__ == 8 typedef uint64_t full; typedef int64_t sfull; typedef uint32_t half; #else #error Unsupported word size #endif #define HALF_BITS (8*sizeof(half)) #define FULL_BITS (8*sizeof(full)) /* Low half of word, unsigned */ #define LO(n) ((full)((half)(n))) /* High half of word, unsigned */ #define HI(n) ((n)>>HALF_BITS) /* High half of word, sign extended (casting back to unsigned because overflow * on signed types is Undefined Behavior) */ #define S_HI(n) ((full)(((sfull)(n))>>HALF_BITS)) /** sc_wide_mul * * Computes the full (double-word) product of two unsigned words, returning the * low word through the first argument and the high word through the second. * * The algorithm works by decomposition into half-word products. To verify, * consider the variables arranged much as in schoolbook multiplication, each * "digit" here being a half-word: * * ah al * x bh bl * --------------- * c * p1h p1l * p2h p2l * p3h p3l * p4h p4l * \_____/ \_____/ * high / low outputs */ void sc_wide_mul(full *a, full *b) { full ah=HI(*a), al=LO(*a), bh=HI(*b), bl=LO(*b), p1=al*bl, p2=ah*bl, p3=al*bh, p4=ah*bh, /* no overflow */ t=LO(p2) + LO(p3), /* no overflow */ c=HI(HI(p1) + t); /* no overflow */ *a = p1 + (t<> (FULL_BITS-1), a_sign_ext = ((sfull)(*a)) >> (FULL_BITS-1), b_sign_bit = (*b) >> (FULL_BITS-1), b_sign_ext = ((sfull)(*b)) >> (FULL_BITS-1), p_sign_bit = a_sign_bit ^ b_sign_bit, p_sign_ext = a_sign_ext ^ b_sign_ext, tmp; /* Conditional two's complement based on sign */ *a = (*a ^ a_sign_ext) + a_sign_bit; *b = (*b ^ b_sign_ext) + b_sign_bit; sc_wide_mul(a, b); /* Likewise for the product, but extending to two words. C being * brain-dead, we have to re-derive the carry bit; this trick from FFA * (http://www.loper-os.org/?p=1913). */ *a = *a ^ p_sign_ext; tmp = *a + p_sign_bit; *b = (*b ^ p_sign_ext) + ((*a & ~tmp) >> (FULL_BITS-1)); *a = tmp; } /* Count leading zeros in a nonzero word, returning an integer from zero to * BITS-1. (This implementation happens to treat zero the same as one, but this * can't be assumed, especially for assembly versions.) * * 1. Let A be a 2^N-bit word rounded down to a power of two, isolating the * most significant 1-bit. * 2. Let D be a de Bruijn bit sequence of order N: a cyclic sequence of 2^N * bits containing all possible N-bit subsequences. * 3. Multiply D by A, thereby up-shifting it. * 4. If at least the N-1 high bits of D are zero, the shift is equivalent to a * rotation within the N high bits, so they form a perfect hash function * over the possible values of A. * * Note that the table so constructed will be complete only if the hash * function is indeed perfect; one need not understand why it works to verify * that it does. * * See Leiserson, Prokop and Randall, "Using de Bruijn Sequences to Index a 1 * in a Computer Word", MIT Laboratory for Computer Science, 1998. * * As a further hack, some subset of de Bruijn sequences also work when * rounding UP to the next power of two minus one, avoiding the need to clear * the low bits. I'm not aware of a better way to find these than brute force. * * A program to generate the numbers is included at the end of this file. */ static int count_leading_zeros(full a) { #if __SIZEOF_POINTER__ == 8 static const uint8_t tbl[64] = { 63, 52, 62, 51, 47, 34, 61, 50, 41, 46, 22, 38, 33, 15, 60, 2, 49, 43, 40, 45, 29, 27, 21, 37, 25, 32, 10, 19, 14, 7, 59, 1, 53, 48, 35, 42, 23, 39, 16, 3, 44, 30, 28, 26, 11, 20, 8, 54, 36, 24, 17, 4, 31, 12, 9, 55, 18, 5, 13, 56, 6, 57, 58, 0 }; a |= a >> 1; a |= a >> 2; a |= a >> 4; a |= a >> 8; a |= a >> 16; a |= a >> 32; return tbl[((full)0x03f08a4c6acb9dbd * a) >> 58]; #elif __SIZEOF_POINTER__ == 4 static const uint8_t tbl[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 }; a |= a >> 1; a |= a >> 2; a |= a >> 4; a |= a >> 8; a |= a >> 16; return tbl[((full)0x07c4acdd * a) >> 27]; #else #error Unsupported word size #endif } /* Bit length of unsigned integer, aka 1-based index of most significant 1-bit, * aka ceil(log2(a+1)). This formulation is well defined for all possible * inputs. */ int sc_bit_length(full a) { return a ? 8*sizeof a - count_leading_zeros(a) : 0; } void sc_div_extended(full *a_lo, full *a_hi, full b) { /* Input: two-word dividend a, one-word divisor b * Precondition: divisor greater than high word of dividend (ensuring * quotient fits in one word) * Output: quotient in a_hi, remainder in a_lo */ full q0, q1, a0 = LO(*a_lo), a1 = HI(*a_lo), b0 = LO(b), b1 = HI(b); if (b1 == 0) { q1 = *a_hi / b0; a1 = ((*a_hi << HALF_BITS) | a1) - q1*b0; *a_lo = (a1 << HALF_BITS) | a0; q0 = *a_lo / b0; *a_lo -= q0*b0; } else { full a2 = LO(*a_hi), a3 = HI(*a_hi); q1 = (a3 == b1) ? (half) -1 : *a_hi / b1; /* TODO while (q1*(b) > (a3 a2 a1)) --q1; subtract q1*(b) from (a3 a2 a1) */ q0 = (a2 == b1) ? (half) -1 : ((a2 << HALF_BITS) | a1) / b1; /* while (q0*(b) > (a2 a1 a0) --q0; subtract q0*(b) from (a2 a1 a0) */ *a_lo = (a1 << HALF_BITS) | a0; } *a_hi = q1 << HALF_BITS | q0; } #ifdef SEARCH_PHF /* Search for perfect hash functions based on de Bruijn sequences */ #include #include static void tab32(uint32_t d) { int8_t tbl[32], i; for (i=0; i<32; ++i) tbl[i] = -1; for (i=0; i<32; ++i) { int hash = (d * (((uint32_t)-1) >> i)) >> (32-5); tbl[hash] = i; } printf("{"); for (i=0; i<32; ++i) printf("%d, ", tbl[i]); printf("}\n"); } static int test32(uint32_t d) { uint32_t slots = 0; int i; /* Check for perfect hash function on powers of two minus one */ for (i=0; i<32; ++i) { int hash = (d * (((uint32_t)-1) >> i)) >> (32-5); uint32_t bit = ((uint32_t)1) << hash; if (slots & bit) return 0; /* collision */ slots |= bit; } /* For good measure, make sure it also works on powers of two (is a de * Bruijn sequence) */ slots = 0; for (i=0; i<32; ++i) { int hash = (d * (((uint32_t)1) << i)) >> (32-5); uint32_t bit = ((uint32_t)1) << hash; if (slots & bit) return 0; slots |= bit; } printf("%x ", d); tab32(d); return 1; } static void tab64(uint64_t d) { int8_t tbl[64], i; for (i=0; i<64; ++i) tbl[i] = -1; for (i=0; i<64; ++i) { int hash = (d * (-1UL >> i)) >> (64-6); tbl[hash] = i; } printf("{"); for (i=0; i<64; ++i) printf("%d, ", tbl[i]); printf("}\n"); } static int test64(uint64_t d) { uint64_t slots = 0; int i; for (i=0; i<64; ++i) { int hash = (d * (-1UL >> i)) >> (64-6); uint64_t bit = 1UL << hash; if (slots & bit) return 0; slots |= bit; } slots = 0; for (i=0; i<64; ++i) { int hash = (d * (1UL << i)) >> (64-6); uint64_t bit = 1UL << hash; if (slots & bit) return 0; slots |= bit; } printf("%lx ", d); tab64(d); return 1; } int main() { uint64_t d; for (d = 0; !test32(d); ++d) ; /* XXX 19 magic internet bits remain in this prefix to make the brute force * approach tractable. Smarter algorithm needed. */ for (d = 0x03f08a4000000000; !test64(d); ++d) ; return 0; } #endif