#include <stdint.h>

#if __SIZEOF_POINTER__ == 4
typedef uint32_t full;
typedef int32_t sfull;
typedef uint16_t half;

#elif __SIZEOF_POINTER__ == 8
typedef uint64_t full;
typedef int64_t sfull;
typedef uint32_t half;

#else
#error Unsupported word size
#endif

#define HALF_BITS (8*sizeof(half))
#define FULL_BITS (8*sizeof(full))

/* Low half of word, unsigned */
#define LO(n) ((full)((half)(n)))
/* High half of word, unsigned */
#define HI(n) ((n)>>HALF_BITS)
/* High half of word, sign extended (casting back to unsigned because overflow
 * on signed types is Undefined Behavior) */
#define S_HI(n) ((full)(((sfull)(n))>>HALF_BITS))

/** sc_wide_mul
 *
 * Computes the full (double-word) product of two unsigned words, returning the
 * low word through the first argument and the high word through the second.
 *
 * The algorithm works by decomposition into half-word products. To verify,
 * consider the variables arranged much as in schoolbook multiplication, each
 * "digit" here being a half-word:
 *
 *          ah  al
 *       x  bh  bl
 * ---------------
 *       c
 *         p1h p1l
 *     p2h p2l
 *     p3h p3l
 * p4h p4l
 * \_____/ \_____/
 *  high  /  low   outputs
 */

void sc_wide_mul(full *a, full *b) {
    full ah=HI(*a), al=LO(*a), bh=HI(*b), bl=LO(*b),
         p1=al*bl, p2=ah*bl, p3=al*bh, p4=ah*bh, /* no overflow */
         t=LO(p2) + LO(p3),                      /* no overflow */
         c=HI(HI(p1) + t);                       /* no overflow */
    *a = p1 + (t<<HALF_BITS);                    /* overflow captured by c */
    *b = c + HI(p2) + HI(p3) + p4;
    /* That the overall multiplication can't overflow should be proof enough
     * that the last sum doesn't, but just in case:
     * Let r = 2^HALF_BITS; the sum must be less than r^2.
     * The upper bound of each partial product is
     *   (r - 1)^2 = r^2 - 2r + 1
     * The upper bound of p2h or p3h is
     *   floor((r^2 - 2r + 1) / r) = r - 2
     * And the upper bound of c is 2, so the sum is bounded by
     *   2 + 2(r - 2) + (r^2 - 2r + 1)
     *   = r^2 + 2 - 4 + 1
     *   = r^2 - 1
     */
}

/** sc_wide_mul_signed
 *
 * Computes the full (double-word) product of two signed words in two's
 * complement, returning the low word through the first argument and the high
 * word through the second.
 *
 * A trivial modification of sc_wide_mult, but the proof may not be obvious.
 *
 * Lemma 1: AB mod r = (jr + A)(kr + B) mod r for any integers j and k
 *   since r divides (jkr^2 + jrB + krA + AB) - AB.
 *
 * By Lemma 1, the single word product (AB mod r) generalizes to negative
 * numbers in two's complement representation, since the representation differs
 * from the number by a multiple of r.
 *
 * The full product (AB) can thus be obtained by sign-extending A and B to
 * double-words and taking the unsigned product mod r^2, that is, discarding
 * overflow. The trick is skipping computations that don't affect the result.
 *
 *     ax  ax  ah  al
 *  x  bx  bx  bh  bl
 * ------------------
 *          c
 *            p1h p1l <- al*bl
 *        p2h p2l     <- ah*bl
 *    p5h p5l         <- ax*bl
 *    p5l             <- ax*bl
 *        p3h p3l     <- al*bh
 *    p4h p4l         <- ah*bh
 *    p6l             <- ax*bh
 *    p7h p7l         <- al*bx
 *    p8l             <- ah*bx
 *    p7l             <- al*bx
 *
 * XXX unfinished
 *
 * Sign extension of a from r to r^2:
 *   a            if 0 <= a < r/2
 *   r^2 - r + a  if r/2 <= a < r
 *
 * Looking for sx(a) * sx(b) mod r^2
 *
 *   ab              if 0 <= a < r/2 and 0 <= b < r/2
 *   a(r^2 - r + b)  if 0 <= a < r/2 and r/2 <= b < r
 *   (r^2 - r + a)b  if r/2 <= a < r and 0 <= b < r/2
 *   (r^2 - r + a)(r^2 - r + a)  if r/2 <= a < r and r/2 <= b < r
 *
 * a or r(r - 1) + a === a - r (mod r^2)
 *
 * Let A = (r*ah + al)
 * A === (r^2 + r*ah + al) mod r^2
 *   = r(r + ah) + al
 * B = r(r + bh) + bl
 *
 * AB = (r(r + ah) + al)(r(r + bh) + bl)
 * AB = (r*ah + al)(r*bh + bl)
 *    = r^2*ah*bh + r(ah*bl + bh*al) + al*bl
 *  === r(ah*bl + bh*al) + al*bl (mod r^2)
 *  === r((ah + r)*bl + (bh + r)*al) + al*bl (mod r^2)
 *
 *  === r^2*(ah + r)*(bh + r) + r*(ah + r)*bl + r*(bh + r)*al + al*bl
 */

void sc_wide_mul_signed(full *a, full *b) {
    /*
    full ah=S_HI(*a), al=LO(*a), bh=S_HI(*b), bl=LO(*b),
         p1=al*bl, p2=ah*bl, p3=al*bh, p4=ah*bh,
         t=LO(p2) + LO(p3),
         c=HI(HI(p1) + t);
    *a = p1 + (t<<HALF_BITS);
    *b = c + S_HI(p2) + S_HI(p3) + p4;

    Since my proof crashed and burned, a longer but more obvious alternative:
    multiply unsigned magnitudes and negate if input signs differ. Still
    branch-free. */
    full a_sign_bit = (*a) >> (FULL_BITS-1),
         a_sign_ext = ((sfull)(*a)) >> (FULL_BITS-1),
         b_sign_bit = (*b) >> (FULL_BITS-1),
         b_sign_ext = ((sfull)(*b)) >> (FULL_BITS-1),
         p_sign_bit = a_sign_bit ^ b_sign_bit,
         p_sign_ext = a_sign_ext ^ b_sign_ext,
         tmp;
    /* Conditional two's complement based on sign */
    *a = (*a ^ a_sign_ext) + a_sign_bit;
    *b = (*b ^ b_sign_ext) + b_sign_bit;
    sc_wide_mul(a, b);
    /* Likewise for the product, but extending to two words. C being
     * brain-dead, we have to re-derive the carry bit; this trick from FFA
     * (http://www.loper-os.org/?p=1913). */
    *a = *a ^ p_sign_ext;
    tmp = *a + p_sign_bit;
    *b = (*b ^ p_sign_ext) + ((*a & ~tmp) >> (FULL_BITS-1));
    *a = tmp;
}

/* Count leading zeros in a nonzero word, returning an integer from zero to
 * BITS-1. (This implementation happens to treat zero the same as one, but this
 * can't be assumed, especially for assembly versions.)
 *
 * 1. Let A be a 2^N-bit word rounded down to a power of two, isolating the
 *    most significant 1-bit.
 * 2. Let D be a de Bruijn bit sequence of order N: a cyclic sequence of 2^N
 *    bits containing all possible N-bit subsequences.
 * 3. Multiply D by A, thereby up-shifting it.
 * 4. If at least the N-1 high bits of D are zero, the shift is equivalent to a
 *    rotation within the N high bits, so they form a perfect hash function
 *    over the possible values of A.
 *
 * Note that the table so constructed will be complete only if the hash
 * function is indeed perfect; one need not understand why it works to verify
 * that it does.
 *
 * See Leiserson, Prokop and Randall, "Using de Bruijn Sequences to Index a 1
 * in a Computer Word", MIT Laboratory for Computer Science, 1998.
 *
 * As a further hack, some subset of de Bruijn sequences also work when
 * rounding UP to the next power of two minus one, avoiding the need to clear
 * the low bits. I'm not aware of a better way to find these than brute force.
 *
 * A program to generate the numbers is included at the end of this file. */
static int count_leading_zeros(full a) {
#if __SIZEOF_POINTER__ == 8
    static const uint8_t tbl[64] = {
        63, 52, 62, 51, 47, 34, 61, 50, 41, 46, 22, 38, 33, 15, 60, 2, 49, 43,
        40, 45, 29, 27, 21, 37, 25, 32, 10, 19, 14, 7, 59, 1, 53, 48, 35, 42,
        23, 39, 16, 3, 44, 30, 28, 26, 11, 20, 8, 54, 36, 24, 17, 4, 31, 12, 9,
        55, 18, 5, 13, 56, 6, 57, 58, 0
    };
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4;
    a |= a >> 8;
    a |= a >> 16;
    a |= a >> 32;
    return tbl[((full)0x03f08a4c6acb9dbd * a) >> 58];
#elif __SIZEOF_POINTER__ == 4
    static const uint8_t tbl[32] = {
        31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, 23, 19, 11,
        3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
    };
    a |= a >> 1;
    a |= a >> 2;
    a |= a >> 4;
    a |= a >> 8;
    a |= a >> 16;
    return tbl[((full)0x07c4acdd * a) >> 27];
#else
#error Unsupported word size
#endif
}

/* Bit length of unsigned integer, aka 1-based index of most significant 1-bit,
 * aka ceil(log2(a+1)). This formulation is well defined for all possible
 * inputs. */
int sc_bit_length(full a) {
    return a ? 8*sizeof a - count_leading_zeros(a) : 0;
}

void sc_div_extended(full *a_lo, full *a_hi, full b) {
    /* Input: two-word dividend a, one-word divisor b
     * Precondition: divisor greater than high word of dividend (ensuring
     *               quotient fits in one word)
     * Output: quotient in a_hi, remainder in a_lo */
    full q0, q1, a0 = LO(*a_lo), a1 = HI(*a_lo), b0 = LO(b), b1 = HI(b);
    if (b1 == 0) {
        q1 = *a_hi / b0;
        a1 = ((*a_hi << HALF_BITS) | a1) - q1*b0;
        *a_lo = (a1 << HALF_BITS) | a0;
        q0 = *a_lo / b0;
        *a_lo -= q0*b0;
    }
    else {
        full a2 = LO(*a_hi), a3 = HI(*a_hi);
        q1 = (a3 == b1) ? (half) -1 : *a_hi / b1;
        /* TODO
        while (q1*(b) > (a3 a2 a1)) --q1;
        subtract q1*(b) from (a3 a2 a1) */
        q0 = (a2 == b1) ? (half) -1 : ((a2 << HALF_BITS) | a1) / b1;
        /*
        while (q0*(b) > (a2 a1 a0) --q0;
        subtract q0*(b) from (a2 a1 a0) */
        *a_lo = (a1 << HALF_BITS) | a0;
    }
    *a_hi = q1 << HALF_BITS | q0;
}

#ifdef SEARCH_PHF
/* Search for perfect hash functions based on de Bruijn sequences */

#include <stdio.h>
#include <stdint.h>

static void tab32(uint32_t d) {
    int8_t tbl[32], i;
    for (i=0; i<32; ++i) tbl[i] = -1;
    for (i=0; i<32; ++i) {
        int hash = (d * (((uint32_t)-1) >> i)) >> (32-5);
        tbl[hash] = i;
    }
    printf("{");
    for (i=0; i<32; ++i) printf("%d, ", tbl[i]);
    printf("}\n");
}

static int test32(uint32_t d) {
    uint32_t slots = 0;
    int i;
    /* Check for perfect hash function on powers of two minus one */
    for (i=0; i<32; ++i) {
        int hash = (d * (((uint32_t)-1) >> i)) >> (32-5);
        uint32_t bit = ((uint32_t)1) << hash;
        if (slots & bit) return 0; /* collision */
        slots |= bit;
    }
    /* For good measure, make sure it also works on powers of two (is a de
     * Bruijn sequence) */
    slots = 0;
    for (i=0; i<32; ++i) {
        int hash = (d * (((uint32_t)1) << i)) >> (32-5);
        uint32_t bit = ((uint32_t)1) << hash;
        if (slots & bit) return 0;
        slots |= bit;
    }
    printf("%x ", d);
    tab32(d);
    return 1;
}

static void tab64(uint64_t d) {
    int8_t tbl[64], i;
    for (i=0; i<64; ++i) tbl[i] = -1;
    for (i=0; i<64; ++i) {
        int hash = (d * (-1UL >> i)) >> (64-6);
        tbl[hash] = i;
    }
    printf("{");
    for (i=0; i<64; ++i) printf("%d, ", tbl[i]);
    printf("}\n");
}

static int test64(uint64_t d) {
    uint64_t slots = 0;
    int i;
    for (i=0; i<64; ++i) {
        int hash = (d * (-1UL >> i)) >> (64-6);
        uint64_t bit = 1UL << hash;
        if (slots & bit) return 0;
        slots |= bit;
    }
    slots = 0;
    for (i=0; i<64; ++i) {
        int hash = (d * (1UL << i)) >> (64-6);
        uint64_t bit = 1UL << hash;
        if (slots & bit) return 0;
        slots |= bit;
    }
    printf("%lx ", d);
    tab64(d);
    return 1;
}

int main() {
    uint64_t d;
    for (d = 0; !test32(d); ++d) ;
    /* XXX 19 magic internet bits remain in this prefix to make the brute force
     * approach tractable. Smarter algorithm needed. */
    for (d = 0x03f08a4000000000; !test64(d); ++d) ;
    return 0;
}

#endif