005 Hash functions and pseudo-random number generation

This notebook uses the low-level phylogenetic library biomcmc-lib (commit da29c5c).

Here I show two small tests, one with generation of an initial set of random numbers, and another on popcount() speeds.

testing the generation of a deterministic array of random numbers

In biomcmc-lib there are a few vectors with "random numbers" (some actually from random.org, some random prime numbers used in hash functions, etc.). The function biomcmc_salt_vector32_from_spice_table() will populate a vector with these number in an order specified by the particular seed[4] values.

//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build.191216/biomcmc-lib/lib
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build.191216/biomcmc-lib/lib/.libs/libbiomcmc.a
//%cflags:-lm
#include <biomcmc.h>

int
main (int argc, char **argv)
{
    uint32_t i,j;
    uint32_t seeds[]={0,1,2,3}, vec[32], nvec=34;
    uint8_t *c;
    size_t size = sizeof (vec);
    
    for (i=0;i<2;i++) {
        seeds[0] = i;
        for (j=0;j<4;j++) printf ("%8x ",seeds[j]);
        printf ("original seeds\n");
        biomcmc_salt_vector32_from_spice_table (vec, nvec, seeds);
        printf("\nseed: %u\n", i);
        //for (j=0; j<nvec;j++) {printf("%12u ", vec[j]); if (!((j+1)%8)) printf ("\n"); }
        c = (uint8_t*) vec;
        size = sizeof (vec);
        for (; (size > 0); c++, size--) {printf ("%4x ", *c); if (!((size-1)%16)) printf ("\n"); }
    }
    for (i=0;i<15;i++) {
        for (j=0;j<4;j++) printf ("%8x ",biomcmc_hashint_salted (j, i));
        printf ("<< for hash %u\n", i);
    }
        
    return EXIT_SUCCESS;
}
       0        1        2        3 original seeds

seed: 0
  15   da   f0   20   87   60   7f   24   ff   f0   9c   e2   87   7e    1    0 
  ff   ff   4f   46   ff   ff   27   23   d8   9a   b1   55   99    2   99   d7 
  85    5    c   d0   a6    f   a6   c4   9a   99   99   99   52   5a    7   8c 
  ae   11   92   3d   8f   e7    2    0   1d   60    0    0   1d   60    0    0 
  ed   6e   1c    1   fb   a5   71    7    9   f8   e6   5e   7b    a   9f    7 
  ff   ff   4b   59   ff   ff   f3   32   d7    5   f2   28   b7   27   3d   c8 
  fa   9e   6e   d6   4f   64   3f   6e   5c   52   ba   39   34   f8   d7   3a 
  2c   fc   1f   a0   46   7e    9    0   71   f7    2    0   1d   60    0    0 
       1 b621b33c 9cc8151d 5ab560c5 original seeds

seed: 1
  55   93   67   35   87   60   7f   24   2e   d1   9d   e2   7b   b3   31   35 
  ff   ff   7f   4c   ff   ff   da   33   fa   2f   b9   34   4d   26   48   46 
  fa   9e   6e   d6    e   8d   25   79   9f   da   f7   50   ee   a6   50   e1 
  e1   36   ce   6d   68   19   10    0    2   90   fd   2d   55   bd    4    0 
  89    a   50   1a    5   9e   41   2e   63   d1   e7   5e   eb   25   d4   19 
  ff   ff   92   6c   ff   ff   b9   30   c2   98   28   91   66   a9   5a   6f 
  fa   9e   6e   d6   30   6c   db   ea   99   3d   cc   5a   cf   a6   fc   ed 
  c0   13   ac   84   6a    0    9    0   a2   47   42   1a   f8   2b    1    0 
c0a9496a 27922c9d c6793575 87d06fbe << for hash 0
6b4ed927 b48681b6 e267b84c 4f6e0e9c << for hash 1
caa3caa3 12d60bf6 25ac1fe5 3882835c << for hash 2
    d0dc   2a6ee1   55234f   7f181c << for hash 3
deb66b58 deb66ab9 deb66a9a deb66afb << for hash 4
2ba588a6 2ba58337 2ba51840 2ba510d1 << for hash 5
acefdd39 ec26e4d2 75fe3d8c ce4259f1 << for hash 6
4636b9c9 62baf5a0 ff4d1170 2bf062cf << for hash 7
b7255c83 a63ec54b 2a674713 c0186ef5 << for hash 8
ad92d5f0 8e953d2f 8f005635 cb773d1b << for hash 9
e6546b64 e6546b64 e6546b64 e6546b64 << for hash 10
c0a9496a 27922c9d c6793575 87d06fbe << for hash 11
c0a9496a 27922c9d c6793575 87d06fbe << for hash 12
c0a9496a 27922c9d c6793575 87d06fbe << for hash 13
c0a9496a 27922c9d c6793575 87d06fbe << for hash 14

Counting the number of active bits (pop_count())

I have recently implemented a few faster versions of 64 bits popcounts, and below I compare them with my first implementation — in the context of "bipartitions", which are vectors of bitstrings.

First I check if they are calculating the right thing (number of "ones" in a vector of 64 bit integers). And then I see which is faster. I expected pop0() to be slowest (it is linear on the number of active bits), and the others to be similar. Which is what we observe. The one I've chosen to be the new "default" on biomcmc-lib is pop1(), which I've seen in Jue Ruan's RedBean and also in novoBreak. The previous default was the i &= i -1 trick described in K & R. The other two are from Andrew Dalke.

//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build.191216/biomcmc-lib/lib
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build.191216/biomcmc-lib/lib/.libs/libbiomcmc.a
//%cflags:-lm
#include <biomcmc.h>

int
main (int argc, char **argv)
{
    uint32_t i,j, n_iter = 10000;
    clock_t time0, time1;
    bipartition bp = new_bipartition (70);
    biomcmc_random_number_init (0ULL);
    printf ("STEP 1 : check that all pop counts are correct\n");
    for (i=0; i<4; i++) {
        for (j=0; j < bp->n->ints; j++) {
            bp->bs[j] = biomcmc_rng_get ();
        }
        bipartition_print_to_stdout (bp);
        printf ("\t%4d %4d %4d %4d\n", 
                bipartition_count_n_ones_pop0(bp),
                bipartition_count_n_ones_pop1(bp),
                bipartition_count_n_ones_pop2(bp),
                bipartition_count_n_ones_pop3(bp));
    }
    del_bipartition (bp); bp = new_bipartition (50000); // bigger bipartition
    printf ("STEP 2: time pop counts (pop1 is default, and we call through a wrapper to discount this overhead)\n");
    time0 = clock ();
    for (i=0; i < n_iter; i++) {
        for (j=0; j < bp->n->ints; j++) bp->bs[j] = biomcmc_rng_get ();
        bipartition_count_n_ones (bp);
    }
    time1 = clock (); fprintf (stderr, "pop1: %.8f secs\n", (double)(time1-time0)/(double)CLOCKS_PER_SEC);
    time0 = time1;
    for (i=0; i < n_iter; i++) {
        for (j=0; j < bp->n->ints; j++) bp->bs[j] = biomcmc_rng_get ();
        bipartition_count_n_ones_pop0 (bp);
    }
    time1 = clock (); fprintf (stderr, "pop0: %.8f secs\n", (double)(time1-time0)/(double)CLOCKS_PER_SEC);
    time0 = time1;
    
    for (i=0; i < n_iter; i++) {
        for (j=0; j < bp->n->ints; j++) bp->bs[j] = biomcmc_rng_get ();
        bipartition_count_n_ones_pop2 (bp);
    }
    time1 = clock (); fprintf (stderr, "pop2: %.8f secs\n", (double)(time1-time0)/(double)CLOCKS_PER_SEC);
    time0 = time1;
    
    for (i=0; i < n_iter; i++) {
        for (j=0; j < bp->n->ints; j++) bp->bs[j] = biomcmc_rng_get ();
        bipartition_count_n_ones_pop3 (bp);
    }
    time1 = clock (); fprintf (stderr, "pop3: %.8f secs\n", (double)(time1-time0)/(double)CLOCKS_PER_SEC);
    time0 = time1;
    
    del_bipartition (bp);
    biomcmc_random_number_finalize ();
    return EXIT_SUCCESS;
}
STEP 1 : check that all pop counts are correct
1111101010010110110000001011111000010001001000010111110100110101.001010[0] 	  34   34   34   34
1011001001011101101010010000011101001100100001111101011110110010.011010[34] 	  36   36   36   36
0000101001000100011000111111100000011010001001000110110100001110.011010[36] 	  29   29   29   29
1010001100100100100001000000010101111111101000011110111110001100.100100[29] 	  32   32   32   32
STEP 2: time pop counts (pop1 is default, and we call through a wrapper to discount this overhead)
pop1: 0.19077800 secs
pop0: 0.37243400 secs
pop2: 0.20049800 secs
pop3: 0.19488300 secs