Minimal example of a rolling hash
Description of a current implementation of a rolling hash in the low-level phylogenetic library biomcmc-lib
003 Minimal example of a rolling hash
This notebook uses the low-level phylogenetic library biomcmc-lib (commit 5975331).
The rolling hash (or expoential hash) is a technique to slide through a sequence while updating its hash value using only information from the current hash and the next value. One of the most popular algorithms is the Rabin-Karp algorithm. Below I start with the imnplementation in linclust/MMseqs2
In DNA context, each base is represented by a random number. In MMseqs they work with a reduced amino acid alphabet of 13 groups
Prototype
Here I describe a prototype (future capability of the library), not implemented yet. I am testing the rolling hash function but also shorter types uint8_t
etc. The idea is that each of the testing functionalities will be incorporated into biomcmc-lib
.
Below is a working version that follows closely minclust
.
//%cflags:-lm
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib/.libs/libbiomcmc.a
#include <biomcmc.h>
uint16_t RAND[21] = {0x4567, 0x23c6, 0x9869, 0x4873, 0xdc51, 0x5cff, 0x944a, 0x58ec,
0x1f29, 0x7ccd, 0x58ba, 0xd7ab, 0x41f2, 0x1efb, 0xa9e3, 0xe146,
0x007c, 0x62c2, 0x0854, 0x27f8, 0x231b};// 16 bit random numbers
#define RoL(val, numbits) (val << numbits) ^ (val >> (32 - numbits))
// Transform each letter x[i] to a fixed random number RAND[x[i]] to ensure instantaneous mixing into the 16 bits
// Do XOR with RAND[x[i]] and 5-bit rotate left for each i from 1 to k
uint32_t circ_hash (const uint8_t * x, uint8_t length, const uint8_t rol){
uint16_t h = 0x0;
h = h ^ RAND[x[0]]; // XOR h and ki
for (uint8_t i = 1; i < length; ++i){
h = RoL(h, rol);
h ^= RAND[x[i]]; // XOR h and ki
}
return h;
}
// Rolling hash for CRC hash variant: Computes hash value for next key x[0:length-1] from previous value
// hash( x[-1:length-2] ) and x_first = x[-1]
uint32_t circ_hash_next (const uint8_t * x, uint8_t length, uint8_t x_first, uint16_t h, const uint8_t rol){
// undo INITIAL_VALUE and first letter x[0] of old key
h ^= RoL(RAND[x_first], (5*(length-1)) % 16);// circularly permute all letters x[1:length-1] to 5 positions to left
// h ^= RoL(RAND[x_first], (5*(length-1)) & 15); // since x%y = x & (y-1)
h = RoL(h, rol);// add new, last letter of new key x[1:length]
h ^= RAND[x[length-1]];
return h;
}
int main (){
uint8_t i, j;
uint8_t my_seq1[] = {1,2,3,1,2, 3,1,2,3,1, 1,1,1,1,1, 1,2,1,2,3, 1,2,1,2,1, 2,1,2}; // n=28
uint8_t my_seq2[] = {1,2,3,4,5, 2,3,4,5,2, 3,4,5,1,1, 4,5,2,5,2, 3,4,3,2,3, 2,3,2};
uint16_t my_h1 = circ_hash((const uint8_t*) my_seq1, 3, 5);
uint16_t my_h2 = circ_hash((const uint8_t *) my_seq2, 3, 7);
for (j = 0; j < 25; j++) {
printf ("%3d >> ", j);
for (i=j;i<j+3;i++) printf ("%d ", my_seq1[i]);
printf (" \t %8u \t >> ",my_h1); // show current kmer and hash
for (i=j;i<j+3;i++) printf ("%d ", my_seq2[i]);
printf (" \t %8u\n",my_h2); // show current kmer and hash
my_h1 = circ_hash_next ((const uint8_t *) my_seq1 + j + 1, 3, my_seq1[j], my_h1, 5);
my_h2 = circ_hash_next ((const uint8_t *) my_seq2 + j + 1, 3, my_seq2[j], my_h2, 7);
}
}
The code above assumes a 16-bits hash, cast to 32 bits (due to the RoL
macro). Below we use all 32 bits of hash, and also use the fact that for non-negative values, x % y == x & (y-1)
which may be faster.
//%cflags:-lm
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib/.libs/libbiomcmc.a
#include <biomcmc.h>
uint32_t RAND[21] = {0x4567, 0x23c6, 0x9869, 0x4873, 0xdc51, 0x5cff, 0x944a, 0x58ec,
0x1f29, 0x7ccd, 0x58ba, 0xd7ab, 0x41f2, 0x1efb, 0xa9e3, 0xe146,
0x007c, 0x62c2, 0x0854, 0x27f8, 0x231b};
#define RoL(val, numbits) (val << numbits) ^ (val >> (32 - numbits))
uint32_t circ_hash (const uint8_t * x, uint8_t length, const uint8_t rol){
uint32_t h = 0x0;
h = h ^ RAND[x[0]]; // XOR h and ki
for (uint8_t i = 1; i < length; ++i){
h = RoL(h, rol);
h ^= RAND[x[i]]; // XOR h and ki
}
return h;
}
uint32_t circ_hash_next (const uint8_t * x, uint8_t length, uint8_t x_first, uint32_t h, const uint8_t rol){
uint8_t remain = (rol * (length-1)) & 31U; // since x % y = x & (y-1)
// undo INITIAL_VALUE and first letter x[0] of old key
//h ^= RoL(RAND[x_first], (rol * (length-1)) % 32);// circularly permute all letters x[1:length-1] to 5 positions to left
h ^= RoL(RAND[x_first], remain);
h = RoL(h, rol);// add new, last letter of new key x[1:length]
h ^= RAND[x[length-1]];
return h;
}
int main (){
uint8_t i, j;
uint8_t my_seq1[] = {1,2,3,1,2, 3,1,2,3,1, 1,1,1,1,1, 1,2,1,2,3, 1,2,1,2,1, 2,1,2}; // n=28
uint8_t my_seq2[] = {1,2,3,4,5, 2,3,4,5,2, 3,4,5,1,1, 4,5,2,5,2, 3,4,3,2,3, 2,3,2};
uint32_t my_h1 = circ_hash((const uint8_t *) my_seq1, 4, 5);
uint32_t my_h2 = circ_hash((const uint8_t *) my_seq2, 4, 7);
for (j = 0; j < 25; j++) {
printf ("%3d >> ", j);
for (i=j;i<j+4;i++) printf ("%d ", my_seq1[i]);
printf (" %8u \t >> ",my_h1); // show current kmer and hash
for (i=j;i<j+4;i++) printf ("%d ", my_seq2[i]);
printf ("\t%8u\n",my_h2); // show current kmer and hash
my_h1 = circ_hash_next ((const uint8_t *) my_seq1 + j + 1, 4, my_seq1[j], my_h1, 5);
my_h2 = circ_hash_next ((const uint8_t *) my_seq2 + j + 1, 4, my_seq2[j], my_h2, 7);
}
}
/* Short interlude to play with table indices :D */
#include <stdio.h>
int main() {
int t1 = 16, t2 = 4, i1, i2, salt;
for (salt = 0; salt < 72; salt++) {
i1 = (salt & ((t1 >> 2) - 1)) << 2;
i2 = (int)(salt / (t1 >> 2))%t2;
printf ("%3d %3d %3d| ", salt, i1, i2);
if (((salt+1)%8)==0) printf ("\n");
}
}
transforming DNA bases into hash values
Above, we have a single hash value RAND[]
for each base, but we can create a stream of values.
- we define a translation table from DNA to $0...3$ for non-ambiguous data (or some other encoding). Notice that we must know its reverse complement.
- hash values inspired by
RAND[]
are then created. They are "salted" with large primes.
The version currently implemented in biomcmc-lib
is, for non-ambiguous DNA:
uint8_t dna_in_2_bits[256][2] = ;
void
initialize_dna_to_bit_tables (void) {
int i;
for (i = 0; i < 256; i++) dna_in_2_bits[i][0] = dna_in_2_bits[i][1] = 4; // calling function must check if < 4
dna_in_2_bits['A'][0] = 0; dna_in_2_bits['A'][1] = 3; /* A <-> T */
dna_in_2_bits['C'][0] = 1; dna_in_2_bits['C'][1] = 2; /* C <-> G */
dna_in_2_bits['G'][0] = 2; dna_in_2_bits['G'][1] = 1; /* G <-> C */
dna_in_2_bits['T'][0] = 3; dna_in_2_bits['T'][1] = 0; /* T <-> A */
dna_in_2_bits['U'][0] = 3; dna_in_2_bits['U'][1] = 0; /* U <-> A */
}
It just returns a small number, that is concatenated to form the k-mer. I.e. a sequence ACGGT
becomes an integer with binary 00 01 10 10 01
.
Below is the tentative code, closer to my final implementation (notice that it doesn't compress homopolymers). I also implemented the reverse-complement rolling hash (implemented using a different algorithm in ntHash).
//%cflags:-lm
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib/.libs/libbiomcmc.a
#include <biomcmc.h>
const char my_dna_seq[] =
"AAACCACCTCCCGGTGGTTT" // first 8 and last 8 are revcompl (AAACC<->GGTTT)
"AAACCACCTCCCGGTGGTTT"
"GTTCTTAACATTTCTCGTAC"
"GTTCTTAACATTTCTCGTAC";
uint32_t rand_hash_list[] = {0x4567, 0x23c6, 0x9869, 0x4873, 0xdc51}; // size = 5
uint32_t prime_salt_list[] = { // size = 64
0x343EAF9F, 0x75BD32B, 0x5E1C5A87, 0x343EFDAF, 0x1FDCDBEF, 0x6389CB, 0x1FDCE507, 0x1FDCE3C7,
0x1FDCE15F, 0x75BD431, 0x34C8B0F, 0x5397FEF, 0x87413, 0x6389C9, 0x34C8B23, 0x343EFDB5,
0x75BD479, 0x5398013, 0x5397FCD, 0x87407, 0x75BD413, 0x343EAF4B, 0x343EB047, 0x6389CF,
0x5E1C5A2F, 0x343EFD31, 0x34C8B59, 0x343EFD57, 0x1FDCDC19, 0x5E1C5B0D, 0x34C8B5D, 0x343EB035,
0x75BD307, 0x343EAF7B, 0x343EAEF5, 0x75BD433, 0x1FDCDBD5, 0x1FDCE3B7, 0x343EFD19, 0x5398057,
0x34C8B51, 0x638971, 0x1FDCE16F, 0x63896F, 0x343EB04D, 0x873FD, 0x63896B, 0x1FDCDC9B,
0x5E1C5B1F, 0x343EFCEB, 0x5398007, 0x1FDCE51F, 0x63897B, 0x5E1C5B17, 0x5E1C5ABD, 0xFF85,
0x1FDCDC25, 0x873EB, 0x5E1C5AB1, 0x75BD30D, 0x1FDCDCC1, 0xFF8B, 0x5397FC1, 0x5398001};
uint32_t**
new_dna_salted_hash_encoding (uint8_t salt) {
uint8_t i=255, j;
uint32_t **shash = (uint32_t**) biomcmc_malloc (2 * sizeof (uint32_t*)); // opposite order as dna_in_2_bits[]
for (i = 0; i < 2; ++i) shash[i] = (uint32_t*) biomcmc_malloc (256 * sizeof (uint32_t));
/* notice do{}while() instead of for() since i is always < 256 */
do { shash[0][i] = shash[1][i] = 4;} while (i--); // anything else is fifth state
shash[0]['A'] = shash[0]['a'] = 0; shash[1]['A'] = shash[1]['a'] = 3; /* A <-> T */
shash[0]['C'] = shash[0]['c'] = 1; shash[1]['C'] = shash[1]['c'] = 2; /* C <-> G */
shash[0]['G'] = shash[0]['g'] = 2; shash[1]['G'] = shash[1]['g'] = 1; /* G <-> C */
shash[0]['T'] = shash[0]['t'] = 3; shash[1]['T'] = shash[1]['t'] = 0; /* T <-> A */
shash[0]['U'] = shash[0]['u'] = 3; shash[1]['U'] = shash[1]['u'] = 0; /* U <-> A */
salt &= 63; // modulus, we only have 64 salts
/** now we transform the indexes for their equiv. hash values; all have same salt */
for (j = 0; j < 2; ++j) {
i = 255; do { shash[j][i] = rand_hash_list[shash[j][i]] + prime_salt_list[salt]; } while (i--);
}
return shash;
}
void
del_dna_salted_hash_encoding (uint32_t** shash) {
if (!shash) return;
if (shash[1]) free (shash[1]);
if (shash[0]) free (shash[0]);
free (shash);
}
#define RoL(val, numbits) ((val) << (numbits)) | ((val) >> (32 - (numbits)))
#define RoR(val, numbits) ((val) >> (numbits)) | ((val) << (32 - (numbits)))
void
roll_hash_add (uint32_t *h, const char dna_base, const uint8_t rol, const uint32_t* shashcode)
{
*h = RoL(*h, rol);
*h ^= shashcode[(uint8_t) dna_base]; // XOR h and ki
}
void // kmer_size can't be 0 or 32
roll_hash_replace_f (uint32_t *h, const char old_base, const char new_base, const uint8_t kmer_size,
const uint8_t rol, const uint32_t* shashcode)
{
uint8_t remain = (rol * (kmer_size-1)) & 31U; // since x % y = x & (y-1) // this can be calculated outside
*h ^= RoL(shashcode[(uint8_t) old_base], remain); // remove "leftmost" base
*h = RoL(*h, rol);
*h ^= shashcode[(uint8_t) new_base];
}
void // kmer_size can't be 0 or 32
roll_hash_replace_r (uint32_t *h, const char old_base, const char new_base, const uint8_t kmer_size,
const uint8_t rol, const uint32_t* shashcode)
{
uint8_t remain = (rol * (kmer_size-1)) & 31U; // since x % y = x & (y-1) // this can be calculated outside
*h ^= shashcode[(uint8_t) old_base];
*h = RoR(*h, rol);
*h ^= RoL(shashcode[(uint8_t) new_base], remain);
}
#undef RoL
#undef RoR
int main (){
uint8_t kmer_size = 5, rol = 5;
uint32_t h1, h2, h1R, h2R;
size_t i, j, seqlen = strlen (my_dna_seq);
uint32_t **shcode1, **shcode2;
shcode1 = new_dna_salted_hash_encoding (3);
shcode2 = new_dna_salted_hash_encoding (6);
h1 = shcode1[0][(uint8_t) my_dna_seq[0]];
h2 = shcode2[0][(uint8_t) my_dna_seq[0]];
h1R = shcode1[1][(uint8_t) my_dna_seq[kmer_size-1]];
h2R = shcode2[1][(uint8_t) my_dna_seq[kmer_size-1]];
for (j = 1; j < kmer_size; ++j) {
roll_hash_add (&h1, my_dna_seq[j], rol, shcode1[0]);
roll_hash_add (&h2, my_dna_seq[j], rol, shcode2[0]);
roll_hash_add (&h1R, my_dna_seq[kmer_size-1-j], rol, shcode1[1]);
roll_hash_add (&h2R, my_dna_seq[kmer_size-1-j], rol, shcode2[1]);
}
for (j = (size_t) kmer_size; j < seqlen; ++j) {
if ((j-kmer_size)%kmer_size == 0) {
printf ("%3lu >> ", j - kmer_size);
for (i=j - (size_t) kmer_size; i < j; i++) printf ("%c ", my_dna_seq[i]);
printf (" >> %12u >> %12u || %12u << %12u\n",h1, h1R, h2, h2R);
}
roll_hash_replace_f (&h1, my_dna_seq[j - kmer_size], my_dna_seq[j], kmer_size, rol, shcode1[0]);
roll_hash_replace_f (&h2, my_dna_seq[j - kmer_size], my_dna_seq[j], kmer_size, rol, shcode2[0]);
roll_hash_replace_r (&h1R, my_dna_seq[j - kmer_size], my_dna_seq[j], kmer_size, rol, shcode1[1]);
roll_hash_replace_r (&h2R, my_dna_seq[j - kmer_size], my_dna_seq[j], kmer_size, rol, shcode2[1]);
}
del_dna_salted_hash_encoding (shcode1);
del_dna_salted_hash_encoding (shcode2);
}
conclusions
I've found a bug in linclust
that does not affect MMseqs2, as we see from the above.