003 Minimal example of a rolling hash

This notebook uses the low-level phylogenetic library biomcmc-lib (commit 5975331).

The rolling hash (or expoential hash) is a technique to slide through a sequence while updating its hash value using only information from the current hash and the next value. One of the most popular algorithms is the Rabin-Karp algorithm. Below I start with the imnplementation in linclust/MMseqs2

In DNA context, each base is represented by a random number. In MMseqs they work with a reduced amino acid alphabet of 13 groups

Prototype

Here I describe a prototype (future capability of the library), not implemented yet. I am testing the rolling hash function but also shorter types uint8_t etc. The idea is that each of the testing functionalities will be incorporated into biomcmc-lib.

Below is a working version that follows closely minclust.

//%cflags:-lm
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib/.libs/libbiomcmc.a
#include <biomcmc.h>

uint16_t RAND[21] = {0x4567, 0x23c6, 0x9869, 0x4873, 0xdc51, 0x5cff, 0x944a, 0x58ec,
                           0x1f29, 0x7ccd, 0x58ba, 0xd7ab, 0x41f2, 0x1efb, 0xa9e3, 0xe146,
                           0x007c, 0x62c2, 0x0854, 0x27f8, 0x231b};// 16 bit random numbers

#define RoL(val, numbits) (val << numbits) ^ (val >> (32 - numbits))

// Transform each letter x[i] to a fixed random number RAND[x[i]] to ensure instantaneous mixing into the 16 bits
// Do XOR with RAND[x[i]] and 5-bit rotate left for each i from 1 to k
uint32_t circ_hash (const uint8_t * x, uint8_t length, const uint8_t rol){
    uint16_t h = 0x0;
    h = h ^ RAND[x[0]];                  // XOR h and ki
    for (uint8_t i = 1; i < length; ++i){
        h = RoL(h, rol);
        h ^= RAND[x[i]];                   // XOR h and ki
    }
    return h;
}

// Rolling hash for CRC hash variant: Computes hash value for next key x[0:length-1] from previous value
// hash( x[-1:length-2] ) and x_first = x[-1]
uint32_t circ_hash_next (const uint8_t * x, uint8_t length, uint8_t x_first, uint16_t h, const uint8_t rol){
    // undo INITIAL_VALUE and first letter x[0] of old key
    h ^= RoL(RAND[x_first], (5*(length-1)) % 16);// circularly permute all letters x[1:length-1] to 5 positions to left
    // h ^= RoL(RAND[x_first], (5*(length-1)) & 15); // since x%y = x & (y-1)
    h =  RoL(h, rol);// add new, last letter of new key x[1:length]
    h ^= RAND[x[length-1]];
    return h;
}

int main (){
    uint8_t i, j;
    uint8_t my_seq1[] = {1,2,3,1,2, 3,1,2,3,1, 1,1,1,1,1, 1,2,1,2,3, 1,2,1,2,1, 2,1,2}; // n=28
    uint8_t my_seq2[] = {1,2,3,4,5, 2,3,4,5,2, 3,4,5,1,1, 4,5,2,5,2, 3,4,3,2,3, 2,3,2}; 
    uint16_t my_h1 = circ_hash((const uint8_t*) my_seq1, 3, 5);
    uint16_t my_h2 = circ_hash((const uint8_t *) my_seq2, 3, 7);
    for (j = 0; j < 25; j++) {
        printf ("%3d >>  ", j);
        for (i=j;i<j+3;i++) printf ("%d ", my_seq1[i]); 
        printf (" \t %8u  \t >>  ",my_h1); // show current kmer and hash
        for (i=j;i<j+3;i++) printf ("%d ", my_seq2[i]); 
        printf (" \t %8u\n",my_h2); // show current kmer and hash
        my_h1 = circ_hash_next ((const uint8_t *) my_seq1 + j + 1, 3, my_seq1[j], my_h1, 5);
        my_h2 = circ_hash_next ((const uint8_t *) my_seq2 + j + 1, 3, my_seq2[j], my_h2, 7);
    }
}
  0 >>  1 2 3  	    23891  	 >>  1 2 3  	    64755
  1 >>  2 3 1  	    35238  	 >>  2 3 4  	    42449
  2 >>  3 1 2  	    11433  	 >>  3 4 5  	    46207
  3 >>  1 2 3  	    23891  	 >>  4 5 2  	    42985
  4 >>  2 3 1  	    35238  	 >>  5 2 3  	    48371
  5 >>  3 1 2  	    11433  	 >>  2 3 4  	    42449
  6 >>  1 2 3  	    23891  	 >>  3 4 5  	    46207
  7 >>  2 3 1  	    35238  	 >>  4 5 2  	    42985
  8 >>  3 1 1  	    38662  	 >>  5 2 3  	    48371
  9 >>  1 1 1  	    17158  	 >>  2 3 4  	    42449
 10 >>  1 1 1  	    17158  	 >>  3 4 5  	    46207
 11 >>  1 1 1  	    17158  	 >>  4 5 1  	     7238
 12 >>  1 1 1  	    17158  	 >>  5 1 1  	      198
 13 >>  1 1 1  	    17158  	 >>  1 1 4  	    48977
 14 >>  1 1 2  	    63657  	 >>  1 4 5  	    62591
 15 >>  1 2 1  	    14054  	 >>  4 5 2  	    42985
 16 >>  2 1 2  	    17577  	 >>  5 2 5  	    43135
 17 >>  1 2 3  	    23891  	 >>  2 5 2  	    42985
 18 >>  2 3 1  	    35238  	 >>  5 2 3  	    48371
 19 >>  3 1 2  	    11433  	 >>  2 3 4  	    42449
 20 >>  1 2 1  	    14054  	 >>  3 4 3  	    41203
 21 >>  2 1 2  	    17577  	 >>  4 3 2  	    57833
 22 >>  1 2 1  	    14054  	 >>  3 2 3  	    48371
 23 >>  2 1 2  	    17577  	 >>  2 3 2  	    57833
 24 >>  1 2 1  	    14054  	 >>  3 2 3  	    48371

The code above assumes a 16-bits hash, cast to 32 bits (due to the RoL macro). Below we use all 32 bits of hash, and also use the fact that for non-negative values, x % y == x & (y-1) which may be faster.

//%cflags:-lm
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib/.libs/libbiomcmc.a
#include <biomcmc.h>

uint32_t RAND[21] = {0x4567, 0x23c6, 0x9869, 0x4873, 0xdc51, 0x5cff, 0x944a, 0x58ec,
                           0x1f29, 0x7ccd, 0x58ba, 0xd7ab, 0x41f2, 0x1efb, 0xa9e3, 0xe146,
                           0x007c, 0x62c2, 0x0854, 0x27f8, 0x231b};

#define RoL(val, numbits) (val << numbits) ^ (val >> (32 - numbits))
uint32_t circ_hash (const uint8_t * x, uint8_t length, const uint8_t rol){
    uint32_t h = 0x0;
    h = h ^ RAND[x[0]];                  // XOR h and ki
    for (uint8_t i = 1; i < length; ++i){
        h = RoL(h, rol);
        h ^= RAND[x[i]];                 // XOR h and ki
    }
    return h;
}

uint32_t circ_hash_next (const uint8_t * x, uint8_t length, uint8_t x_first, uint32_t h, const uint8_t rol){
    uint8_t remain = (rol * (length-1)) & 31U; // since x % y = x & (y-1)
    // undo INITIAL_VALUE and first letter x[0] of old key
    //h ^= RoL(RAND[x_first], (rol * (length-1)) % 32);// circularly permute all letters x[1:length-1] to 5 positions to left
    h ^= RoL(RAND[x_first], remain);
    h =  RoL(h, rol);// add new, last letter of new key x[1:length]
    h ^= RAND[x[length-1]];
    return h;
}

int main (){
    uint8_t i, j;
    uint8_t my_seq1[] = {1,2,3,1,2, 3,1,2,3,1, 1,1,1,1,1, 1,2,1,2,3, 1,2,1,2,1, 2,1,2}; // n=28
    uint8_t my_seq2[] = {1,2,3,4,5, 2,3,4,5,2, 3,4,5,1,1, 4,5,2,5,2, 3,4,3,2,3, 2,3,2}; 
    uint32_t my_h1 = circ_hash((const uint8_t *) my_seq1, 4, 5);
    uint32_t my_h2 = circ_hash((const uint8_t *) my_seq2, 4, 7);
    for (j = 0; j < 25; j++) {
        printf ("%3d >>  ", j);
        for (i=j;i<j+4;i++) printf ("%d ", my_seq1[i]); 
        printf (" %8u  \t >>  ",my_h1); // show current kmer and hash
        for (i=j;i<j+4;i++) printf ("%d ", my_seq2[i]); 
        printf ("\t%8u\n",my_h2); // show current kmer and hash
        my_h1 = circ_hash_next ((const uint8_t *) my_seq1 + j + 1, 4, my_seq1[j], my_h1, 5);
        my_h2 = circ_hash_next ((const uint8_t *) my_seq2 + j + 1, 4, my_seq2[j], my_h2, 7);
    }
}
  0 >>  1 2 3 1  327911846  	 >>  1 2 3 4 	1593746901
  1 >>  2 3 1 2  1293003945  	 >>  2 3 4 5 	525513836
  2 >>  3 1 2 3  614849875  	 >>  3 4 5 2 	962242528
  3 >>  1 2 3 1  327911846  	 >>  4 5 2 3 	2639510760
  4 >>  2 3 1 2  1293003945  	 >>  5 2 3 4 	3118376410
  5 >>  3 1 2 3  614849875  	 >>  2 3 4 5 	525513836
  6 >>  1 2 3 1  327911846  	 >>  3 4 5 2 	962242528
  7 >>  2 3 1 1  1292965638  	 >>  4 5 2 3 	2639510760
  8 >>  3 1 1 1  615695110  	 >>  5 2 3 4 	3118376410
  9 >>  1 1 1 1  292045574  	 >>  2 3 4 5 	525513836
 10 >>  1 1 1 1  292045574  	 >>  3 4 5 1 	962206799
 11 >>  1 1 1 1  292045574  	 >>  4 5 1 1 	2634940637
 12 >>  1 1 1 1  292045574  	 >>  5 1 1 4 	2533408602
 13 >>  1 1 1 2  292092073  	 >>  1 1 4 5 	1885336699
 14 >>  1 1 2 1  293549798  	 >>  1 4 5 2 	1341827053
 15 >>  1 2 1 2  327566505  	 >>  4 5 2 5 	2639505508
 16 >>  2 1 2 3  1286135123  	 >>  5 2 5 2 	3117721570
 17 >>  1 2 3 1  327911846  	 >>  2 5 2 3 	441695456
 18 >>  2 3 1 2  1293003945  	 >>  5 2 3 4 	3118376410
 19 >>  3 1 2 1  614840038  	 >>  2 3 4 3 	525508832
 20 >>  1 2 1 2  327566505  	 >>  3 4 3 2 	961602016
 21 >>  2 1 2 1  1286125286  	 >>  4 3 2 3 	2557525224
 22 >>  1 2 1 2  327566505  	 >>  3 2 3 2 	677306848
 23 >>  2 1 2 1  1286125286  	 >>  2 3 2 3 	527482080
 24 >>  1 2 1 2  327566505  	 >>  3 2 3 2 	677306848
/* Short interlude to play with table indices :D */
#include <stdio.h>
int main() {
    int t1 = 16, t2 = 4, i1, i2, salt; 
    for (salt = 0; salt < 72; salt++) {
        i1 = (salt & ((t1 >> 2) - 1)) << 2;
        i2 = (int)(salt / (t1 >> 2))%t2;
        printf ("%3d %3d %3d| ", salt, i1, i2);
        if (((salt+1)%8)==0) printf ("\n");
    }
}
  0   0   0|   1   4   0|   2   8   0|   3  12   0|   4   0   1|   5   4   1|   6   8   1|   7  12   1| 
  8   0   2|   9   4   2|  10   8   2|  11  12   2|  12   0   3|  13   4   3|  14   8   3|  15  12   3| 
 16   0   0|  17   4   0|  18   8   0|  19  12   0|  20   0   1|  21   4   1|  22   8   1|  23  12   1| 
 24   0   2|  25   4   2|  26   8   2|  27  12   2|  28   0   3|  29   4   3|  30   8   3|  31  12   3| 
 32   0   0|  33   4   0|  34   8   0|  35  12   0|  36   0   1|  37   4   1|  38   8   1|  39  12   1| 
 40   0   2|  41   4   2|  42   8   2|  43  12   2|  44   0   3|  45   4   3|  46   8   3|  47  12   3| 
 48   0   0|  49   4   0|  50   8   0|  51  12   0|  52   0   1|  53   4   1|  54   8   1|  55  12   1| 
 56   0   2|  57   4   2|  58   8   2|  59  12   2|  60   0   3|  61   4   3|  62   8   3|  63  12   3| 
 64   0   0|  65   4   0|  66   8   0|  67  12   0|  68   0   1|  69   4   1|  70   8   1|  71  12   1| 

transforming DNA bases into hash values

Above, we have a single hash value RAND[] for each base, but we can create a stream of values.

  1. we define a translation table from DNA to $0...3$ for non-ambiguous data (or some other encoding). Notice that we must know its reverse complement.
  2. hash values inspired by RAND[] are then created. They are "salted" with large primes.

The version currently implemented in biomcmc-lib is, for non-ambiguous DNA:

uint8_t dna_in_2_bits[256][2] = ;

void
initialize_dna_to_bit_tables (void) {
    int i;
    for (i = 0; i < 256; i++) dna_in_2_bits[i][0] = dna_in_2_bits[i][1] = 4; // calling function must check if < 4
    dna_in_2_bits['A'][0] = 0;   dna_in_2_bits['A'][1] = 3;  /*  A  <-> T  */
    dna_in_2_bits['C'][0] = 1;   dna_in_2_bits['C'][1] = 2;  /*  C  <-> G  */
    dna_in_2_bits['G'][0] = 2;   dna_in_2_bits['G'][1] = 1;  /*  G  <-> C  */
    dna_in_2_bits['T'][0] = 3;   dna_in_2_bits['T'][1] = 0;  /*  T  <-> A  */
    dna_in_2_bits['U'][0] = 3;   dna_in_2_bits['U'][1] = 0;  /*  U  <-> A  */
}

It just returns a small number, that is concatenated to form the k-mer. I.e. a sequence ACGGT becomes an integer with binary 00 01 10 10 01.

Below is the tentative code, closer to my final implementation (notice that it doesn't compress homopolymers). I also implemented the reverse-complement rolling hash (implemented using a different algorithm in ntHash).

//%cflags:-lm
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build/biomcmc-lib/lib/.libs/libbiomcmc.a
#include <biomcmc.h>
const char my_dna_seq[] = 
    "AAACCACCTCCCGGTGGTTT" // first 8 and last 8 are revcompl (AAACC<->GGTTT)
    "AAACCACCTCCCGGTGGTTT"
    "GTTCTTAACATTTCTCGTAC"
    "GTTCTTAACATTTCTCGTAC";

uint32_t rand_hash_list[] = {0x4567, 0x23c6, 0x9869, 0x4873, 0xdc51}; // size = 5
uint32_t prime_salt_list[] = { // size = 64 
  0x343EAF9F, 0x75BD32B,  0x5E1C5A87, 0x343EFDAF, 0x1FDCDBEF, 0x6389CB,   0x1FDCE507, 0x1FDCE3C7, 
  0x1FDCE15F, 0x75BD431,  0x34C8B0F,  0x5397FEF,  0x87413,    0x6389C9,   0x34C8B23,  0x343EFDB5, 
  0x75BD479,  0x5398013,  0x5397FCD,  0x87407,    0x75BD413,  0x343EAF4B, 0x343EB047, 0x6389CF, 
  0x5E1C5A2F, 0x343EFD31, 0x34C8B59,  0x343EFD57, 0x1FDCDC19, 0x5E1C5B0D, 0x34C8B5D,  0x343EB035, 
  0x75BD307,  0x343EAF7B, 0x343EAEF5, 0x75BD433,  0x1FDCDBD5, 0x1FDCE3B7, 0x343EFD19, 0x5398057, 
  0x34C8B51,  0x638971,   0x1FDCE16F, 0x63896F,   0x343EB04D, 0x873FD,    0x63896B,   0x1FDCDC9B, 
  0x5E1C5B1F, 0x343EFCEB, 0x5398007,  0x1FDCE51F, 0x63897B,   0x5E1C5B17, 0x5E1C5ABD, 0xFF85, 
  0x1FDCDC25, 0x873EB,    0x5E1C5AB1, 0x75BD30D,  0x1FDCDCC1, 0xFF8B,     0x5397FC1,  0x5398001};

uint32_t**
new_dna_salted_hash_encoding (uint8_t salt) {
    uint8_t i=255, j;
    uint32_t **shash = (uint32_t**) biomcmc_malloc (2 * sizeof (uint32_t*)); // opposite order as dna_in_2_bits[]
    for (i = 0; i < 2; ++i) shash[i] = (uint32_t*) biomcmc_malloc (256 * sizeof (uint32_t));
    /* notice do{}while() instead of for() since i is always < 256 */
    do { shash[0][i] = shash[1][i] = 4;} while (i--); // anything else is fifth state
    
    shash[0]['A'] = shash[0]['a'] = 0; shash[1]['A'] = shash[1]['a'] = 3;  /*  A  <-> T  */
    shash[0]['C'] = shash[0]['c'] = 1; shash[1]['C'] = shash[1]['c'] = 2;  /*  C  <-> G  */
    shash[0]['G'] = shash[0]['g'] = 2; shash[1]['G'] = shash[1]['g'] = 1;  /*  G  <-> C  */
    shash[0]['T'] = shash[0]['t'] = 3; shash[1]['T'] = shash[1]['t'] = 0;  /*  T  <-> A  */
    shash[0]['U'] = shash[0]['u'] = 3; shash[1]['U'] = shash[1]['u'] = 0;  /*  U  <-> A  */
    
    salt &= 63; // modulus, we only have 64 salts
    /** now we transform the indexes for their equiv. hash values; all have same salt */
    for (j = 0; j < 2; ++j) {
        i = 255; do { shash[j][i] = rand_hash_list[shash[j][i]] + prime_salt_list[salt]; } while (i--);
    }
    return shash;
}

void
del_dna_salted_hash_encoding (uint32_t** shash) {
    if (!shash) return;
    if (shash[1]) free (shash[1]);
    if (shash[0]) free (shash[0]);
    free (shash);
}

#define RoL(val, numbits) ((val) << (numbits)) | ((val) >> (32 - (numbits)))
#define RoR(val, numbits) ((val) >> (numbits)) | ((val) << (32 - (numbits)))
void
roll_hash_add (uint32_t *h, const char dna_base, const uint8_t rol, const uint32_t* shashcode)
{
  *h = RoL(*h, rol);
  *h ^= shashcode[(uint8_t) dna_base]; // XOR h and ki
}

void // kmer_size can't be 0 or 32 
roll_hash_replace_f (uint32_t *h, const char old_base, const char new_base, const uint8_t kmer_size, 
                   const uint8_t rol, const uint32_t* shashcode)
{
    uint8_t remain = (rol * (kmer_size-1)) & 31U; // since x % y = x & (y-1) // this can be calculated outside
    *h ^= RoL(shashcode[(uint8_t) old_base], remain); // remove "leftmost" base 
    *h  = RoL(*h, rol);
    *h ^= shashcode[(uint8_t) new_base];
}
void // kmer_size can't be 0 or 32 
roll_hash_replace_r (uint32_t *h, const char old_base, const char new_base, const uint8_t kmer_size, 
                   const uint8_t rol, const uint32_t* shashcode)
{
    uint8_t remain = (rol * (kmer_size-1)) & 31U; // since x % y = x & (y-1) // this can be calculated outside 
    *h ^= shashcode[(uint8_t) old_base];
    *h  = RoR(*h, rol);
    *h ^= RoL(shashcode[(uint8_t) new_base], remain);  
}
#undef RoL
#undef RoR

int main (){
    uint8_t kmer_size = 5, rol = 5;
    uint32_t h1, h2, h1R, h2R;
    size_t i, j, seqlen = strlen (my_dna_seq);
    uint32_t **shcode1, **shcode2;
    shcode1 = new_dna_salted_hash_encoding (3);
    shcode2 = new_dna_salted_hash_encoding (6);
    h1  = shcode1[0][(uint8_t) my_dna_seq[0]];
    h2  = shcode2[0][(uint8_t) my_dna_seq[0]];
    h1R = shcode1[1][(uint8_t) my_dna_seq[kmer_size-1]];
    h2R = shcode2[1][(uint8_t) my_dna_seq[kmer_size-1]];
    
    for (j = 1; j < kmer_size; ++j) {
        roll_hash_add (&h1, my_dna_seq[j], rol, shcode1[0]); 
        roll_hash_add (&h2, my_dna_seq[j], rol, shcode2[0]);
        roll_hash_add (&h1R, my_dna_seq[kmer_size-1-j], rol, shcode1[1]); 
        roll_hash_add (&h2R, my_dna_seq[kmer_size-1-j], rol, shcode2[1]);
    }

    for (j = (size_t) kmer_size; j < seqlen; ++j) {
       if ((j-kmer_size)%kmer_size == 0) {
            printf ("%3lu >>  ", j - kmer_size);
            for (i=j - (size_t) kmer_size; i < j; i++) printf ("%c ", my_dna_seq[i]); 
            printf (" >> %12u >> %12u || %12u << %12u\n",h1, h1R, h2, h2R); 
        }
        roll_hash_replace_f (&h1, my_dna_seq[j - kmer_size], my_dna_seq[j], kmer_size, rol, shcode1[0]); 
        roll_hash_replace_f (&h2, my_dna_seq[j - kmer_size], my_dna_seq[j], kmer_size, rol, shcode2[0]); 
        roll_hash_replace_r (&h1R, my_dna_seq[j - kmer_size], my_dna_seq[j], kmer_size, rol, shcode1[1]);
        roll_hash_replace_r (&h2R, my_dna_seq[j - kmer_size], my_dna_seq[j], kmer_size, rol, shcode2[1]);
    }
    del_dna_salted_hash_encoding (shcode1);
    del_dna_salted_hash_encoding (shcode2);
}
  0 >>  A A A C C  >>   3728674536 >>   3829420882 ||   2734906157 <<   4185167999
  5 >>  A C C T C  >>   4002146312 >>   2375924754 ||   3000820173 <<   3551637823
 10 >>  C C G G T  >>   3397139481 >>   4002502693 ||   2570072376 <<   3000407344
 15 >>  G G T T T  >>   3829420882 >>   3728674536 ||   4185167999 <<   2734906157
 20 >>  A A A C C  >>   3728674536 >>   3829420882 ||   2734906157 <<   4185167999
 25 >>  A C C T C  >>   4002146312 >>   2375924754 ||   3000820173 <<   3551637823
 30 >>  C C G G T  >>   3397139481 >>   4002502693 ||   2570072376 <<   3000407344
 35 >>  G G T T T  >>   3829420882 >>   3728674536 ||   4185167999 <<   2734906157
 40 >>  G T T C T  >>   2354166194 >>   3031712392 ||   3514117791 <<   2290618189
 45 >>  T A A C A  >>   2373938315 >>   3890239083 ||   3527637390 <<   4191461230
 50 >>  T T T C T  >>   2414983615 >>   3031703787 ||   3520409242 <<   2290626030
 55 >>  C G T A C  >>   2458948238 >>   2353379720 ||   2728681295 <<   3513331349
 60 >>  G T T C T  >>   2354166194 >>   3031712392 ||   3514117791 <<   2290618189
 65 >>  T A A C A  >>   2373938315 >>   3890239083 ||   3527637390 <<   4191461230
 70 >>  T T T C T  >>   2414983615 >>   3031703787 ||   3520409242 <<   2290626030

conclusions

I've found a bug in linclust that does not affect MMseqs2, as we see from the above.