kalign3 as a library and colour printing
How the multiple sequence alignment kalign3 is implemented in the low-level phylogenetic library biomcmc-lib, and how to use colours in C.
004 kalign and colour printing
incorporating kalign3 as a library
This notebook uses the low-level phylogenetic library biomcmc-lib (commit 5975331).
Recently I (re-)discovered the kalign software for multiple sequence alignment by Timo Lassmann, and was happily surprised with the code simplicity and a liberal license (GPL-3.0-or-later).
Therefore I decided to incorporate its code into a library that I can access using a char_vector
from the biomcmc-lib
library. Currently I am working on a derived library/software called "cumaru" (still private), with a modified version of kalign
algorithms.
Autotools
First, a digression on the configuration aspects of cumaru
(or another software importing these libraries).
The lines below are needed in configure.ac
(together with the m4 files they include) in order for kalign3
to use the AVX extensions.
# M4 macros for checking of CPU features from kalign3
dnl Each line below textually includes one macro definition file from the local m4/ directory.
m4_include([m4/ax_gcc_x86_avx_xgetbv.m4])
m4_include([m4/ax_gcc_x86_cpuid.m4])
m4_include([m4/ax_check_compile_flag.m4])
m4_include([m4/ax_ext.m4])
dnl The OpenMP macro file is included here but its macro is not invoked in this excerpt.
m4_include([m4/ax_openmp.m4])
dnl Run the CPU extension check. NOTE(review): presumably this is what defines the SIMD_FLAGS
dnl substitution used by kalign/Makefile.am -- confirm against m4/ax_ext.m4.
AX_EXT
For importing biomcmc-lib
, you have two options:
1. downloading the (upstream) software through github, with option
--recursive
(default for final users of software). This will download biomcmc-lib
into ${srcdir}/submodules/biomcmc-lib
.
2. downloading
biomcmc-lib
independently into ${srcdir}/biomcmc-lib
or somewhere else and then linking it to there. This is how I do it, since I use the same directory for several projects that rely on it.
If you opted for (1) above (i.e. you just cloned the repository recursively), then the configuration option below will create a link as in option (2).
dnl If ${srcdir}/biomcmc-lib does not exist yet, create it as a symbolic link to the submodule copy.
AC_CHECK_FILE([${srcdir}/biomcmc-lib],[],[ln -s submodules/biomcmc-lib ${srcdir}/biomcmc-lib])
dnl Abort configuration if that directory still has no configure.ac, i.e. the library sources are missing.
AC_CHECK_FILE([${srcdir}/biomcmc-lib/configure.ac],[], [AC_MSG_ERROR(["biomcmc-lib submodule missing, please git clone --recursive or link by hand to location of source code"])])
dnl Call biomcmc-lib ./configure script recursively.
AC_CONFIG_SUBDIRS([biomcmc-lib])
dnl Export the directory name as the BIOMCMCLIB substitution, which the Makefile.am files use in their include paths.
AC_SUBST([BIOMCMCLIB], [biomcmc-lib])
Then the file kalign/Makefile.am
has the following information, to create a static local library. This library will encapsulate biomcmc-lib
and will be available to the main software as libkalign.la
:
# Preprocessor flags: adds the biomcmc-lib headers (directory name comes from the BIOMCMCLIB substitution).
# NOTE(review): ZLIB_LIBS looks like linker flags but is listed among preprocessor flags -- confirm it is intended.
AM_CPPFLAGS = $(GTKDEPS_CFLAGS) -I$(srcdir)/../@BIOMCMCLIB@/lib @OPENMP_CPPFLAGS@ @ZLIB_LIBS@
# Compiler flags: SIMD flags from configure, plus the project-wide and OpenMP flags.
AM_CFLAGS = @SIMD_FLAGS@ @AM_CFLAGS@ @OPENMP_CFLAGS@
# Libraries to link against, including the locally built libbiomcmc.la.
# NOTE(review): LDADD normally applies to programs; this directory only builds a library -- verify it has an effect here.
LDADD = $(GTKDEPS_LIBS) @ZLIB_LIBS@ ../biomcmc-lib/lib/libbiomcmc.la $(AM_LDFLAGS)
# Header files of the kalign code that are part of the library sources.
common_headers = kalign.h \
tldevel.h rng.h global.h \
alignment_parameters.h \
bisectingKmeans.h \
sequence_distance.h \
alignment.h bpm.h
# C source files of the kalign code; one .c file for most of the headers above, plus run_kalign.c.
common_src = run_kalign.c \
tldevel.c rng.c \
alignment_parameters.c \
bisectingKmeans.c \
sequence_distance.c \
alignment.c bpm.c
# The library target itself, built from config.h and the two lists above.
noinst_LTLIBRARIES = libkalign.la ## noinst_LT: linked statically (not installed globally)
libkalign_la_SOURCES = config.h $(common_headers) $(common_src)
And the src/Makefile.am
(with the final software) can be something like:
# Preprocessor flags: header search paths for both the local kalign directory and biomcmc-lib.
# NOTE(review): ZLIB_LIBS looks like linker flags but is listed among preprocessor flags -- confirm it is intended.
AM_CPPFLAGS = $(GTKDEPS_CFLAGS) -I$(srcdir)/../kalign -I$(srcdir)/../@BIOMCMCLIB@/lib @OPENMP_CPPFLAGS@ @ZLIB_LIBS@
# Compiler flags: project-wide, OpenMP and CHECK flags (no SIMD flags in this directory).
AM_CFLAGS = @AM_CFLAGS@ @OPENMP_CFLAGS@ @CHECK_CFLAGS@
# Link line: libkalign.la is listed before libbiomcmc.la, followed by zlib.
LDADD = $(GTKDEPS_LIBS) @CHECK_LIBS@ ../kalign/libkalign.la ../biomcmc-lib/lib/libbiomcmc.la @ZLIB_LIBS@ $(AM_LDFLAGS)
# The installed executable and its sources.
bin_PROGRAMS = cumaru
cumaru_SOURCES = main.c kseq.h
# Per-target link list; simply reuses the directory-wide LDADD.
cumaru_LDADD = $(LDADD)
Notice that we need to include the path to the local biomcmc-lib
as well, since it is also statically linked.
The code below performs multiple sequence alignment from hand-fed sequences (in practice, fasta reading usually comes from kseq.h
).
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build.191216/biomcmc-lib/lib
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/leomrtns.cumaru/kalign
//%cflags: -I/usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build.191216/kalign
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build.191216/kalign/.libs/libkalign.a
//%cflags: /usr/users/QIB_fr005/deolivl/Academic/Quadram/009.supersptree/build.191216/biomcmc-lib/lib/.libs/libbiomcmc.a
//%cflags:-lm
#include <kalign.h>
/* Demo: feed four hand-written DNA sequences to kalign3_from_char_vector() and
 * print the result in FASTA layout on stdout; the elapsed CPU time goes to stderr.
 * The command-line arguments are not used. */
int
main (int argc, char **argv)
{
  /* name/sequence pairs, in the order they are added to the vectors */
  char *label[] = {"Campy", "Vibrio", "Strepto", "Staph"};
  char *residues[] = {
    "ATCGAAAGAATTAGGCTTAAGCTAAAAGCTTATGACCACAGAGTTCTAGACCGTACAGAT",
    "AGAAAGAATTAGGCTTAAGCTAAGCTTATGACCACAGAGTTCTAGACCGTACAGTT",
    "ATGGAAAGAATTAGGCTTAAGCCGGAGCTTATGACCACAGAGTTCTAGAGCGTACAGTT",
    "ATGGAAAGAATTAGGCTTAAGCTAAAAGCTTATGACTTCATTCTGTACAGTT"
  };
  const int n_pairs = (int) (sizeof label / sizeof label[0]);
  char_vector seqname = new_char_vector (1);
  char_vector dna = new_char_vector (1);
  char_vector align = NULL;
  clock_t start, stop;
  double elapsed;
  int k;

  (void) argc;
  (void) argv;

  /* the timer covers filling the vectors and the alignment itself */
  start = clock ();
  for (k = 0; k < n_pairs; k++) {
    char_vector_add_string (dna, residues[k]);
    char_vector_add_string (seqname, label[k]);
  }
  align = kalign3_from_char_vector (dna);
  stop = clock ();

  elapsed = (double)(stop - start) / (double)(CLOCKS_PER_SEC);
  fprintf (stderr, "finished in %lf secs\n", elapsed);
  fflush (stderr);

  /* output pairs name k with aligned string k, i.e. it relies on the aligner keeping the input order */
  k = 0;
  while (k < align->nstrings) {
    printf (">%s\n%s\n", seqname->string[k], align->string[k]);
    k++;
  }

  del_char_vector (dna);
  del_char_vector (align);
  del_char_vector (seqname);
  return EXIT_SUCCESS;
}
/* Sample strings containing characters that are awkward for shells and printf.
 * They are only ever printed through "%s", so the '%' signs come out verbatim.
 * ("\%" is not a valid C escape sequence; a plain '%' yields the same byte.) */
const char *mytext[] = {"this has ` , $ percent % and ' while", "so %d ____^H^H^H^Hmany"};
/* ANSI SGR sequence that resets all attributes. "\033" is the portable spelling of ESC
 * ("\e" is a compiler extension and not part of standard C). */
const char col_reset[] = "\033[0m";
/* Human-readable label for each row of prt_col, in the same order. */
const char *rownames[7] = {"regular", "bold", "underline", "background", "highbckgrnd", "hightext", "boldhightxt"};
/* ANSI SGR colour sequences: one row per style, one column per colour. */
const char *prt_col[][8]={ // 0-black 1-red 2-grn 3-yel 4-blu 5-mag 6-cyn 7-white
{"\033[0;30m", "\033[0;31m", "\033[0;32m", "\033[0;33m", "\033[0;34m", "\033[0;35m", "\033[0;36m", "\033[0;37m"}, // 0 regular text
{"\033[1;30m", "\033[1;31m", "\033[1;32m", "\033[1;33m", "\033[1;34m", "\033[1;35m", "\033[1;36m", "\033[1;37m"}, // 1 regular bold text
{"\033[4;30m", "\033[4;31m", "\033[4;32m", "\033[4;33m", "\033[4;34m", "\033[4;35m", "\033[4;36m", "\033[4;37m"}, // 2 regular underline text
{"\033[40m", "\033[41m", "\033[42m", "\033[43m", "\033[44m", "\033[45m", "\033[46m", "\033[47m"}, // 3 regular background
{"\033[0;100m", "\033[0;101m", "\033[0;102m", "\033[0;103m", "\033[0;104m", "\033[0;105m", "\033[0;106m", "\033[0;107m"}, // 4 high intensity background
{"\033[0;90m", "\033[0;91m", "\033[0;92m", "\033[0;93m", "\033[0;94m", "\033[0;95m", "\033[0;96m", "\033[0;97m"}, // 5 high intensity text
{"\033[1;90m", "\033[1;91m", "\033[1;92m", "\033[1;93m", "\033[1;94m", "\033[1;95m", "\033[1;96m", "\033[1;97m"} // 6 bold high intensity text
};
#include <stdio.h> /* declares printf, which this program calls; harmless if already included */

/* Prints the two sample strings once for every colour/style combination of prt_col,
 * grouped by colour, each line labelled with the style name and terminated by col_reset.
 * Afterwards prints three integer constants (see comment below). */
int main(void) {
  /* loop bounds are taken from the table itself so they cannot drift from its definition */
  const int n_styles = (int)(sizeof prt_col / sizeof prt_col[0]);
  const int n_colours = (int)(sizeof prt_col[0] / sizeof prt_col[0][0]);

  for (int colour = 0; colour < n_colours; colour++) {
    printf ("\n%d\n", colour);
    for (int style = 0; style < n_styles; style++) {
      printf ("%15s : %s>%s<>%s<%s\n", rownames[style], prt_col[style][colour], mytext[0], mytext[1], col_reset);
    }
  }
  // test with marsaglia constants (should print prime numbers)
  // the values are a*2^16-1 and a*2^15-1; this only prints them, primality is not checked here
  printf ("%5d %5d %5d", (18000<<16)-1, (18000<<15)-1, (18030<<16)-1);
  return 0;
}
And this is the same output in my terminal (black background):