Commit e5846535 authored by Thomas Haschka's avatar Thomas Haschka
Browse files

Initial Commit

parents
Copyright (C) 2021 Thomas Haschka
This software is provided ’as-is’, without any express or implied warranty.
In no event will the authors be held liable for any dam- ages arising
from the use of this software. Permission is granted to anyone to use
this software for any purpose, including commercial applications, and
to alter it and redistribute it freely, subject to the following
restrictions:
1. The origin of this software must not be
misrepresented; you must not claim that you wrote the original
software. If you use this software in a product, an acknowledgment in
the product documentation would be appreciated but is not
required.
2. Altered source versions must be plainly marked as such,
and must not be misrepresented as being the original soft-
ware.
3. This notice may not be removed or altered from any source,
binary distribution and this manual.
\ No newline at end of file
# A FASTA - Fourier Transform based Sequence in Sequence Finder
(c) Thomas Haschka 2021 - for a license see the provided license file.
This tool finds a fasta sequence in a fasta sequence using
the fourier transform.
# Installation
This tool requires the FFTW3 library with double precision support. In
order to build this tool also the -dev package (i.e. header files) are
required. On debian/ubuntu based distributions you may install this package
using:
```sudo apt-get install build-essential fftw3 fftw3-dev```
Once all dependencies are installed you may compile this tool on your system
like this:
```gcc -O3 -march=native binary_array.c dataset.c seq_in_seq.c -lfftw3_threads -lfftw3 -lm -o seq_in_seq```
Once you have compiled this tool you can call it with the following arguments:
```
[fasta] sequence to search sequence in
[fasta] sequence to be searched for
[int] number of sequence to search in
in the first fasta file. Beginning with 1
[int] 1 created a wisdom file during this run (if you do not have one)
0 use a wisdom file already available
[wisdom] path of a wisdom file, always to be specified
[int] cutoff - number of sequence changes to accept
[string] chromosome specifier
[int] number of threads fftw can use
```
The first run of the program might be long as you are supposed to generate a
*wisdom file* for the sequence to be searched in. This allows you to find the
optimal fourier transformation for your system as well as the size of your
problem. Once this is done a sequence in a sequence can be found very quickly
using such a *wisdom file*. I.e. if you are searching frequently in hg38
I suggest you create wisdom files for each chromosome.
The *chromosome specifier* is the first string that will be written in your
bed coordinates output. I.e. if you search on hg38-chr1 and type chr1 you can
upload your output to the UCSC Genome Browser and visulize the locations of
the sequence that you are searching for.
# Memory usage
The current version requires a lot of memory. A machine with 32 GB of memory
should nevertheless be sufficant to search all of Hg38-chr1. On a 64 GB
searching Hg38-chr1 poses no problem. On machines with less ram you might cut
your fasta files into pieces and perform the search piecwise. A fix for this
might come up in a future version that performs this cut automatically on low
memory machines.
/*! \file binary_array.c
* \brief This file contains functions to set and get values from binary arrays
*/
#include<stdlib.h>
#include<string.h>
char* alloc_and_set_zero_binary_array(size_t array_length) {
char* bm = (char*)malloc(sizeof(char*)*((array_length/8+1)));
if(bm == NULL) {
return NULL;
}
memset(bm,0,sizeof(char*)*((array_length/8+1)));
return(bm);
}
char* set_zero_binary_array(char* b_array,size_t array_length) {
return((char*)(memset(b_array,0,sizeof(char*)*(array_length/8+1))));
}
char* alloc_binary_array(size_t array_length) {
char* bm = (char*)malloc(sizeof(char*)*((array_length/8+1)));
return(bm);
}
void binary_array_or(char* result, size_t array_length, char* a, char* b) {
int i;
for(i=0;i<array_length/8+1;i++) {
result[i] = a[i] | b[i];
}
}
void binary_array_and(char* result, size_t array_length, char* a, char* b) {
int i;
for(i=0;i<array_length/8+1;i++) {
result[i] = a[i] & b[i];
}
}
void set_value_in_binary_array_at_index(char* b_array,
size_t index) {
char small_index;
int module = index%(sizeof(char)*8);
size_t location = index/(sizeof(char)*8);
small_index = (char)1 << module;
b_array[location] |= small_index;
}
int get_value_in_binary_array_at_index(char* b_array,
size_t index) {
char small_index;
int module = index%(sizeof(char)*8);
size_t location = index/(sizeof(char)*8);
small_index = (char)1 << module;
return(small_index == (b_array[location] & small_index));
}
/*! \file binary_array.h
* \brief defines function for binary array manipulation
*/
/*! \brief Allocates a binary array and sets all values to zero. */
char* alloc_and_set_zero_binary_array(size_t array_length);
/*! \brief Sets all values in a binary array to zero. */
char* set_zero_binary_array(char* b_array,size_t array_length);
/*! \brief Allocates a binary array. */
char* alloc_binary_array(size_t array_length);
/*! \brief Results a binary array from the OR result of two binary array.
* Calculates the OR array from the arrays a and b. All arrays have to be of
* the same length and have to be allocated prior calling this function
*/
void binary_array_or(char* result, size_t array_length, char* a, char* b);
/*! \brief Results a binary array from the AND result of two binary array.
* Calculates the OR array from the arrays a and b. All arrays have to be of
* the same length and have to be allocated prior calling this function
*/
void binary_array_and(char* result, size_t array_length, char* a, char* b);
/*! \brief Sets the value in a binary array to 1 at index. */
void set_value_in_binary_array_at_index(char* b_array,
size_t index);
/*! \brief Gets the value in a binary array at index.
* This function obtains the value of a binary array at a specified index and
* returns it as integer 0 or 1 */
int get_value_in_binary_array_at_index(char* b_array,
size_t index);
/*! \file colors.h
* \brief defines shorthands for ANSI escape sequences that color fonts
* in the terminal. i.e. ANSI_COLOR_RED is defined as "\x1b[31m".
*/
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_RESET "\x1b[0m"
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<ctype.h>
#include<unistd.h>
#include<math.h>
#include"dataset.h"
#include"binary_array.h"
#include"colors.h"
typedef struct {
int buffer[2];
} sorthelper;
int comp(const void *a, const void *b) {
sorthelper* sa = (sorthelper*)a;
sorthelper* sb = (sorthelper*)b;
return((sa->buffer[1]-sb->buffer[1]));
}
void sort_unique_sequences(unique_sequences us) {
int i;
sorthelper* helper = (sorthelper*)malloc(sizeof(sorthelper)*us.n_seq);
for(i=0;i<us.n_seq;i++) {
helper[i].buffer[0] = us.u_seq[i];
helper[i].buffer[1] = us.multiplicities[i];
}
qsort(helper, us.n_seq, sizeof(sorthelper), comp);
for(i=0;i<us.n_seq;i++) {
us.u_seq[i] = helper[i].buffer[0];
us.multiplicities[i] = helper[i].buffer[1];
}
}
unique_sequences get_sequence_multiplicities(dataset ds) {
unique_sequences us;
char* visited_sites = alloc_and_set_zero_binary_array(ds.n_values);
int i,j, u_seq_counter;
char* current_sequence;
us.u_seq = (int*)malloc(sizeof(int)*ds.n_values);
us.multiplicities = (int*)malloc(sizeof(int)*ds.n_values);
for(i=0; i<ds.n_values;i++) {
us.multiplicities[i] = 1;
}
u_seq_counter=0;
for(i=0; i<ds.n_values;i++) {
if (!get_value_in_binary_array_at_index(visited_sites,i)) {
us.u_seq[u_seq_counter] = i;
set_value_in_binary_array_at_index(visited_sites,i);
for(j=0; j<ds.n_values;j++) {
if(!get_value_in_binary_array_at_index(visited_sites,j)) {
if (!strcmp(ds.sequences[i],ds.sequences[j]) &&
(strlen(ds.sequences[i]) == strlen(ds.sequences[j]))) {
set_value_in_binary_array_at_index(visited_sites,j);
us.multiplicities[u_seq_counter]++;
}
}
}
u_seq_counter++;
}
}
us.u_seq = (int*)realloc(us.u_seq, sizeof(int)*u_seq_counter);
us.multiplicities =
(int*)realloc(us.multiplicities, sizeof(int)*u_seq_counter);
us.n_seq = u_seq_counter;
return(us);
}
void dataset_to_fasta(FILE* f, dataset ds) {
int j, k;
for(j=0;j < ds.n_values; j++) {
fprintf(f,">sequence_%i\n",j);
for ( k = 0;
k < (ds.sequence_lengths[j]-1);
k++) {
if( k != 0 && k%50 == 0 ) {
fprintf(f, "\n");
fputc(ds.sequences[j][k],f);
} else {
fputc(ds.sequences[j][k],f);
}
}
if ( (ds.sequence_lengths[j]-1) != 0 &&
(ds.sequence_lengths[j]-1) %50 == 0 ) {
fprintf(f, "\n");
fputc(ds.sequences[j][k],f);
} else {
fputc(ds.sequences[j][k],f);
}
fprintf(f, "\n");
}
}
void reverse_complement_sequences(dataset* ds, char* binary_mask) {
int i,j;
char* buffer = (char*)malloc(sizeof(char)*ds->max_sequence_length);
for (i = 0 ; i<ds->n_values; i++) {
if (get_value_in_binary_array_at_index(binary_mask,i)) {
memcpy(buffer, ds->sequences[i], ds->sequence_lengths[i]);
for(j = 0; j<ds->sequence_lengths[i];j++) {
switch(buffer[ds->sequence_lengths[i]-1-j]) {
case 'A':
ds->sequences[i][j] = 'T';
break;
case 'C':
ds->sequences[i][j] = 'G';
break;
case 'G':
ds->sequences[i][j] = 'C';
break;
case 'T':
ds->sequences[i][j] = 'A';
break;
default:
ds->sequences[i][j] = buffer[ds->sequence_lengths[i]-1-j];
break;
}
}
}
}
free(buffer);
}
void reverse_sequences(dataset* ds, char* binary_mask) {
int i,j;
char* buffer = (char*)malloc(sizeof(char)*ds->max_sequence_length);
for (i = 0 ; i<ds->n_values; i++) {
if (get_value_in_binary_array_at_index(binary_mask,i)) {
memcpy(buffer, ds->sequences[i], ds->sequence_lengths[i]);
for (j = 0 ; j<ds->sequence_lengths[i]; j++) {
ds->sequences[i][j] = buffer[ds->sequence_lengths[i]-1-j];
}
}
}
free(buffer);
}
void write_unique_sequences(FILE* outfile, dataset ds, unique_sequences us) {
int i,k;
for(i=0; i<us.n_seq; i++) {
fprintf(outfile,">sequence_%i_mutiplicity_%i\n",
us.u_seq[i],
us.multiplicities[i]);
for( k = 0;
k < (strlen(ds.sequences[us.u_seq[i]]))-1;
k++) {
if( k != 0 && k%50 == 0 ) {
fprintf(outfile, "\n");
fputc(ds.sequences[us.u_seq[i]][k],outfile);
} else {
fputc(ds.sequences[us.u_seq[i]][k],outfile);
}
}
if( (strlen(ds.sequences[us.u_seq[i]])-1) != 0 &&
(strlen(ds.sequences[us.u_seq[i]])-1) %50 == 0 ) {
fprintf(outfile, "\n");
fputc(ds.sequences[us.u_seq[i]][k], outfile);
} else {
fputc(ds.sequences[us.u_seq[i]][k], outfile);
}
fprintf(outfile,"\n");
}
}
void char_sequence_to_binary(char* c_seq, char* b_seq, size_t seq_len) {
size_t i;
int j;
char bin;
for(i = 0; i< seq_len; i++) {
switch(c_seq[i]) {
case 'A':
bin = 0;
break;
case 'C':
bin = 1;
break;
case 'G':
bin = 2;
break;
case 'T':
bin = 3;
break;
default:
printf("unhandled character for binary conversion %c\n", c_seq[i]);
_exit(1);
}
j=i%4;
b_seq[i/4] |= (bin << 2*j);
}
}
char get_char_from_binary_sequence_at_index(char* b_seq, size_t idx) {
size_t b_idx = idx/4;
size_t b_off = idx%4;
char test_a = (1 << (b_off*2));
char test_b = (1 << (b_off*2)+1);
char res = 0;
char ret_val;
if(test_a == (b_seq[b_idx] & test_a)) res += 1;
if(test_b == (b_seq[b_idx] & test_b)) res += 2;
switch (res) {
case 0:
ret_val = 'A';
break;
case 1:
ret_val = 'C';
break;
case 2:
ret_val = 'G';
break;
case 3:
ret_val = 'T';
break;
}
return(ret_val);
}
void add_binary_sequences_to_dataset(dataset* ds) {
size_t i;
char** c_seq = ds->sequences;
size_t* lengths = ds->sequence_lengths;
char** b_seq = malloc(sizeof(ds->n_values)*sizeof(char*));
for(i = 0 ; i< ds->n_values; i++ ) {
b_seq[i] = (char*)malloc(sizeof(char)*(lengths[i]/4+1));
char_sequence_to_binary(c_seq[i], b_seq[i], lengths[i]);
}
}
dataset dataset_from_fasta(FILE* in) {
char* line = NULL;
size_t line_size = 0;
int sequences = 0;
dataset ds;
char linebuffer[2000];
size_t linebuffer_length;
size_t sequence_length;
int i;
size_t length_before_addition_of_line;
rewind(in);
while ( -1 != getline(&line, &line_size, in) ) {
if( line[0] == '>' ) sequences++;
}
rewind(in);
ds.sequence_lengths = (size_t*)malloc(sizeof(size_t*)*sequences);
ds.sequences = (char**)malloc(sizeof(char*)*sequences);
ds.n_values = sequences;
sequences = -1;
ds.max_sequence_length = 0;
while ( -1 != getline(&line, &line_size, in) ) {
if ( line[0] == '>') {
/* new sequence in file */
sequences++;
/* redress memory usage of previous sequence */
if(sequences > 0) {
ds.sequences[sequences-1] =
(char*)realloc(ds.sequences[sequences-1],
sizeof(char)*(ds.sequence_lengths[sequences-1]+1));
}
ds.sequences[sequences] = (char*)malloc(sizeof(char)*1001);
ds.sequences[sequences][0] = 0;
ds.sequence_lengths[sequences] = 0;
} else {
/* continuation of current sequence */
sscanf(line,"%s",linebuffer);
linebuffer_length = strlen(linebuffer);
length_before_addition_of_line = ds.sequence_lengths[sequences];
ds.sequence_lengths[sequences] += linebuffer_length;
if(ds.sequence_lengths[sequences] > 1000) {
ds.sequences[sequences] =
(char*)realloc(ds.sequences[sequences],
sizeof(char)*(ds.sequence_lengths[sequences]+1));
}
for(i=0; i < linebuffer_length; i++) {
linebuffer[i] = toupper(linebuffer[i]);
}
memcpy(ds.sequences[sequences]+length_before_addition_of_line,
linebuffer,
linebuffer_length);
ds.sequences[sequences][length_before_addition_of_line
+linebuffer_length] = 0;
}
}
for(i=0;i<ds.n_values;i++) {
if(ds.max_sequence_length < ds.sequence_lengths[i]) {
ds.max_sequence_length = ds.sequence_lengths[i];
}
}
free(line);
return(ds);
}
data_shape shape_from_kmer_file(int infile) {
data_shape s;
unsigned char current_character;
off_t size;
int i;
s.n_features = 0;
s.n_samples = 0;
size = lseek(infile, 0, SEEK_END);
lseek(infile, 0, SEEK_SET);
char* f_buffer = (char*)malloc(sizeof(char)*size);
if (size != read(infile, f_buffer, size)) {
printf("Could not read file in order to read obtain data shape\n");
_exit(1);
}
i = 0;
while( f_buffer[i] != '\n') {
if ( f_buffer[i] == '\t') s.n_features++;
i++;
}
while( i < size ) {
if ( f_buffer[i] == '\n' ) s.n_samples++;
i++;
}
return(s);
}
dataset load_kmer_from_file_into_dataset(FILE* in_file, data_shape shape) {
dataset ds;
int i, j;
char buffer[1024];
ds.n_dimensions = shape.n_features;
ds.n_values = shape.n_samples;
ds.values = (float**)malloc(sizeof(float*)*ds.n_dimensions);
for(i = 0 ; i < ds.n_dimensions; i++) {
#if defined(__AVX__) || defined(__SSE__)
if(0 != posix_memalign((void**)&ds.values[i],
#if defined(__AVX__)
32,
#elif defined(__SSE__)
16,
#endif
sizeof(float)*ds.n_values)) {
printf("Error allocating aligned memory for kmers \n");
_exit(1);
}
#else
ds.values[i] =
(float*)malloc(sizeof(float)*ds.n_values);
#endif
}
rewind(in_file);
for(i=0;i<shape.n_samples;i++) {
fscanf(in_file,"%s", buffer);
for(j=0;j<shape.n_features;j++) {
fscanf(in_file,"%f", ds.values[j]+i);
}
}
return(ds);
}
void load_projections_from_file_into_dataset(FILE* projections,
size_t dimensions,
dataset* ds) {