|
#include <stdlib.h> |
|
#include <ctype.h> |
|
#include <stdio.h> |
|
#include <string.h> |
|
|
|
#include "include/weights.h" |
|
#include "include/twister.h" |
|
|
|
#ifdef USE_FLOAT |
|
char* weights_fmt = "%0.8e\n"; |
|
#else |
|
char* weights_fmt = "%0.16e\n"; |
|
#endif |
|
|
|
void MSAReweightSequences(alignment_t *ali, options_t *options) { |
|
|
|
|
|
|
|
|
|
for (int i = 0; i < ali->nSeqs; i++) ali->weights[i] = 1.0; |
|
|
|
|
|
if (options->theta >= 0 && options->theta <= 1) { |
|
|
|
|
|
|
|
if (options->fastWeights > 0 && options->fastWeights < ali->nSeqs) { |
|
|
|
int nClusters = options->fastWeights; |
|
int nIterations = 10; |
|
int nSeqs = ali->nSeqs; |
|
int nCodes = ali->nCodes; |
|
int nSites = ali->nSites; |
|
|
|
#define COUNTS(i,j,a) counts[i * nSites * nCodes + j * nCodes + a] |
|
#define CONSENSUS(i,j) consensus[i * nSites + j] |
|
#define ALI(i,j) aliPermute[i * nSites + j] |
|
|
|
|
|
int *clusters = (int *) malloc(nClusters * sizeof(int)); |
|
letter_t *consensus = (letter_t *) malloc(nClusters * nSites * sizeof(letter_t)); |
|
for (int i = 0; i < nClusters; i++) clusters[i] = i; |
|
for (int i = nClusters; i < nSeqs; i++) { |
|
int ix = genrand_int32() % (i); |
|
if (ix < nClusters) clusters[ix] = i; |
|
} |
|
for (int i = 0; i < nClusters; i++) |
|
for (int j = 0; j < nSites; j++) |
|
CONSENSUS(i,j) = seq(clusters[i], j); |
|
free(clusters); |
|
|
|
|
|
int *assignment = (int *) malloc(nSeqs * sizeof(int)); |
|
int *counts = (int *) malloc(nClusters * nSites * nCodes * sizeof(int)); |
|
int *radii = (int *) malloc(nClusters * sizeof(int)); |
|
for (int i = 0; i < nSeqs; i++) assignment[i] = 0; |
|
fprintf(stderr, "Clustering"); |
|
for (int t = 0; t < nIterations; t++) { |
|
fprintf(stderr, "."); |
|
|
|
for (int i = 0; i < nClusters; i++) radii[i] = 0; |
|
#pragma omp parallel for |
|
for (int s = 0; s < nSeqs; s++) { |
|
int ixOld = assignment[s]; |
|
|
|
numeric_t distance = 0; |
|
for (int j = 0; j < nSites; j++) |
|
distance += (CONSENSUS(ixOld, j) != seq(s, j)); |
|
|
|
int ixNew = ixOld; |
|
for (int i = 0; i < nClusters; i++) { |
|
numeric_t distanceI = 0; |
|
for (int j = 0; j < nSites; j++) |
|
distanceI += (CONSENSUS(i, j) != seq(s, j)); |
|
if (distanceI < distance) { |
|
ixNew = i; |
|
distance = distanceI; |
|
} |
|
} |
|
if (ixNew != ixOld) assignment[s] = ixNew; |
|
if (radii[ixNew] < distance) radii[ixNew] = distance; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (t < nIterations - 1) { |
|
for (int i = 0; i < nClusters * nSites * nCodes; i++) |
|
counts[i] = 0; |
|
for (int s = 0; s < nSeqs; s++) |
|
for (int j = 0; j < nSites; j++) |
|
COUNTS(assignment[s], j, seq(s, j)) += 1; |
|
#pragma omp parallel for |
|
for (int i = 0; i < nClusters; i++) |
|
for (int j = 0; j < nSites; j++) { |
|
int topCode = 0; |
|
int topCounts = COUNTS(i, j, 0); |
|
for (int b = 1; b < nCodes; b++) |
|
if (COUNTS(i, j, b) > topCounts) { |
|
topCode = b; |
|
topCounts = COUNTS(i, j, b); |
|
} |
|
CONSENSUS(i ,j) = topCode; |
|
} |
|
} |
|
} |
|
fprintf(stderr, "\n"); |
|
|
|
|
|
numeric_t *clusterID = (numeric_t *) malloc(nClusters * nClusters * sizeof(numeric_t)); |
|
for (int i = 0; i < nClusters * nClusters; i++) clusterID[i] = 0; |
|
#pragma omp parallel for |
|
for (int pi = 0; pi < nClusters; pi++) |
|
for (int pj = 0; pj < nClusters; pj++) |
|
for (int j = 0; j < nSites; j++) |
|
clusterID[pi + pj * nClusters] += |
|
(CONSENSUS(pi,j) == CONSENSUS(pj,j)); |
|
|
|
free(consensus); |
|
free(counts); |
|
|
|
|
|
int *clusterSizes = (int *) malloc(nClusters * sizeof(int)); |
|
int *clusterStart = (int *) malloc(nClusters * sizeof(int)); |
|
int *clusterEnd = (int *) malloc(nClusters * sizeof(int)); |
|
int *permuteMap = (int *) malloc(nSeqs * sizeof(int)); |
|
numeric_t *weightsP = (numeric_t *) malloc(nSeqs * sizeof(numeric_t)); |
|
letter_t *aliPermute = (letter_t *) malloc(nSeqs * nSites * sizeof(letter_t)); |
|
for (int i = 0; i < nClusters; i++) clusterSizes[i] = 0; |
|
for (int s = 0; s < ali->nSeqs; s++) |
|
clusterSizes[assignment[s]] += 1; |
|
int ix = 0; |
|
for (int i = 0; i < nClusters; i++) { |
|
clusterStart[i] = ix; |
|
ix += clusterSizes[i]; |
|
clusterEnd[i] = ix; |
|
} |
|
ix = 0; |
|
for (int i = 0; i < nClusters; i++) |
|
for (int s = 0; s < ali->nSeqs; s++) |
|
if (assignment[s] == i) { |
|
for (int j = 0; j < nSites; j++) |
|
ALI(ix,j) = seq(s,j); |
|
permuteMap[ix] = s; |
|
ix++; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
numeric_t cutoff = (numeric_t) ((1 - options->theta) * ali->nSites); |
|
for (int s = 0; s < nSeqs; s++) weightsP[s] = 1; |
|
#pragma omp parallel for |
|
for (int ci = 0; ci < nClusters; ci++) |
|
for (int cj = 0; cj < nClusters; cj++) |
|
if (clusterID[ci * nClusters + cj] >= 0.9 * cutoff) |
|
for (int s = clusterStart[ci]; s < clusterEnd[ci]; s++) |
|
for (int t = clusterStart[cj]; t < clusterEnd[cj]; t++) |
|
if (s != t) { |
|
int id = 0; |
|
for (int n = 0; n < ali->nSites; n++) |
|
id += (ALI(s, n) == ALI(t, n)); |
|
if (id >= cutoff) weightsP[s] += 1.0; |
|
} |
|
for (int s = 0; s < nSeqs; s++) |
|
ali->weights[permuteMap[s]] = weightsP[s]; |
|
|
|
#undef COUNTS |
|
#undef CONSENSUS |
|
#undef ALI |
|
|
|
free(clusterSizes); |
|
free(clusterStart); |
|
free(clusterEnd); |
|
free(permuteMap); |
|
free(weightsP); |
|
free(radii); |
|
free(aliPermute); |
|
} else { |
|
|
|
#if defined(_OPENMP) |
|
|
|
#pragma omp parallel for |
|
for (int s = 0; s < ali->nSeqs; s++) |
|
for (int t = 0; t < ali->nSeqs; t++) |
|
if (s != t) { |
|
int id = 0; |
|
for (int n = 0; n < ali->nSites; n++) |
|
id += (seq(s, n) == seq(t, n)); |
|
if (id >= ((1 - options->theta) * ali->nSites)) |
|
ali->weights[s] += 1.0; |
|
} |
|
#else |
|
|
|
for (int s = 0; s < ali->nSeqs - 1; s++) |
|
for (int t = s + 1; t < ali->nSeqs; t++) { |
|
int id = 0; |
|
for (int n = 0; n < ali->nSites; n++) |
|
id += (seq(s, n) == seq(t, n)); |
|
if (id >= ((1 - options->theta) * ali->nSites)) { |
|
ali->weights[s] += 1.0; |
|
ali->weights[t] += 1.0; |
|
} |
|
} |
|
#endif |
|
} |
|
|
|
|
|
for (int i = 0; i < ali->nSeqs; i++) |
|
ali->weights[i] = 1.0 / ali->weights[i]; |
|
} |
|
|
|
|
|
for (int i = 0; i < ali->nSeqs; i++) |
|
ali->weights[i] *= options->scale; |
|
|
|
|
|
ali->nEff = 0; |
|
for (int i = 0; i < ali->nSeqs; i++) ali->nEff += ali->weights[i]; |
|
|
|
if (options->theta >= 0 && options->theta <= 1) { |
|
fprintf(stderr, |
|
"Effective number of samples: %.1f\t(%.0f%% identical neighborhood = %.3f samples)\n", |
|
ali->nEff, 100 * (1 - options->theta), options->scale); |
|
} else { |
|
fprintf(stderr, |
|
"Theta not between 0 and 1, no sequence reweighting applied (N = %.2f)\n", |
|
ali->nEff); |
|
} |
|
} |
|
|
|
int ValidateCustomWeightsFile(char *weightsFile, alignment_t *ali) { |
|
|
|
|
|
FILE *fp = fopen(weightsFile, "r"); |
|
if (fp == NULL) { |
|
fprintf(stderr, "Error: weights file %s does not exist\n", weightsFile); |
|
fclose(fp); |
|
return 1; |
|
} |
|
|
|
|
|
int nLines = 0; |
|
int MAX_LINE_LENGTH = 1024; |
|
char line[MAX_LINE_LENGTH]; |
|
while (fgets(line, MAX_LINE_LENGTH, fp) != NULL) nLines++; |
|
|
|
int nSeqsRaw = ali->nSeqs + ali->nSkippedSeqs; |
|
if (nLines != nSeqsRaw) { |
|
fprintf(stderr, "Error: weights file %s has %d lines, but alignment has %d sequences\n", |
|
weightsFile, nLines, nSeqsRaw); |
|
fclose(fp); |
|
return 1; |
|
} |
|
|
|
fclose(fp); |
|
return 0; |
|
} |
|
|
|
void ReadCustomWeightsFile(char *weightsFile, alignment_t *ali) { |
|
|
|
|
|
|
|
int validCode = ValidateCustomWeightsFile(weightsFile, ali); |
|
if (validCode != 0) { |
|
fprintf(stderr, "Error: weights file %s is invalid\n", weightsFile); |
|
exit(1); |
|
} |
|
|
|
|
|
FILE *fp = fopen(weightsFile, "r"); |
|
if (fp == NULL) { |
|
fprintf(stderr, "Error: could not open weights file %s\n", weightsFile); |
|
exit(1); |
|
} |
|
|
|
for (int i = 0; i < ali->nSeqs; i++) ali->weights[i] = 1.0; |
|
|
|
|
|
int skippedIdx = 0, reducedIdx = 0, nWarnings = 0, maxWarnings = 64; |
|
int nSeqsTotal = ali->nSeqs + ali->nSkippedSeqs; |
|
for (int i = 0; i < nSeqsTotal; i++) { |
|
long double w; |
|
if (fscanf(fp, "%Lf", &w) != 1) { |
|
fprintf(stderr, "Error reading weights file %s at position %d\n", weightsFile, i); |
|
exit(1); |
|
} |
|
|
|
if ((skippedIdx < ali->nSkippedSeqs) && (i == ali->skippedSeqs[skippedIdx])) { |
|
if (w > 0 && nWarnings < maxWarnings) { |
|
fprintf(stderr, "Warning: Skipped nonzero weight in file %s at position %d\n", weightsFile, i); |
|
nWarnings++; |
|
} |
|
skippedIdx++; |
|
continue; |
|
} else { |
|
ali->weights[reducedIdx] = (numeric_t)w; |
|
reducedIdx++; |
|
} |
|
} |
|
fclose(fp); |
|
|
|
|
|
ali->nEff = 0; |
|
for (int i = 0; i < ali->nSeqs; i++) ali->nEff += ali->weights[i]; |
|
|
|
fprintf(stderr, |
|
"Weights loaded successfully. Effective number of samples (to 1 decimal place): %.1f.\n", |
|
ali->nEff); |
|
} |
|
|
|
void WriteWeightsFile(char *weightsFile, alignment_t *ali) { |
|
|
|
|
|
FILE *fpOutput = fopen(weightsFile, "w"); |
|
if (fpOutput == NULL) { |
|
fprintf(stderr, "Error: could not open weights file %s\n", weightsFile); |
|
exit(1); |
|
} |
|
|
|
|
|
int skipix = 0, reducedix = 0; |
|
for (int i = 0; i < (ali->nSeqs + ali->nSkippedSeqs); i++) { |
|
if (skipix < ali->nSkippedSeqs && i == ali->skippedSeqs[skipix]) { |
|
|
|
fprintf(fpOutput, "0.0\n"); |
|
skipix++; |
|
} else { |
|
numeric_t w = ali->weights[reducedix]; |
|
fprintf(fpOutput, weights_fmt, w); |
|
reducedix++; |
|
} |
|
} |
|
fclose(fpOutput); |
|
} |
|
|