Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Windowed POA #335

Merged
merged 2 commits into from
Oct 22, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions bar/cactus_bar.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ void usage() {

fprintf(stderr, "-M --minimumCoverageToRescue : Unaligned segments must have at least this proportion of their bases covered by an outgroup to be rescued.\n");

fprintf(stderr, "-P --partialOrderAlignment : Use partial order aligner instead of Pecan for multiple alignment subproblems.\n");
fprintf(stderr, "-P --partialOrderAlignmentWindow (int >= 0): Use partial order aligner instead of Pecan for multiple alignment subproblems, on blocks up to given length (0=disable POA).\n");

fprintf(stderr, "-h --help : Print this help screen\n");
}
Expand Down Expand Up @@ -113,8 +113,9 @@ int main(int argc, char *argv[]) {
char *ingroupCoverageFilePath = NULL;
int64_t minimumSizeToRescue = 1;
double minimumCoverageToRescue = 0.0;
// toggle from pecan to abpoa for multiple alignment
bool poaMode = false;
// toggle from pecan to abpoa for multiple alignment, by setting to non-zero
// Note that poa uses about N^2, so maximum value is generally in 10s of kb
int64_t poaWindow = 0;

PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters = pairwiseAlignmentBandingParameters_construct();

Expand Down Expand Up @@ -144,12 +145,12 @@ int main(int argc, char *argv[]) {
{"minimumSizeToRescue", required_argument, 0, 'K'},
{"minimumCoverageToRescue", required_argument, 0, 'M'},
{ "minimumNumberOfSpecies", required_argument, 0, 'N' },
{"partialOrderAlignment", no_argument, 0, 'P'},
{"partialOrderAlignmentWindow", required_argument, 0, 'P'},
{ 0, 0, 0, 0 } };

int option_index = 0;

int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wy:A:B:D:E:FGI:J:K:L:M:N:P", long_options, &option_index);
int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wy:A:B:D:E:FGI:J:K:L:M:N:P:", long_options, &option_index);

if (key == -1) {
break;
Expand Down Expand Up @@ -271,7 +272,10 @@ int main(int argc, char *argv[]) {
}
break;
case 'P':
poaMode = true;
i = sscanf(optarg, "%" PRIi64 "", &poaWindow);
if (i != 1) {
st_errAbort("Error parsing poaLength parameter");
}
break;
default:
usage();
Expand Down Expand Up @@ -327,7 +331,7 @@ int main(int argc, char *argv[]) {
st_errAbort("The end %" PRIi64 " was not found in the flower\n", *((Name *)stList_get(names, i)));
}
stSortedSet *endAlignment = makeEndAlignment(sM, end, spanningTrees, maximumLength, useProgressiveMerging,
matchGamma, pairwiseAlignmentBandingParameters, poaMode);
matchGamma, pairwiseAlignmentBandingParameters, poaWindow);
writeEndAlignmentToDisk(end, endAlignment, fileHandle);
stSortedSet_destruct(endAlignment);
}
Expand Down Expand Up @@ -380,7 +384,7 @@ int main(int argc, char *argv[]) {
st_logInfo("Processing a flower\n");

stSortedSet *alignedPairs = makeFlowerAlignment3(sM, flower, listOfEndAlignmentFiles, spanningTrees, maximumLength,
useProgressiveMerging, matchGamma, pairwiseAlignmentBandingParameters, pruneOutStubAlignments, poaMode);
useProgressiveMerging, matchGamma, pairwiseAlignmentBandingParameters, pruneOutStubAlignments, poaWindow);
st_logInfo("Created the alignment: %" PRIi64 " pairs\n", stSortedSet_size(alignedPairs));
stPinchIterator *pinchIterator = stPinchIterator_constructFromAlignedPairs(alignedPairs, getNextAlignedPairAlignment);

Expand Down
8 changes: 4 additions & 4 deletions bar/impl/endAligner.c
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ int alignedPair_cmpFn(const AlignedPair *alignedPair1, const AlignedPair *aligne
stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength,
bool useProgressiveMerging, float gapGamma,
PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters,
bool poa) {
int64_t poaWindow) {
//Make an alignment of the sequences in the ends

//Get the adjacency sequences to be aligned.
Expand Down Expand Up @@ -87,8 +87,8 @@ stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees,

//Get the alignment.
MultipleAlignment *mA;
if (poa && stList_length(seqFrags) > 1) {
mA = makePartialOrderAlignment(sM, seqFrags, gapGamma, pairwiseAlignmentBandingParameters);
if (poaWindow > 0 && stList_length(seqFrags) > 1) {
mA = makePartialOrderAlignment(sM, seqFrags, gapGamma, pairwiseAlignmentBandingParameters, poaWindow);
} else {
mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters);
}
Expand Down Expand Up @@ -121,7 +121,7 @@ stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees,
int64_t commonInstanceNumber = *(int64_t *)stHash_search(endInstanceNumbers, otherEnd);
int64_t nonCommonInstanceNumber = stList_length(seqFrags) - commonInstanceNumber;

if (poa) {
if (poaWindow > 0) {
// hack hack hack
scoreAdjustmentsNonCommonEnds[i] = 1;
scoreAdjustmentsCommonEnds[i] = 1;
Expand Down
12 changes: 6 additions & 6 deletions bar/impl/flowerAligner.c
Original file line number Diff line number Diff line change
Expand Up @@ -524,7 +524,7 @@ static stSortedSet *getEndsToAlign(Flower *flower, int64_t maxSequenceLength) {
static void computeMissingEndAlignments(StateMachine *sM, Flower *flower, stHash *endAlignments, int64_t spanningTrees,
int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma,
PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters,
bool poa) {
int64_t poaWindow) {
/*
* Creates end alignments for the ends that
* do not have an alignment in the "endAlignments" hash, only creating
Expand All @@ -542,7 +542,7 @@ static void computeMissingEndAlignments(StateMachine *sM, Flower *flower, stHash
end,
makeEndAlignment(sM, end, spanningTrees, maxSequenceLength,
useProgressiveMerging, gapGamma,
pairwiseAlignmentBandingParameters, poa));
pairwiseAlignmentBandingParameters, poaWindow));
} else {
stHash_insert(endAlignments, end, stSortedSet_construct());
}
Expand All @@ -554,10 +554,10 @@ static void computeMissingEndAlignments(StateMachine *sM, Flower *flower, stHash

stSortedSet *makeFlowerAlignment(StateMachine *sM, Flower *flower, int64_t spanningTrees, int64_t maxSequenceLength,
bool useProgressiveMerging, float gapGamma,
PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, bool poa) {
PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, int64_t poaWindow) {
stHash *endAlignments = stHash_construct2(NULL, (void(*)(void *)) stSortedSet_destruct);
computeMissingEndAlignments(sM, flower, endAlignments, spanningTrees, maxSequenceLength,
useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters, poa);
useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters, poaWindow);
return makeFlowerAlignment2(flower, endAlignments, pruneOutStubAlignments);
}

Expand All @@ -579,13 +579,13 @@ static void loadEndAlignments(Flower *flower, stHash *endAlignments, stList *lis

stSortedSet *makeFlowerAlignment3(StateMachine *sM, Flower *flower, stList *listOfEndAlignmentFiles, int64_t spanningTrees,
int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma,
PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, bool poa) {
PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, int64_t poaWindow) {
stHash *endAlignments = stHash_construct2(NULL, (void(*)(void *)) stSortedSet_destruct);
if(listOfEndAlignmentFiles != NULL) {
loadEndAlignments(flower, endAlignments, listOfEndAlignmentFiles);
}
computeMissingEndAlignments(sM, flower, endAlignments, spanningTrees, maxSequenceLength,
useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters, poa);
useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters, poaWindow);
return makeFlowerAlignment2(flower, endAlignments, pruneOutStubAlignments);
}

Expand Down
166 changes: 121 additions & 45 deletions bar/impl/poaAligner.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

// char <--> uint8_t conversion copied over from abPOA example
// AaCcGgTtNn ==> 0,1,2,3,4
static unsigned char nst_nt4_table[256] = {
static uint8_t nst_nt4_table[256] = {
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4,
Expand All @@ -49,31 +49,50 @@ static inline uint8_t toByte(char c) {
return nst_nt4_table[(int)c];
}

/** Copy the fragment sequences into the matrix format that abpoa expects, using
* offsets and windowSize to subset as needed. updates bseqs and chunk_lens
*/
static void toPoaInput(stList* seqFrags, int64_t n_seqs, int64_t* seq_lens, int64_t* seq_offsets,
int64_t windowSize, uint8_t** bseqs, int* chunk_lens) {
for (int64_t i = 0; i < n_seqs; ++i) {
chunk_lens[i] = 0;
SeqFrag* frag = (SeqFrag*)stList_get(seqFrags, i);
for (int64_t j = seq_offsets[i]; j < seq_lens[i] && chunk_lens[i] < windowSize; ++j, ++chunk_lens[i]) {
// todo: support iupac characters?
bseqs[i][chunk_lens[i]] = toByte(frag->seq[j]);
}
}
}

MultipleAlignment *makePartialOrderAlignment(StateMachine *sM, stList *seqFrags,
float matchGamma,
PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters) {

PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters,
int64_t windowSize) {

// transorm the input for abpoa, using its example.c as a guide
int64_t n_seqs = stList_length(seqFrags);

// collect sequence length, trasform ACGT to 0123
int *seq_lens = (int*)malloc(sizeof(int) * n_seqs);
uint8_t **bseqs = (uint8_t**)malloc(sizeof(uint8_t*) * n_seqs);

stListIterator *it = stList_getIterator(seqFrags);
SeqFrag* frag;
int i = 0;
int j = 0;
while ((frag = stList_getNext(it)) != NULL) {
// lengths of each input fragment
int64_t *seq_lens = (int64_t*)st_malloc(sizeof(int64_t) * n_seqs);
int64_t bases_remaining = 0;

// keep track of current lengths
int *chunk_lens = (int*)st_malloc(sizeof(int) * n_seqs);
// keep track of current offsets
int64_t* seq_offsets = (int64_t*)st_calloc(n_seqs, sizeof(int64_t));
// keep track of empty chunks
bool* empty_seqs = (bool*)st_calloc(n_seqs, sizeof(bool));

// initialize the poa input matrix
uint8_t **bseqs = (uint8_t**)st_malloc(sizeof(uint8_t*) * n_seqs);
for (int64_t i = 0; i < n_seqs; ++i) {
SeqFrag* frag = stList_get(seqFrags, i);
seq_lens[i] = frag->length;
bseqs[i] = (uint8_t*)malloc(sizeof(uint8_t) * frag->length);
for (j = 0; j < frag->length; ++j) {
// todo: support iupac characters?
bseqs[i][j] = toByte(frag->seq[j]);
}
++i;
int64_t chunk_len = windowSize < seq_lens[i] ? windowSize : seq_lens[i];
bseqs[i] = (uint8_t*)st_malloc(sizeof(uint8_t) * (chunk_len));
bases_remaining += seq_lens[i];
}

// initialize variables
abpoa_t *ab = abpoa_init();
abpoa_para_t *abpt = abpoa_init_para();
Expand All @@ -99,32 +118,95 @@ MultipleAlignment *makePartialOrderAlignment(StateMachine *sM, stList *seqFrags,

abpoa_post_set_para(abpt);

// variables to store resulting MSA;
uint8_t **msa_seq; int msa_l=0;

// perform abpoa-msa
abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, bseqs, NULL, NULL, NULL, NULL, NULL, &msa_seq, &msa_l);

// todo: score
MultipleAlignment *mA = st_calloc(1, sizeof(MultipleAlignment));

mA->alignedPairs = poaMatrixToAlignedPairs(msa_seq, n_seqs, msa_l, 1.0, seqFrags);
mA->alignedPairs = stList_construct3(0, (void(*)(void *)) stIntTuple_destruct);
// todo: refactor the multiple alignment object, as these should be optional
mA->chosenPairwiseAlignments = stList_construct3(0, (void(*)(void *)) stIntTuple_destruct);
mA->columns = stSet_construct(); //makeColumns(seqFrags);

int64_t prev_bases_remaining = bases_remaining;
while (bases_remaining > 0) {

// load up to windowSize of each sequence into the input matrix for poa
toPoaInput(seqFrags, n_seqs, seq_lens, seq_offsets, windowSize, bseqs, chunk_lens);

// clean up
for (i = 0; i < n_seqs; ++i) {
free(msa_seq[i]);
// variables to store resulting MSA;
uint8_t **msa_seq; int msa_l=0;

// poa can't handle empty sequences. this is a hack to get around that
int emptyCount = 0;
for (int64_t i = 0; i < n_seqs; ++i) {
if (chunk_lens[i] == 0) {
empty_seqs[i] = true;
chunk_lens[i] = 1;
bseqs[i][0] = toByte('N');
++emptyCount;
} else {
empty_seqs[i] = false;
}
}

// perform abpoa-msa
abpoa_msa(ab, abpt, n_seqs, NULL, chunk_lens, bseqs, NULL, NULL, NULL, NULL, NULL, &msa_seq, &msa_l);

// mask out empty sequences that were phonied in as Ns above
for (int64_t i = 0; i < n_seqs && emptyCount > 0; ++i) {
if (empty_seqs[i] == true) {
for (int j = 0; j < msa_l; ++j) {
if (toBase(msa_seq[i][j]) != '-') {
assert(toBase(msa_seq[i][j]) == 'N');
msa_seq[i][j] = toByte('-');
--chunk_lens[i];
assert(chunk_lens[i] == 0);
--emptyCount;
break;
}
}
}
}
assert(emptyCount == 0);

// todo: lots of room to be smarter about finding the next anchor here by, for example, unaligning any
// bad-looking columns at the end of the matrix (and updating the various offset structures) or
// just using a sliding window with overlap, and ignoring alignments near the end

// todo: do we want to use global alignment each time?

// add the msa matrix as tuples to our aligned pairs list
// note that seq_offsets is modified in place here, updating the position along the original sequences
poaMatrixToAlignedPairs(msa_seq, n_seqs, msa_l, 1.0, seqFrags, seq_offsets, mA->alignedPairs);

// clean up
for (int64_t i = 0; i < n_seqs; ++i) {
free(msa_seq[i]);
}
free(msa_seq);

// remember how much we aligned this round
for (int64_t i = 0; i < n_seqs; ++i) {
bases_remaining -= chunk_lens[i];
}
// and use for sanity check
assert(prev_bases_remaining > bases_remaining && bases_remaining >= 0);

if (bases_remaining > 0) {
// reset graph before re-use
abpoa_reset_graph(ab, abpt, seq_lens[0]);
}

//used only for sanity check
prev_bases_remaining = bases_remaining;
}
free(msa_seq);

for (i = 0; i < n_seqs; ++i) {
for (int64_t i = 0; i < n_seqs; ++i) {
free(bseqs[i]);
}

free(bseqs);
free(seq_lens);
free(chunk_lens);
free(seq_offsets);
free(empty_seqs);
abpoa_free(ab, abpt);
abpoa_free_para(abpt);

Expand All @@ -134,31 +216,28 @@ MultipleAlignment *makePartialOrderAlignment(StateMachine *sM, stList *seqFrags,
return mA;
}

stList *poaMatrixToAlignedPairs(uint8_t** msaSeq, int numSeqs, int msaWidth, int score, stList* seqFrags) {
void poaMatrixToAlignedPairs(uint8_t** msaSeq, int64_t numSeqs, int64_t msaWidth, int64_t score, stList* seqFrags, int64_t* offsets, stList* outPairs) {
/*
fprintf(stdout, ">Multiple_sequence_alignment\n");
for (int i = 0; i < numSeqs; ++i) {
for (int j = 0; j < msaWidth; ++j) {
for (int64_t i = 0; i < numSeqs; ++i) {
for (int64_t j = 0; j < msaWidth; ++j) {
fprintf(stdout, "%c", toBase(msaSeq[i][j]));
}
fprintf(stdout, "\n");
}
*/

stList *alignedPairs = stList_construct3(0, (void(*)(void *)) stIntTuple_destruct);
int64_t* offsets = st_calloc(numSeqs, sizeof(int64_t));

for (int column = 0; column < msaWidth; ++column) { // For each column
for (int64_t column = 0; column < msaWidth; ++column) { // For each column
int64_t anchor = -1, anchorCoordinate;
for (int seqIdx = 0; seqIdx < numSeqs; ++seqIdx) {
for (int64_t seqIdx = 0; seqIdx < numSeqs; ++seqIdx) {
if(toBase(msaSeq[seqIdx][column]) != '-') { // If is not a gap
int64_t seqCoordinate = offsets[seqIdx]++;
if (toBase(msaSeq[seqIdx][column]) != 'N') { // If is not an ambiguous base
if (anchor == -1) { // Set as the anchoring base
anchor = seqIdx;
anchorCoordinate = seqCoordinate;
} else { // Otherwise make an aligned pair between the anchor and the base
stList_append(alignedPairs, stIntTuple_construct5(
stList_append(outPairs, stIntTuple_construct5(
score, anchor, anchorCoordinate,
seqIdx, seqCoordinate));
}
Expand All @@ -168,8 +247,5 @@ stList *poaMatrixToAlignedPairs(uint8_t** msaSeq, int numSeqs, int msaWidth, int
}
}

free(offsets);

return alignedPairs;
}

2 changes: 1 addition & 1 deletion bar/inc/endAligner.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ int alignedPair_cmpFn(const AlignedPair *alignedPair1, const AlignedPair *aligne
stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength,
bool useProgressiveMerging, float gapGamma,
PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters,
bool poa);
int64_t poaWindow);

/*
* Writes an end alignment to the given file.
Expand Down
Loading