From 96af3dbbc7146e5b336aefeaedb372a13aedb866 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 21 Oct 2020 10:58:52 -0400 Subject: [PATCH] (naively) break poa alignments into chunks --- bar/cactus_bar.c | 20 +-- bar/impl/endAligner.c | 8 +- bar/impl/flowerAligner.c | 12 +- bar/impl/poaAligner.c | 166 +++++++++++++++++------ bar/inc/endAligner.h | 2 +- bar/inc/flowerAligner.h | 4 +- bar/inc/poaAligner.h | 17 ++- bar/tests/runEndAlignment.c | 30 +++- src/cactus/cactus_progressive_config.xml | 1 + src/cactus/pipeline/cactus_workflow.py | 35 +++-- src/cactus/shared/common.py | 4 +- 11 files changed, 208 insertions(+), 91 deletions(-) diff --git a/bar/cactus_bar.c b/bar/cactus_bar.c index d0e1370a3..30ba8450f 100644 --- a/bar/cactus_bar.c +++ b/bar/cactus_bar.c @@ -70,7 +70,7 @@ void usage() { fprintf(stderr, "-M --minimumCoverageToRescue : Unaligned segments must have at least this proportion of their bases covered by an outgroup to be rescued.\n"); - fprintf(stderr, "-P --partialOrderAlignment : Use partial order aligner instead of Pecan for multiple alignment subproblems.\n"); + fprintf(stderr, "-P --partialOrderAlignmentWindow (int >= 0): Use partial order aligner instead of Pecan for multiple alignment subproblems, on blocks up to given length (0=disable POA).\n"); fprintf(stderr, "-h --help : Print this help screen\n"); } @@ -113,8 +113,9 @@ int main(int argc, char *argv[]) { char *ingroupCoverageFilePath = NULL; int64_t minimumSizeToRescue = 1; double minimumCoverageToRescue = 0.0; - // toggle from pecan to abpoa for multiple alignment - bool poaMode = false; + // toggle from pecan to abpoa for multiple alignment, by setting to non-zero + // Note that poa uses about N^2, so maximum value is generally in 10s of kb + int64_t poaWindow = 0; PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters = pairwiseAlignmentBandingParameters_construct(); @@ -144,12 +145,12 @@ int main(int argc, char *argv[]) { {"minimumSizeToRescue", required_argument, 0, 'K'}, {"minimumCoverageToRescue", required_argument, 0, 'M'}, { "minimumNumberOfSpecies", required_argument, 0, 'N' }, - {"partialOrderAlignment", no_argument, 0, 'P'}, + {"partialOrderAlignmentWindow", required_argument, 0, 'P'}, { 0, 0, 0, 0 } }; int option_index = 0; - int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wy:A:B:D:E:FGI:J:K:L:M:N:P", long_options, &option_index); + int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wy:A:B:D:E:FGI:J:K:L:M:N:P:", long_options, &option_index); if (key == -1) { break; @@ -271,7 +272,10 @@ int main(int argc, char *argv[]) { } break; case 'P': - poaMode = true; + i = sscanf(optarg, "%" PRIi64 "", &poaWindow); + if (i != 1) { + st_errAbort("Error parsing poaLength parameter"); + } break; default: usage(); @@ -327,7 +331,7 @@ int main(int argc, char *argv[]) { st_errAbort("The end %" PRIi64 " was not found in the flower\n", *((Name *)stList_get(names, i))); } stSortedSet *endAlignment = makeEndAlignment(sM, end, spanningTrees, maximumLength, useProgressiveMerging, - matchGamma, pairwiseAlignmentBandingParameters, poaMode); + matchGamma, pairwiseAlignmentBandingParameters, poaWindow); writeEndAlignmentToDisk(end, endAlignment, fileHandle); stSortedSet_destruct(endAlignment); } @@ -380,7 +384,7 @@ int main(int argc, char *argv[]) { st_logInfo("Processing a flower\n"); stSortedSet *alignedPairs = makeFlowerAlignment3(sM, flower, listOfEndAlignmentFiles, spanningTrees, maximumLength, - useProgressiveMerging, matchGamma, pairwiseAlignmentBandingParameters, pruneOutStubAlignments, poaMode); + useProgressiveMerging, matchGamma, pairwiseAlignmentBandingParameters, pruneOutStubAlignments, poaWindow); st_logInfo("Created the alignment: %" PRIi64 " pairs\n", stSortedSet_size(alignedPairs)); stPinchIterator *pinchIterator = stPinchIterator_constructFromAlignedPairs(alignedPairs, getNextAlignedPairAlignment); diff --git a/bar/impl/endAligner.c b/bar/impl/endAligner.c index 035d75cf3..72ad97d46 100644 --- a/bar/impl/endAligner.c +++ b/bar/impl/endAligner.c @@ -56,7 +56,7 @@ int alignedPair_cmpFn(const AlignedPair *alignedPair1, const AlignedPair *aligne stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma, PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, - bool poa) { + int64_t poaWindow) { //Make an alignment of the sequences in the ends //Get the adjacency sequences to be aligned. @@ -87,8 +87,8 @@ stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, //Get the alignment. MultipleAlignment *mA; - if (poa && stList_length(seqFrags) > 1) { - mA = makePartialOrderAlignment(sM, seqFrags, gapGamma, pairwiseAlignmentBandingParameters); + if (poaWindow > 0 && stList_length(seqFrags) > 1) { + mA = makePartialOrderAlignment(sM, seqFrags, gapGamma, pairwiseAlignmentBandingParameters, poaWindow); } else { mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters); } @@ -121,7 +121,7 @@ stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t commonInstanceNumber = *(int64_t *)stHash_search(endInstanceNumbers, otherEnd); int64_t nonCommonInstanceNumber = stList_length(seqFrags) - commonInstanceNumber; - if (poa) { + if (poaWindow > 0) { // hack hack hack scoreAdjustmentsNonCommonEnds[i] = 1; scoreAdjustmentsCommonEnds[i] = 1; diff --git a/bar/impl/flowerAligner.c b/bar/impl/flowerAligner.c index 461311c64..175a73931 100644 --- a/bar/impl/flowerAligner.c +++ b/bar/impl/flowerAligner.c @@ -524,7 +524,7 @@ static stSortedSet *getEndsToAlign(Flower *flower, int64_t maxSequenceLength) { static void computeMissingEndAlignments(StateMachine *sM, Flower *flower, stHash *endAlignments, int64_t spanningTrees, int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma, PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, - bool poa) { + int64_t poaWindow) { /* * Creates end alignments for the ends that * do not have an alignment in the "endAlignments" hash, only creating @@ -542,7 +542,7 @@ static void computeMissingEndAlignments(StateMachine *sM, Flower *flower, stHash end, makeEndAlignment(sM, end, spanningTrees, maxSequenceLength, useProgressiveMerging, gapGamma, - pairwiseAlignmentBandingParameters, poa)); + pairwiseAlignmentBandingParameters, poaWindow)); } else { stHash_insert(endAlignments, end, stSortedSet_construct()); } @@ -554,10 +554,10 @@ static void computeMissingEndAlignments(StateMachine *sM, Flower *flower, stHash stSortedSet *makeFlowerAlignment(StateMachine *sM, Flower *flower, int64_t spanningTrees, int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma, - PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, bool poa) { + PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, int64_t poaWindow) { stHash *endAlignments = stHash_construct2(NULL, (void(*)(void *)) stSortedSet_destruct); computeMissingEndAlignments(sM, flower, endAlignments, spanningTrees, maxSequenceLength, - useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters, poa); + useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters, poaWindow); return makeFlowerAlignment2(flower, endAlignments, pruneOutStubAlignments); } @@ -579,13 +579,13 @@ static void loadEndAlignments(Flower *flower, stHash *endAlignments, stList *lis stSortedSet *makeFlowerAlignment3(StateMachine *sM, Flower *flower, stList *listOfEndAlignmentFiles, int64_t spanningTrees, int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma, - PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, bool poa) { + PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, int64_t poaWindow) { stHash *endAlignments = stHash_construct2(NULL, (void(*)(void *)) stSortedSet_destruct); if(listOfEndAlignmentFiles != NULL) { loadEndAlignments(flower, endAlignments, listOfEndAlignmentFiles); } computeMissingEndAlignments(sM, flower, endAlignments, spanningTrees, maxSequenceLength, - useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters, poa); + useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters, poaWindow); return makeFlowerAlignment2(flower, endAlignments, pruneOutStubAlignments); } diff --git a/bar/impl/poaAligner.c b/bar/impl/poaAligner.c index 2fdaf6886..62edc72ce 100644 --- a/bar/impl/poaAligner.c +++ b/bar/impl/poaAligner.c @@ -22,7 +22,7 @@ // char <--> uint8_t conversion copied over from abPOA example // AaCcGgTtNn ==> 0,1,2,3,4 -static unsigned char nst_nt4_table[256] = { +static uint8_t nst_nt4_table[256] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, @@ -49,31 +49,50 @@ static inline uint8_t toByte(char c) { return nst_nt4_table[(int)c]; } +/** Copy the fragment sequences into the matrix format that abpoa expects, using + * offsets and windowSize to subset as needed. updates bseqs and chunk_lens + */ +static void toPoaInput(stList* seqFrags, int64_t n_seqs, int64_t* seq_lens, int64_t* seq_offsets, + int64_t windowSize, uint8_t** bseqs, int* chunk_lens) { + for (int64_t i = 0; i < n_seqs; ++i) { + chunk_lens[i] = 0; + SeqFrag* frag = (SeqFrag*)stList_get(seqFrags, i); + for (int64_t j = seq_offsets[i]; j < seq_lens[i] && chunk_lens[i] < windowSize; ++j, ++chunk_lens[i]) { + // todo: support iupac characters? + bseqs[i][chunk_lens[i]] = toByte(frag->seq[j]); + } + } +} + MultipleAlignment *makePartialOrderAlignment(StateMachine *sM, stList *seqFrags, float matchGamma, - PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters) { - + PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, + int64_t windowSize) { + // transorm the input for abpoa, using its example.c as a guide int64_t n_seqs = stList_length(seqFrags); - // collect sequence length, trasform ACGT to 0123 - int *seq_lens = (int*)malloc(sizeof(int) * n_seqs); - uint8_t **bseqs = (uint8_t**)malloc(sizeof(uint8_t*) * n_seqs); - - stListIterator *it = stList_getIterator(seqFrags); - SeqFrag* frag; - int i = 0; - int j = 0; - while ((frag = stList_getNext(it)) != NULL) { + // lengths of each input fragment + int64_t *seq_lens = (int64_t*)st_malloc(sizeof(int64_t) * n_seqs); + int64_t bases_remaining = 0; + + // keep track of current lengths + int *chunk_lens = (int*)st_malloc(sizeof(int) * n_seqs); + // keep track of current offsets + int64_t* seq_offsets = (int64_t*)st_calloc(n_seqs, sizeof(int64_t)); + // keep track of empty chunks + bool* empty_seqs = (bool*)st_calloc(n_seqs, sizeof(bool)); + + // initialize the poa input matrix + uint8_t **bseqs = (uint8_t**)st_malloc(sizeof(uint8_t*) * n_seqs); + for (int64_t i = 0; i < n_seqs; ++i) { + SeqFrag* frag = stList_get(seqFrags, i); seq_lens[i] = frag->length; - bseqs[i] = (uint8_t*)malloc(sizeof(uint8_t) * frag->length); - for (j = 0; j < frag->length; ++j) { - // todo: support iupac characters? - bseqs[i][j] = toByte(frag->seq[j]); - } - ++i; + int64_t chunk_len = windowSize < seq_lens[i] ? windowSize : seq_lens[i]; + bseqs[i] = (uint8_t*)st_malloc(sizeof(uint8_t) * (chunk_len)); + bases_remaining += seq_lens[i]; } - + // initialize variables abpoa_t *ab = abpoa_init(); abpoa_para_t *abpt = abpoa_init_para(); @@ -99,32 +118,95 @@ MultipleAlignment *makePartialOrderAlignment(StateMachine *sM, stList *seqFrags, abpoa_post_set_para(abpt); - // variables to store resulting MSA; - uint8_t **msa_seq; int msa_l=0; - - // perform abpoa-msa - abpoa_msa(ab, abpt, n_seqs, NULL, seq_lens, bseqs, NULL, NULL, NULL, NULL, NULL, &msa_seq, &msa_l); - // todo: score MultipleAlignment *mA = st_calloc(1, sizeof(MultipleAlignment)); - - mA->alignedPairs = poaMatrixToAlignedPairs(msa_seq, n_seqs, msa_l, 1.0, seqFrags); + mA->alignedPairs = stList_construct3(0, (void(*)(void *)) stIntTuple_destruct); // todo: refactor the multiple alignment object, as these should be optional mA->chosenPairwiseAlignments = stList_construct3(0, (void(*)(void *)) stIntTuple_destruct); mA->columns = stSet_construct(); //makeColumns(seqFrags); + + int64_t prev_bases_remaining = bases_remaining; + while (bases_remaining > 0) { + + // load up to windowSize of each sequence into the input matrix for poa + toPoaInput(seqFrags, n_seqs, seq_lens, seq_offsets, windowSize, bseqs, chunk_lens); - // clean up - for (i = 0; i < n_seqs; ++i) { - free(msa_seq[i]); + // variables to store resulting MSA; + uint8_t **msa_seq; int msa_l=0; + + // poa can't handle empty sequences. this is a hack to get around that + int emptyCount = 0; + for (int64_t i = 0; i < n_seqs; ++i) { + if (chunk_lens[i] == 0) { + empty_seqs[i] = true; + chunk_lens[i] = 1; + bseqs[i][0] = toByte('N'); + ++emptyCount; + } else { + empty_seqs[i] = false; + } + } + + // perform abpoa-msa + abpoa_msa(ab, abpt, n_seqs, NULL, chunk_lens, bseqs, NULL, NULL, NULL, NULL, NULL, &msa_seq, &msa_l); + + // mask out empty sequences that were phonied in as Ns above + for (int64_t i = 0; i < n_seqs && emptyCount > 0; ++i) { + if (empty_seqs[i] == true) { + for (int j = 0; j < msa_l; ++j) { + if (toBase(msa_seq[i][j]) != '-') { + assert(toBase(msa_seq[i][j]) == 'N'); + msa_seq[i][j] = toByte('-'); + --chunk_lens[i]; + assert(chunk_lens[i] == 0); + --emptyCount; + break; + } + } + } + } + assert(emptyCount == 0); + + // todo: lots of room to be smarter about finding the next anchor here by, for example, unaligning any + // bad-looking columns at the end of the matrix (and updating the various offset structures) or + // just using a sliding window with overlap, and ignoring alignments near the end + + // todo: do we want to use global alignment each time? + + // add the msa matrix as tuples to our aligned pairs list + // note that seq_offsets is modified in place here, updating the position along the original sequences + poaMatrixToAlignedPairs(msa_seq, n_seqs, msa_l, 1.0, seqFrags, seq_offsets, mA->alignedPairs); + + // clean up + for (int64_t i = 0; i < n_seqs; ++i) { + free(msa_seq[i]); + } + free(msa_seq); + + // remember how much we aligned this round + for (int64_t i = 0; i < n_seqs; ++i) { + bases_remaining -= chunk_lens[i]; + } + // and use for sanity check + assert(prev_bases_remaining > bases_remaining && bases_remaining >= 0); + + if (bases_remaining > 0) { + // reset graph before re-use + abpoa_reset_graph(ab, abpt, seq_lens[0]); + } + + //used only for sanity check + prev_bases_remaining = bases_remaining; } - free(msa_seq); - for (i = 0; i < n_seqs; ++i) { + for (int64_t i = 0; i < n_seqs; ++i) { free(bseqs[i]); } - free(bseqs); free(seq_lens); + free(chunk_lens); + free(seq_offsets); + free(empty_seqs); abpoa_free(ab, abpt); abpoa_free_para(abpt); @@ -134,23 +216,20 @@ MultipleAlignment *makePartialOrderAlignment(StateMachine *sM, stList *seqFrags, return mA; } -stList *poaMatrixToAlignedPairs(uint8_t** msaSeq, int numSeqs, int msaWidth, int score, stList* seqFrags) { +void poaMatrixToAlignedPairs(uint8_t** msaSeq, int64_t numSeqs, int64_t msaWidth, int64_t score, stList* seqFrags, int64_t* offsets, stList* outPairs) { /* fprintf(stdout, ">Multiple_sequence_alignment\n"); - for (int i = 0; i < numSeqs; ++i) { - for (int j = 0; j < msaWidth; ++j) { + for (int64_t i = 0; i < numSeqs; ++i) { + for (int64_t j = 0; j < msaWidth; ++j) { fprintf(stdout, "%c", toBase(msaSeq[i][j])); } fprintf(stdout, "\n"); } */ - stList *alignedPairs = stList_construct3(0, (void(*)(void *)) stIntTuple_destruct); - int64_t* offsets = st_calloc(numSeqs, sizeof(int64_t)); - - for (int column = 0; column < msaWidth; ++column) { // For each column + for (int64_t column = 0; column < msaWidth; ++column) { // For each column int64_t anchor = -1, anchorCoordinate; - for (int seqIdx = 0; seqIdx < numSeqs; ++seqIdx) { + for (int64_t seqIdx = 0; seqIdx < numSeqs; ++seqIdx) { if(toBase(msaSeq[seqIdx][column]) != '-') { // If is not a gap int64_t seqCoordinate = offsets[seqIdx]++; if (toBase(msaSeq[seqIdx][column]) != 'N') { // If is not an ambiguous base @@ -158,7 +237,7 @@ stList *poaMatrixToAlignedPairs(uint8_t** msaSeq, int numSeqs, int msaWidth, int anchor = seqIdx; anchorCoordinate = seqCoordinate; } else { // Otherwise make an aligned pair between the anchor and the base - stList_append(alignedPairs, stIntTuple_construct5( + stList_append(outPairs, stIntTuple_construct5( score, anchor, anchorCoordinate, seqIdx, seqCoordinate)); } @@ -168,8 +247,5 @@ stList *poaMatrixToAlignedPairs(uint8_t** msaSeq, int numSeqs, int msaWidth, int } } - free(offsets); - - return alignedPairs; } diff --git a/bar/inc/endAligner.h b/bar/inc/endAligner.h index 96dc84cc4..a983cbcba 100644 --- a/bar/inc/endAligner.h +++ b/bar/inc/endAligner.h @@ -50,7 +50,7 @@ int alignedPair_cmpFn(const AlignedPair *alignedPair1, const AlignedPair *aligne stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma, PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, - bool poa); + int64_t poaWindow); /* * Writes an end alignment to the given file. diff --git a/bar/inc/flowerAligner.h b/bar/inc/flowerAligner.h index c44dd1a29..83de7209c 100644 --- a/bar/inc/flowerAligner.h +++ b/bar/inc/flowerAligner.h @@ -25,14 +25,14 @@ */ stSortedSet *makeFlowerAlignment(StateMachine *sM, Flower *flower, int64_t spanningTrees, int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma, - PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, bool poa); + PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, int64_t poaWindow); /* * As above, but including alignments from disk. */ stSortedSet *makeFlowerAlignment3(StateMachine *sM, Flower *flower, stList *listOfEndAlignmentFiles, int64_t spanningTrees, int64_t maxSequenceLength, bool useProgressiveMerging, float gapGamma, - PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, bool poa); + PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, bool pruneOutStubAlignments, int64_t poaWindow); /* * Ascertain which ends should be aligned separately. diff --git a/bar/inc/poaAligner.h b/bar/inc/poaAligner.h index bcce50c0c..9d0ec71ee 100644 --- a/bar/inc/poaAligner.h +++ b/bar/inc/poaAligner.h @@ -22,12 +22,11 @@ * For larger numbers of sequences, it can be orders of magnitude faster than Pecan. But... the maximum * length needs to be bounded or it'll just crash out right away trying to allocated a big matrix. */ -MultipleAlignment *makePartialOrderAlignment(StateMachine *sM, stList *seqFrags, +MultipleAlignment *makePartialOrderAlignment(StateMachine *sM, + stList *seqFrags, float matchGamma, - PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters); - - - + PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters, + int64_t windowSize); /** * Convert the gapped matrix structure that abpoa returns into list of aligned pairs. The algorithm used is @@ -36,6 +35,12 @@ MultipleAlignment *makePartialOrderAlignment(StateMachine *sM, stList *seqFrags, * - keep scanning, reporting a pairwise alignment of each subsequent sequence and the anchor if non-gapped * So the number of pairs here is O(N * L) (or linear in the size of the MSA matrix) */ -stList *poaMatrixToAlignedPairs(uint8_t** msaSeq, int numSeqs, int msaWidth, int score, stList* seqFrags); +void poaMatrixToAlignedPairs(uint8_t** msaSeq, + int64_t numSeqs, + int64_t msaWidth, + int64_t score, + stList* seqFrags, + int64_t* offsets, + stList* outPairs); #endif diff --git a/bar/tests/runEndAlignment.c b/bar/tests/runEndAlignment.c index ae2da6c14..e130fba88 100644 --- a/bar/tests/runEndAlignment.c +++ b/bar/tests/runEndAlignment.c @@ -75,6 +75,10 @@ void usage() { fprintf(stderr, "-M --minimumCoverageToRescue : Unaligned segments must have at least this proportion of their bases covered by an outgroup to be rescued.\n"); + fprintf(stderr, "-P --partialOrderAlignmentLength (int >= 0): Use partial order aligner instead of Pecan for multiple alignment subproblems, on blocks up to given length (0=disable POA).\n"); + + fprintf(stderr, "-n --noPecan: Dont do pecan, just poa\n"); + fprintf(stderr, "-h --help : Print this help screen\n"); } @@ -122,6 +126,8 @@ int main(int argc, char *argv[]) { char *ingroupCoverageFilePath = NULL; int64_t minimumSizeToRescue = 1; double minimumCoverageToRescue = 0.0; + int64_t poaWindow = 10000; + bool doPecan = true; PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters = pairwiseAlignmentBandingParameters_construct(); @@ -158,11 +164,13 @@ int main(int argc, char *argv[]) { {"minimumCoverageToRescue", required_argument, 0, 'M'}, { "minimumNumberOfSpecies", required_argument, 0, 'N' }, {"inputFasta", required_argument, 0, 'f'}, + {"partialOrderAlignmentLength", required_argument, 0, 'P'}, + {"noPecan", no_argument, 0, 'n'}, { 0, 0, 0, 0 } }; int option_index = 0; - int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wy:A:B:D:E:FGI:J:K:L:M:N:f:", long_options, &option_index); + int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wy:A:B:D:E:FGI:J:K:L:M:N:f:P:n", long_options, &option_index); if (key == -1) { break; @@ -286,6 +294,15 @@ int main(int argc, char *argv[]) { st_errAbort("Error parsing minimumNumberOfSpecies parameter"); } break; + case 'P': + i = sscanf(optarg, "%" PRIi64 "", &poaWindow); + if (i != 1) { + st_errAbort("Error parsing poaWindow parameter"); + } + break; + case 'n': + doPecan = false; + break; default: usage(); return 1; @@ -320,15 +337,20 @@ int main(int argc, char *argv[]) { fprintf(stderr, "doing pecan alignment\n"); clock_t t = clock(); - MultipleAlignment *mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging, pairwiseAlignmentBandingParameters->gapGamma, pairwiseAlignmentBandingParameters); + MultipleAlignment *mA = NULL; + if (doPecan) { + mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging, pairwiseAlignmentBandingParameters->gapGamma, pairwiseAlignmentBandingParameters); + } t = clock() - t; double pecanTime = ((double)t)/CLOCKS_PER_SEC; - print_results(mA); + if (doPecan) { + print_results(mA); + } fprintf(stderr, "doing poa alignment\n"); t = clock(); - MultipleAlignment *pA = makePartialOrderAlignment(sM, seqFrags, pairwiseAlignmentBandingParameters->gapGamma, pairwiseAlignmentBandingParameters); + MultipleAlignment *pA = makePartialOrderAlignment(sM, seqFrags, pairwiseAlignmentBandingParameters->gapGamma, pairwiseAlignmentBandingParameters, poaWindow); t = clock() - t; double abpoaTime = ((double)t)/CLOCKS_PER_SEC; diff --git a/src/cactus/cactus_progressive_config.xml b/src/cactus/cactus_progressive_config.xml index d55be659f..5c3e3b981 100644 --- a/src/cactus/cactus_progressive_config.xml +++ b/src/cactus/cactus_progressive_config.xml @@ -178,6 +178,7 @@ minimumSizeToRescue="100" minimumCoverageToRescue="0.5" partialOrderAlignment="0" + partialOrderAlignmentWindow="10000" > diff --git a/src/cactus/pipeline/cactus_workflow.py b/src/cactus/pipeline/cactus_workflow.py index 05095f5b5..87880797a 100644 --- a/src/cactus/pipeline/cactus_workflow.py +++ b/src/cactus/pipeline/cactus_workflow.py @@ -881,7 +881,8 @@ def runBarForJob(self, fileStore=None, features=None, calculateWhichEndsToComput minimumSizeToRescue=self.getOptionalPhaseAttrib("minimumSizeToRescue"), minimumCoverageToRescue=self.getOptionalPhaseAttrib("minimumCoverageToRescue"), minimumNumberOfSpecies=self.getOptionalPhaseAttrib("minimumNumberOfSpecies", int), - partialOrderAlignment=self.getOptionalPhaseAttrib("partialOrderAlignment", bool)) + partialOrderAlignment=self.getOptionalPhaseAttrib("partialOrderAlignment", bool), + partialOrderAlignmentWindow=self.getOptionalPhaseAttrib("partialOrderAlignmentWindow", int)) class CactusBarWrapper(CactusRecursionJob): """Runs the BAR algorithm implementation. @@ -889,8 +890,12 @@ class CactusBarWrapper(CactusRecursionJob): def featuresFn(self): """Merges both end size features and flower features--they will both have an impact on resource usage.""" - maximumLength=self.getOptionalPhaseAttrib("bandingLimit", int) - d = {'poaSize': min(float(max(self.flowerSizes)) / 2, maximumLength)} + if self.getOptionalPhaseAttrib("partialOrderAlignment"): + maximumLength=self.getOptionalPhaseAttrib("partialOrderAlignmentWindow", int) + assert maximumLength is not None + d = {'poaSize': min(float(max(self.flowerSizes)) / 2, maximumLength)} + else: + d = {} d.update(self.flowerFeatures()) return d @@ -980,20 +985,20 @@ def featuresFn(self): d.update(self.flowerFeatures()) return d - memoryPoly = [1.495e-03, 4.87e+09] - feature = 'maxEndSize' - memoryCap = 40e09 - def __init__(self, phaseNode, constantsNode, cactusDiskDatabaseString, flowerNames, flowerSizes, overlarge, endsToAlign, endSizes, cactusWorkflowArguments): self.cactusWorkflowArguments = cactusWorkflowArguments self.endsToAlign = endsToAlign self.endSizes = endSizes self.phaseNode = phaseNode + self.memoryPoly = [1.495e-03, 4.87e+09] + self.feature = 'maxEndSize' + self.memoryCap = 40e09 # poa mode needs a little extra memory if self.getOptionalPhaseAttrib("partialOrderAlignment"): - maximumLength = self.getOptionalPhaseAttrib("bandingLimit", int) + maximumLength = self.getOptionalPhaseAttrib("partialOrderAlignmentWindow", int) + assert maximumLength is not None self.memoryPoly[-1] += 23 * maximumLength * maximumLength CactusRecursionJob.__init__(self, phaseNode, constantsNode, cactusDiskDatabaseString, flowerNames, flowerSizes, overlarge, cactusWorkflowArguments=self.cactusWorkflowArguments, preemptable=True) @@ -1013,18 +1018,20 @@ def run(self, fileStore): class CactusBarWrapperWithPrecomputedEndAlignments(CactusRecursionJob): """Runs the BAR algorithm implementation with some precomputed end alignments.""" - featuresFn = lambda self: {'alignmentsSize': sum([fileID.size for fileID in self.precomputedAlignmentIDs])} - feature = 'alignmentsSize' - memoryPoly = [1.99700749e+00, 3.29659639e+08] - def __init__(self, *args, **kwargs): self.cactusWorkflowArguments = kwargs["cactusWorkflowArguments"] self.phaseNode = kwargs["phaseNode"] + self.precomputedAlignmentIDs = kwargs['precomputedAlignmentIDs'] + self.memoryPoly = [1.99700749e+00, 3.29659639e+08] + self.featuresFn = lambda : {'alignmentsSize': sum([fileID.size for fileID in self.precomputedAlignmentIDs])} + self.feature = 'alignmentsSize' # poa mode needs a little extra memory if self.getOptionalPhaseAttrib("partialOrderAlignment"): - maximumLength = self.getOptionalPhaseAttrib("bandingLimit", int) - self.memoryPoly[0] += 0.0000023 * maximumLength * maximumLength + maximumLength = self.getOptionalPhaseAttrib("partialOrderAlignmentWindow", int) + assert maximumLength is not None + alignmentsSize = self.featuresFn()['alignmentsSize'] + self.memoryPoly[1] += 0.0000023 * min(alignmentsSize, 10000000) * maximumLength * maximumLength CactusRecursionJob.__init__(self, *args, **kwargs) diff --git a/src/cactus/shared/common.py b/src/cactus/shared/common.py index 5728d4d85..c8e125a16 100644 --- a/src/cactus/shared/common.py +++ b/src/cactus/shared/common.py @@ -494,6 +494,7 @@ def runCactusBar(cactusDiskDatabaseString, flowerNames, logLevel=None, minimumCoverageToRescue=None, minimumNumberOfSpecies=None, partialOrderAlignment=None, + partialOrderAlignmentWindow=None, jobName=None, fileStore=None, features=None): @@ -550,7 +551,8 @@ def runCactusBar(cactusDiskDatabaseString, flowerNames, logLevel=None, if minimumNumberOfSpecies is not None: args += ["--minimumNumberOfSpecies", str(minimumNumberOfSpecies)] if partialOrderAlignment is True: - args += ["--partialOrderAlignment"] + assert partialOrderAlignmentWindow is not None and int(partialOrderAlignmentWindow) > 1 + args += ["--partialOrderAlignmentWindow", str(partialOrderAlignmentWindow)] masterMessages = cactus_call(stdin_string=flowerNames, check_output=True, parameters=["cactus_bar"] + args,