From ea00971842d915b5305d8ff1ef065832750110bf Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 11 Dec 2023 20:12:38 -0500 Subject: [PATCH 1/2] try to orient bar jobs on reference to left-align downstream --- api/impl/cactus_params_parser.c | 33 +++++++++++++++++++++++++++ api/inc/cactus_params_parser.h | 7 +++++- bar/impl/bar.c | 11 ++++++++- bar/impl/flowerAligner.c | 39 +++++++++++++++++++++++++++----- bar/impl/poaBarAligner.c | 4 ++-- bar/inc/flowerAligner.h | 3 ++- bar/inc/poaBarAligner.h | 4 +++- bar/tests/poaBarTest.c | 4 ++-- src/cactus/setup/cactus_align.py | 3 +++ 9 files changed, 94 insertions(+), 14 deletions(-) diff --git a/api/impl/cactus_params_parser.c b/api/impl/cactus_params_parser.c index 4784f7d8c..c11c7c6aa 100644 --- a/api/impl/cactus_params_parser.c +++ b/api/impl/cactus_params_parser.c @@ -174,3 +174,36 @@ double cactusParams_get_float(CactusParams *p, int num, ...) { va_end(args); return j; } + +static bool cactusParams_has_attr2(CactusParams *p, int num, va_list *args) { + va_list args2; + va_copy(args2, *args); // to use the variable args we must copy it - see https://wiki.sei.cmu.edu/confluence/display/c/MSC39-C.+Do+not+call+va_arg%28%29+on+a+va_list+that+has+an+indeterminate+value + xmlNodePtr c = get_descendant_node(p->cur, num-1, &args2); + + if(c == NULL) { + return false; + } + const char *attribute_name = NULL; + for (int64_t i = 0; i < num; i++) { // Loop to discard the earlier strings in the input + attribute_name = va_arg(args2, char *); + } + char *v = (char *)xmlGetProp(c, (const xmlChar *)attribute_name); + + if(v == NULL) { + return false; + } + + va_end(args2); + + return true; + +} + +bool cactusParams_has_attr(CactusParams *p, int num, ...) { + va_list args; + bool ret; + va_start(args, num); + ret = cactusParams_has_attr2(p, num, &args); + va_end(args); + return ret; +} diff --git a/api/inc/cactus_params_parser.h b/api/inc/cactus_params_parser.h index 8db4f3348..751f61c8d 100644 --- a/api/inc/cactus_params_parser.h +++ b/api/inc/cactus_params_parser.h @@ -39,7 +39,7 @@ void cactusParams_set_root(CactusParams *p, int num, ...); /* * Get a string parameter. */ -char *cactusParams_get_string(CactusParams *p, int, ...); +char *cactusParams_get_string(CactusParams *p, int num, ...); /* * Get an integer parameter. @@ -56,4 +56,9 @@ double cactusParams_get_float(CactusParams *p, int, ...); */ int64_t *cactusParams_get_ints(CactusParams *p, int64_t *length, int, ...); +/* + * Check if attribute exists + */ +bool cactusParams_has_attr(CactusParams *p, int, ...); + #endif /* ST_CACTUS_PARAMS_PARSER_H_ */ diff --git a/bar/impl/bar.c b/bar/impl/bar.c index 36ff8ef68..39edecabd 100644 --- a/bar/impl/bar.c +++ b/bar/impl/bar.c @@ -93,6 +93,15 @@ void bar(stList *flowers, CactusParams *params, CactusDisk *cactusDisk, stList * fa->minimumDegree = cactusParams_get_int(params, 2, "bar", "minimumBlockDegree"); fa->minimumNumberOfSpecies = cactusParams_get_int(params, 2, "bar", "minimumNumberOfSpecies"); fa->flower = flower; + // This option uses orients bar alignments so they are forward on the given event (to left-align in output) + Name forwardEventName = NULL_NAME; + if (cactusParams_has_attr(params, 3, "bar", "poa", "referenceEventName")) { + char *forwardEventHeader = cactusParams_get_string(params, 3, "bar", "poa", "referenceEventName"); + Event *forwardEvent = eventTree_getEventByHeader(flower_getEventTree(flower), forwardEventHeader); + assert(forwardEvent != NULL); + forwardEventName = event_getName(forwardEvent); + free(forwardEventHeader); + } void *alignments; if (usePoa) { @@ -101,7 +110,7 @@ void bar(stList *flowers, CactusParams *params, CactusDisk *cactusDisk, stList * * * It does not use any precomputed alignments, if they are provided they will be ignored */ - alignments = make_flower_alignment_poa(flower, maximumLength, poaWindow, maskFilter, poaParameters); + alignments = make_flower_alignment_poa(flower, maximumLength, poaWindow, maskFilter, poaParameters, forwardEventName); st_logDebug("Created the poa alignments: %" PRIi64 " poa alignment blocks for flower\n", stList_length(alignments)); } else { alignments = makeFlowerAlignment3(sM, flower, listOfEndAlignmentFiles, spanningTrees, maximumLength, diff --git a/bar/impl/flowerAligner.c b/bar/impl/flowerAligner.c index f1221c282..faa1f0aa9 100644 --- a/bar/impl/flowerAligner.c +++ b/bar/impl/flowerAligner.c @@ -459,7 +459,7 @@ int64_t getMaxAdjacencyLength(Flower *flower) { return maxAdjacencyLength; } -End *getDominantEnd(Flower *flower) { +End *getDominantEnd(Flower *flower, Name ref_event_name) { //return NULL; assert(flower_getGroupNumber(flower) <= 1); Flower_EndIterator *endIt = flower_getEndIterator(flower); @@ -467,11 +467,38 @@ End *getDominantEnd(Flower *flower) { int64_t maxInstanceNumber = 0; // Get the end with the largest number of caps End *dominantEnd = NULL; + // break ties with the largest number of forward sequences (prioritizing the ref_event_name) + int64_t dominantForwardRefCount = 0; + int64_t dominantForwardCount = 0; while ((end = flower_getNextEnd(endIt)) != NULL) { - if (end_getInstanceNumber(end) > maxInstanceNumber) { - maxInstanceNumber = end_getInstanceNumber(end); - dominantEnd = end; - } + int64_t endInstanceNumber = end_getInstanceNumber(end); + if (endInstanceNumber >= maxInstanceNumber) { + // compute the stranding information + End_InstanceIterator *endIt = end_getInstanceIterator(end); + Cap *cap; + int64_t forwardRefCount = 0; + int64_t forwardCount = 0; + while ((cap = end_getNext(endIt)) != NULL) { + Cap *adj_cap = cap_getAdjacency(cap); + if (adj_cap != NULL && cap_getCoordinate(cap) < cap_getCoordinate(adj_cap)) { + ++forwardCount; + Event *event = cap_getEvent(cap); + if (ref_event_name != NULL_NAME && event != NULL && event_getName(event) == ref_event_name) { + ++forwardRefCount; + } + } + } + end_destructInstanceIterator(endIt); + if (endInstanceNumber > maxInstanceNumber || + (endInstanceNumber == maxInstanceNumber && + (forwardRefCount > dominantForwardRefCount || + (forwardRefCount == dominantForwardRefCount && forwardCount > dominantForwardCount)))) { + maxInstanceNumber = endInstanceNumber; + dominantEnd = end; + dominantForwardRefCount = forwardRefCount; + dominantForwardCount = forwardCount; + } + } } flower_destructEndIterator(endIt); if (dominantEnd == NULL) { // This will only be true if there are no ends or only ends with no caps @@ -504,7 +531,7 @@ static stSortedSet *getEndsToAlign(Flower *flower, int64_t maxSequenceLength) { * Gets a set of the ends that we need to construct actual alignments for. */ stSortedSet *endsToAlign = stSortedSet_construct(); - End *dominantEnd = getDominantEnd(flower); //an end to which all adjacencies are incident + End *dominantEnd = getDominantEnd(flower, NULL_NAME); //an end to which all adjacencies are incident if (dominantEnd != NULL && getMaxAdjacencyLength(flower) <= 2 * maxSequenceLength) { stSortedSet_insert(endsToAlign, dominantEnd); } else { diff --git a/bar/impl/poaBarAligner.c b/bar/impl/poaBarAligner.c index d5b85f860..248b45d2f 100644 --- a/bar/impl/poaBarAligner.c +++ b/bar/impl/poaBarAligner.c @@ -1094,8 +1094,8 @@ int64_t getMaxSequenceLength(End *end) { } stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size, int64_t mask_filter, - abpoa_para_t * poa_parameters) { - End *dominantEnd = getDominantEnd(flower); + abpoa_para_t * poa_parameters, Name forward_event_name) { + End *dominantEnd = getDominantEnd(flower, forward_event_name); int64_t seq_no = dominantEnd != NULL ? end_getInstanceNumber(dominantEnd) : -1; if(dominantEnd != NULL && getMaxSequenceLength(dominantEnd) < max_seq_length) { /* diff --git a/bar/inc/flowerAligner.h b/bar/inc/flowerAligner.h index 44d663c94..e3acffc8e 100644 --- a/bar/inc/flowerAligner.h +++ b/bar/inc/flowerAligner.h @@ -36,8 +36,9 @@ stSortedSet *makeFlowerAlignment3(StateMachine *sM, Flower *flower, stList *list /* * Returns an end, if exists, that has cap involved in every adjacency, else returns null. + * In case of tie, choose the end that would align forward along ref_event_name */ -End *getDominantEnd(Flower *flower); +End *getDominantEnd(Flower *flower, Name ref_event_name); /* * Ascertain which ends should be aligned separately. diff --git a/bar/inc/poaBarAligner.h b/bar/inc/poaBarAligner.h index 2d3a65af0..bf24c2fe9 100644 --- a/bar/inc/poaBarAligner.h +++ b/bar/inc/poaBarAligner.h @@ -138,13 +138,15 @@ char *get_adjacency_string(Cap *cap, int *length, bool return_string); * @param mask_filter Trim input sequences if encountering this many consecutive soft of hard masked bases (0 = disabled) * @param poa_band_constant abpoa "b" parameter, where adaptive band is b+f* (b < 0 = disabled) * @param poa_band_fraction abpoa "f" parameter, where adaptive band is b+f* (b < 0 = disabled) + * @param forward_event_name try to orient (via getDominantEnd) alignments so they are forward wrt to this even (use NULL_NAME to disable) * Returns a list of AlignmentBlock ojects */ stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size, int64_t mask_filter, - abpoa_para_t * poa_parameters); + abpoa_para_t * poa_parameters, + Name forward_event_name); /** * Create a pinch iterator for a list of alignment blocks. diff --git a/bar/tests/poaBarTest.c b/bar/tests/poaBarTest.c index d9950dd0c..d7bd15414 100644 --- a/bar/tests/poaBarTest.c +++ b/bar/tests/poaBarTest.c @@ -212,7 +212,7 @@ void test_make_flower_alignment_poa(CuTest *testCase) { } flower_destructEndIterator(endIterator); - stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000, 5, abpt); + stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000, 5, abpt, NULL_NAME); for(int64_t i=0; iwf = 0.01; abpoa_post_set_para(abpt); - stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000, 5, abpt); + stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000, 5, abpt, NULL_NAME); abpoa_free_para(abpt); #ifdef stderr_logging diff --git a/src/cactus/setup/cactus_align.py b/src/cactus/setup/cactus_align.py index 82a8978fb..f2a7992ce 100644 --- a/src/cactus/setup/cactus_align.py +++ b/src/cactus/setup/cactus_align.py @@ -318,6 +318,9 @@ def make_align_job(options, toil, config_wrapper=None, chrom_name=None): barNode.attrib["minimumBlockDegree"] = "1" # turn off POA seeding poaNode.attrib["partialOrderAlignmentDisableSeeding"] = "1" + if options.reference: + # activate left-aligning for the reference inside BAR + poaNode.attrib["referenceEventName"] = options.reference[0] # import the PAF alignments paf_id = toil.importFile(makeURL(options.pafFile)) From a5c1eb42062faf1df850d1d4b37b06ea3581ec9f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 13 Dec 2023 11:54:58 -0500 Subject: [PATCH 2/2] flip abpoa input/output if end is backwards on reference --- bar/impl/flowerAligner.c | 41 ++++++++++++++++++++++++++-------------- bar/impl/poaBarAligner.c | 29 +++++++++++++++++++++++++++- bar/inc/flowerAligner.h | 5 +++-- 3 files changed, 58 insertions(+), 17 deletions(-) diff --git a/bar/impl/flowerAligner.c b/bar/impl/flowerAligner.c index faa1f0aa9..02626cfb8 100644 --- a/bar/impl/flowerAligner.c +++ b/bar/impl/flowerAligner.c @@ -459,16 +459,16 @@ int64_t getMaxAdjacencyLength(Flower *flower) { return maxAdjacencyLength; } -End *getDominantEnd(Flower *flower, Name ref_event_name) { - //return NULL; +End *getDominantEnd(Flower *flower, Name refEventName, bool* outIsFlipped) { assert(flower_getGroupNumber(flower) <= 1); Flower_EndIterator *endIt = flower_getEndIterator(flower); End *end; int64_t maxInstanceNumber = 0; // Get the end with the largest number of caps End *dominantEnd = NULL; - // break ties with the largest number of forward sequences (prioritizing the ref_event_name) - int64_t dominantForwardRefCount = 0; + // break ties with the largest number of forward sequences (prioritizing the refEventName) + int64_t dominantForwardRefBases = 0; + int64_t dominantReverseRefBases = 0; int64_t dominantForwardCount = 0; while ((end = flower_getNextEnd(endIt)) != NULL) { int64_t endInstanceNumber = end_getInstanceNumber(end); @@ -476,31 +476,44 @@ End *getDominantEnd(Flower *flower, Name ref_event_name) { // compute the stranding information End_InstanceIterator *endIt = end_getInstanceIterator(end); Cap *cap; - int64_t forwardRefCount = 0; + int64_t forwardRefBases = 0; + int64_t reverseRefBases = 0; int64_t forwardCount = 0; while ((cap = end_getNext(endIt)) != NULL) { Cap *adj_cap = cap_getAdjacency(cap); - if (adj_cap != NULL && cap_getCoordinate(cap) < cap_getCoordinate(adj_cap)) { - ++forwardCount; + if (adj_cap != NULL) { + int64_t cap_coord = cap_getCoordinate(cap); + int64_t adj_coord = cap_getCoordinate(adj_cap); Event *event = cap_getEvent(cap); - if (ref_event_name != NULL_NAME && event != NULL && event_getName(event) == ref_event_name) { - ++forwardRefCount; + bool is_ref = refEventName != NULL_NAME && event != NULL && event_getName(event) == refEventName; + if (cap_coord < adj_coord) { + ++forwardCount; + if (is_ref) { + forwardRefBases += adj_coord - cap_coord; + } + } else if (cap_coord > adj_coord && is_ref) { + reverseRefBases += cap_coord - adj_coord; } } } end_destructInstanceIterator(endIt); if (endInstanceNumber > maxInstanceNumber || (endInstanceNumber == maxInstanceNumber && - (forwardRefCount > dominantForwardRefCount || - (forwardRefCount == dominantForwardRefCount && forwardCount > dominantForwardCount)))) { + (forwardRefBases > dominantForwardRefBases || + (forwardRefBases == dominantForwardRefBases && forwardCount > dominantForwardCount)))) { maxInstanceNumber = endInstanceNumber; dominantEnd = end; - dominantForwardRefCount = forwardRefCount; + dominantForwardRefBases = forwardRefBases; + dominantReverseRefBases = reverseRefBases; dominantForwardCount = forwardCount; } - } + } } + flower_destructEndIterator(endIt); + if (outIsFlipped != NULL) { + *outIsFlipped = dominantReverseRefBases > dominantForwardRefBases; + } if (dominantEnd == NULL) { // This will only be true if there are no ends or only ends with no caps return NULL; } @@ -531,7 +544,7 @@ static stSortedSet *getEndsToAlign(Flower *flower, int64_t maxSequenceLength) { * Gets a set of the ends that we need to construct actual alignments for. */ stSortedSet *endsToAlign = stSortedSet_construct(); - End *dominantEnd = getDominantEnd(flower, NULL_NAME); //an end to which all adjacencies are incident + End *dominantEnd = getDominantEnd(flower, NULL_NAME, NULL); //an end to which all adjacencies are incident if (dominantEnd != NULL && getMaxAdjacencyLength(flower) <= 2 * maxSequenceLength) { stSortedSet_insert(endsToAlign, dominantEnd); } else { diff --git a/bar/impl/poaBarAligner.c b/bar/impl/poaBarAligner.c index 248b45d2f..841fb0366 100644 --- a/bar/impl/poaBarAligner.c +++ b/bar/impl/poaBarAligner.c @@ -1095,7 +1095,8 @@ int64_t getMaxSequenceLength(End *end) { stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size, int64_t mask_filter, abpoa_para_t * poa_parameters, Name forward_event_name) { - End *dominantEnd = getDominantEnd(flower, forward_event_name); + bool is_flipped = false; + End *dominantEnd = getDominantEnd(flower, forward_event_name, &is_flipped); int64_t seq_no = dominantEnd != NULL ? end_getInstanceNumber(dominantEnd) : -1; if(dominantEnd != NULL && getMaxSequenceLength(dominantEnd) < max_seq_length) { /* @@ -1110,8 +1111,34 @@ stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_ Cap *indices_to_caps[seq_no]; get_end_sequences(dominantEnd, end_strings, end_string_lengths, overlaps, indices_to_caps, max_seq_length, mask_filter); + + if (is_flipped) { + for (int64_t row = 0; row < seq_no; ++row) { + // todo: in place reverse complement + char *flipped_string = stString_reverseComplementString(end_strings[row]); + free(end_strings[row]); + end_strings[row] = flipped_string; + } + } + Msa *msa = msa_make_partial_order_alignment(end_strings, end_string_lengths, seq_no, window_size, poa_parameters); + if (is_flipped) { + for (int64_t row = 0; row < seq_no; ++row) { + // todo: in place reverse complement + char *flipped_string = stString_reverseComplementString(msa->seqs[row]); + free(msa->seqs[row]); + msa->seqs[row] = flipped_string; + uint8_t *flipped_row = st_malloc(sizeof(uint8_t*) * msa->column_no); + for (int64_t col = 0; col < msa->column_no; ++col) { + char c = stString_reverseComplementChar(msa_to_base(msa->seqs[row][col])); + flipped_row[msa->column_no - 1 - col] = msa_to_byte(c); + } + free(msa->seqs[row]); + msa->seqs[row] = flipped_row; + } + } + //Now convert to set of alignment blocks stList *alignment_blocks = stList_construct3(0, (void (*)(void *))alignmentBlock_destruct); create_alignment_blocks(msa, indices_to_caps, alignment_blocks); diff --git a/bar/inc/flowerAligner.h b/bar/inc/flowerAligner.h index e3acffc8e..a4d7e7d5d 100644 --- a/bar/inc/flowerAligner.h +++ b/bar/inc/flowerAligner.h @@ -36,9 +36,10 @@ stSortedSet *makeFlowerAlignment3(StateMachine *sM, Flower *flower, stList *list /* * Returns an end, if exists, that has cap involved in every adjacency, else returns null. - * In case of tie, choose the end that would align forward along ref_event_name + * In case of tie, choose the end that would align forward along refEventName + * If the dominant end is still flipped along the reference, set outIsFlipped (if it isn't NULL) to true */ -End *getDominantEnd(Flower *flower, Name ref_event_name); +End *getDominantEnd(Flower *flower, Name refEventName, bool* outIsFlipped); /* * Ascertain which ends should be aligned separately.