diff --git a/imgs/coverage.svg b/imgs/coverage.svg index a9915353..ea71b13a 100644 --- a/imgs/coverage.svg +++ b/imgs/coverage.svg @@ -17,7 +17,7 @@ coverage - 89% - 89% + 90% + 90% diff --git a/repo_utils/answer_key/bench/bench_bnd/candidate.refine.bed b/repo_utils/answer_key/bench/bench_bnd/candidate.refine.bed new file mode 100644 index 00000000..c5259f82 --- /dev/null +++ b/repo_utils/answer_key/bench/bench_bnd/candidate.refine.bed @@ -0,0 +1,64 @@ +chr1 181135 181156 +chr1 6005049 6006670 +chr1 224013761 224013782 +chr10 1238267 1238288 +chr10 39478703 39478724 +chr10 132505641 132506422 +chr11 964817 964838 +chr11 44662925 44662946 +chr11 48880455 48880476 +chr11 48888289 48888310 +chr11 68232223 68232244 +chr11 80583401 80583422 +chr11 105902622 105902643 +chr12 2572354 2572375 +chr12 10428997 10444441 +chr12 26566549 26566649 +chr12 106703356 106718437 +chr12 132242311 132242332 +chr13 56380894 56381040 +chr13 114073358 114074122 +chr14 102919538 102919632 +chr15 74894096 74894117 +chr16 1025231 1025252 +chr16 49626994 49627100 +chr16 89536459 89536480 +chr17 31964205 32004138 +chr17 77850848 77850938 +chr17 83120464 83120485 +chr17 83127668 83127689 +chr18 70465189 70465210 +chr18 78390326 78390347 +chr19 395626 395835 +chr19 3477781 3477868 +chr19 29439299 29439424 +chr19 38981201 38983536 +chr19 55068010 55068465 +chr2 9737589 9737610 +chr2 138335007 138335083 +chr2 157917136 157917157 +chr2 238119351 238119427 +chr20 16410045 16410066 +chr20 38428231 38428252 +chr20 62755268 62755289 +chr20 63443033 63443054 +chr22 31903282 31903303 +chr22 45328219 45328240 +chr22 47732364 47732476 +chr22 49494347 49494368 +chr3 112164423 112164444 +chr3 184714589 184714610 +chr4 1429347 1434609 +chr4 8631479 8637158 +chr5 71009525 71015235 +chr5 176031036 176031057 +chr7 23502979 23503000 +chr7 68922678 68950323 +chr7 75988853 75988924 +chr7 158151440 158151461 +chr8 48193098 48193205 +chr8 77023596 77040442 +chr8 143669349 143669370 +chr9 104414084 104414105 +chr9 136535716 136535737 +chrX 125901244 125901387 \ No newline at end of file diff --git a/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz b/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz new file mode 100644 index 00000000..1dcb017f Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz differ diff --git a/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz.tbi new file mode 100644 index 00000000..819c3953 Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz.tbi differ diff --git a/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz b/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz new file mode 100644 index 00000000..9b8a4e53 Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz differ diff --git a/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz.tbi new file mode 100644 index 00000000..2179aaeb Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz.tbi differ diff --git a/repo_utils/answer_key/bench/bench_bnd/log.txt b/repo_utils/answer_key/bench/bench_bnd/log.txt new file mode 100644 index 00000000..258b41b8 --- /dev/null +++ b/repo_utils/answer_key/bench/bench_bnd/log.txt @@ -0,0 +1,61 @@ +2025-01-05 07:08:37,421 [INFO] Truvari v5.0.0 +2025-01-05 07:08:37,422 [INFO] Command /data/truvari/__main__.py bench -b repo_utils/test_files/variants/bnd.base.vcf.gz -c repo_utils/test_files/variants/bnd.comp.vcf.gz -p 0 -o test_results/bench_bnd/ +2025-01-05 07:08:37,423 [INFO] Params: +{ + "base": "/data/repo_utils/test_files/variants/bnd.base.vcf.gz", + "comp": "/data/repo_utils/test_files/variants/bnd.comp.vcf.gz", + "output": "test_results/bench_bnd/", + "includebed": null, + "extend": 0, + "debug": false, + "reference": null, + "refdist": 500, + "pctseq": 0.0, + "pctsize": 0.7, + "pctovl": 0.0, + "typeignore": false, + "no_roll": true, + "chunksize": 1000, + "bSample": "HG008-T", + "cSample": "HG008_T_hiphase.haplotagged", + "dup_to_ins": false, + "bnddist": 100, + "sizemin": 50, + "sizefilt": 30, + "sizemax": 50000, + "passonly": false, + "no_ref": false, + "pick": "single", + "check_monref": true, + "check_multi": true +} +2025-01-05 07:08:37,460 [WARNING] Excluding 193 contigs present in comparison calls header but not baseline calls. +2025-01-05 07:08:37,552 [INFO] Zipped 418 variants Counter({'comp': 243, 'base': 175}) +2025-01-05 07:08:37,553 [INFO] 185 chunks of 418 variants Counter({'comp': 190, '__filtered': 115, 'base': 113}) +2025-01-05 07:08:37,638 [INFO] Stats: { + "TP-base": 85, + "TP-comp": 85, + "FP": 81, + "FN": 28, + "precision": 0.5120481927710844, + "recall": 0.7522123893805309, + "f1": 0.6093189964157706, + "base cnt": 113, + "comp cnt": 166, + "TP-comp_TP-gt": 68, + "TP-comp_FP-gt": 17, + "TP-base_TP-gt": 68, + "TP-base_FP-gt": 17, + "gt_concordance": 0.8, + "gt_matrix": { + "(0, 1)": { + "(0, 1)": 65, + "(0, 0)": 16, + "(1, 1)": 1 + }, + "(1, 0)": { + "(0, 1)": 3 + } + } +} +2025-01-05 07:08:37,638 [INFO] Finished bench diff --git a/repo_utils/answer_key/bench/bench_bnd/params.json b/repo_utils/answer_key/bench/bench_bnd/params.json new file mode 100644 index 00000000..cdafda38 --- /dev/null +++ b/repo_utils/answer_key/bench/bench_bnd/params.json @@ -0,0 +1 @@ +{"base": "/data/repo_utils/test_files/variants/bnd.base.vcf.gz", "comp": "/data/repo_utils/test_files/variants/bnd.comp.vcf.gz", "output": "test_results/bench_bnd/", "includebed": null, "extend": 0, "debug": false, "reference": null, "refdist": 500, "pctseq": 0.0, "pctsize": 0.7, "pctovl": 0.0, "typeignore": false, "no_roll": true, "chunksize": 1000, "bSample": "HG008-T", "cSample": "HG008_T_hiphase.haplotagged", "dup_to_ins": false, "bnddist": 100, "sizemin": 50, "sizefilt": 30, "sizemax": 50000, "passonly": false, "no_ref": false, "pick": "single", "check_monref": true, "check_multi": true} \ No newline at end of file diff --git a/repo_utils/answer_key/bench/bench_bnd/summary.json b/repo_utils/answer_key/bench/bench_bnd/summary.json new file mode 100644 index 00000000..f23027ac --- /dev/null +++ b/repo_utils/answer_key/bench/bench_bnd/summary.json @@ -0,0 +1,26 @@ +{ + "TP-base": 85, + "TP-comp": 85, + "FP": 81, + "FN": 28, + "precision": 0.5120481927710844, + "recall": 0.7522123893805309, + "f1": 0.6093189964157706, + "base cnt": 113, + "comp cnt": 166, + "TP-comp_TP-gt": 68, + "TP-comp_FP-gt": 17, + "TP-base_TP-gt": 68, + "TP-base_FP-gt": 17, + "gt_concordance": 0.8, + "gt_matrix": { + "(0, 1)": { + "(0, 1)": 65, + "(0, 0)": 16, + "(1, 1)": 1 + }, + "(1, 0)": { + "(0, 1)": 3 + } + } +} \ No newline at end of file diff --git a/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz b/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz new file mode 100644 index 00000000..f9bba69e Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz differ diff --git a/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz.tbi new file mode 100644 index 00000000..a6ae62ba Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz.tbi differ diff --git a/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz b/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz new file mode 100644 index 00000000..a7c2e365 Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz differ diff --git a/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz.tbi new file mode 100644 index 00000000..b77233a1 Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz.tbi differ diff --git a/repo_utils/sub_tests/bench.sh b/repo_utils/sub_tests/bench.sh index 2605ccae..97319f83 100644 --- a/repo_utils/sub_tests/bench.sh +++ b/repo_utils/sub_tests/bench.sh @@ -86,3 +86,10 @@ run test_bench_starallele $truv bench -b $INDIR/variants/star.base.vcf.gz \ if [ $test_bench_starallele ]; then bench_assert _starallele fi + +run test_bench_bnd $truv bench -b $INDIR/variants/bnd.base.vcf.gz \ + -c $INDIR/variants/bnd.comp.vcf.gz \ + -p 0 -o $OD/bench_bnd/ +if [ $test_bench_bnd ]; then + bench_assert _bnd +fi diff --git a/repo_utils/test_files/variants/bnd.base.vcf.gz b/repo_utils/test_files/variants/bnd.base.vcf.gz new file mode 100644 index 00000000..c76be640 Binary files /dev/null and b/repo_utils/test_files/variants/bnd.base.vcf.gz differ diff --git a/repo_utils/test_files/variants/bnd.base.vcf.gz.tbi b/repo_utils/test_files/variants/bnd.base.vcf.gz.tbi new file mode 100644 index 00000000..316860d0 Binary files /dev/null and b/repo_utils/test_files/variants/bnd.base.vcf.gz.tbi differ diff --git a/repo_utils/test_files/variants/bnd.comp.vcf.gz b/repo_utils/test_files/variants/bnd.comp.vcf.gz new file mode 100644 index 00000000..9538ceee Binary files /dev/null and b/repo_utils/test_files/variants/bnd.comp.vcf.gz differ diff --git a/repo_utils/test_files/variants/bnd.comp.vcf.gz.tbi b/repo_utils/test_files/variants/bnd.comp.vcf.gz.tbi new file mode 100644 index 00000000..51a529b8 Binary files /dev/null and b/repo_utils/test_files/variants/bnd.comp.vcf.gz.tbi differ diff --git a/truvari/bench.py b/truvari/bench.py index aba26022..065a7cf7 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -59,6 +59,8 @@ def parse_args(args): help="Number of matches reported per-call (%(default)s)") thresg.add_argument("--dup-to-ins", action="store_true", help="Assume DUP svtypes are INS (%(default)s)") + thresg.add_argument("-B", "--bnddist", type=int, default=defaults.bnddist, + help="Maximum distance allowed between BNDs (%(default)s; -1=off)") thresg.add_argument("-C", "--chunksize", type=truvari.restricted_int, default=defaults.chunksize, help="Max reference distance to compare calls (%(default)s)") @@ -497,7 +499,7 @@ def run(self): if (self.extend and (match.comp is not None) and not match.state - and not truvari.entry_within_tree(match.comp, region_tree)): + and not truvari.entry_within_tree(match.comp, region_tree)): match.comp = None output.write_match(match) @@ -516,9 +518,13 @@ def compare_chunk(self, chunk): result = self.compare_calls( chunk_dict["base"], chunk_dict["comp"], chunk_id) self.check_refine_candidate(result) + # Check BNDs separately + if self.matcher.params.bnddist != -1 and (chunk_dict['base_BND'] or chunk_dict['comp_BND']): + result.extend(self.compare_calls(chunk_dict['base_BND'], + chunk_dict['comp_BND'], chunk_id, True)) return result - def compare_calls(self, base_variants, comp_variants, chunk_id=0): + def compare_calls(self, base_variants, comp_variants, chunk_id=0, is_bnds=False): """ Builds MatchResults, returns them as a numpy matrix if there's at least one base and one comp variant. Otherwise, returns a list of the variants placed in MatchResults @@ -558,19 +564,20 @@ def compare_calls(self, base_variants, comp_variants, chunk_id=0): cnt += 1 pos.extend(truvari.entry_boundaries(i)) chrom = i.chrom - logging.warning("Skipping region %s:%d-%d with %d variants", chrom, min(*pos), max(*pos), cnt) + logging.warning("Skipping region %s:%d-%d with %d variants", + chrom, min(*pos), max(*pos), cnt) return [] - match_matrix = self.build_matrix( - base_variants, comp_variants, chunk_id) + match_matrix = self.build_matrix(base_variants, comp_variants, chunk_id, is_bnds=is_bnds) if isinstance(match_matrix, list): return match_matrix return PICKERS[self.matcher.params.pick](match_matrix) - def build_matrix(self, base_variants, comp_variants, chunk_id=0, skip_gt=False): + def build_matrix(self, base_variants, comp_variants, chunk_id=0, skip_gt=False, is_bnds=False): """ Builds MatchResults, returns them as a numpy matrix """ + matcher = self.matcher.build_match if not is_bnds else self.matcher.bnd_build_match if not base_variants or not comp_variants: raise RuntimeError( "Expected at least one base and one comp variant") @@ -578,9 +585,8 @@ def build_matrix(self, base_variants, comp_variants, chunk_id=0, skip_gt=False): for bid, b in enumerate(base_variants): base_matches = [] for cid, c in enumerate(comp_variants): - mat = self.matcher.build_match( - b, c, [f"{chunk_id}.{bid}", f"{chunk_id}.{cid}"], - skip_gt, self.short_circuit) + mat = matcher(b, c, [f"{chunk_id}.{bid}", f"{chunk_id}.{cid}"], + skip_gt, self.short_circuit) logging.debug("Made mat -> %s", mat) base_matches.append(mat) match_matrix.append(base_matches) @@ -603,14 +609,17 @@ def check_refine_candidate(self, result): chrom = match.comp.chrom pos.extend(truvari.entry_boundaries(match.comp)) if has_unmatched and pos: - buf = 10 # min(10, self.matcher.params.chunksize) need to make sure the refine covers the region + # min(10, self.matcher.params.chunksize) need to make sure the refine covers the region + buf = 10 start = max(0, min(*pos) - buf) - self.refine_candidates.append(f"{chrom}\t{start}\t{max(*pos) + buf}") - + self.refine_candidates.append( + f"{chrom}\t{start}\t{max(*pos) + buf}") ################# # Match Pickers # ################# + + def pick_multi_matches(match_matrix): """ Given a numpy array of MatchResults diff --git a/truvari/matching.py b/truvari/matching.py index e92d2988..070b6be5 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -1,6 +1,7 @@ """ Comparison engine """ +import re import types import logging from collections import Counter, defaultdict @@ -108,6 +109,7 @@ def make_match_params(): params.bSample = 0 params.cSample = 0 params.dup_to_ins = False + params.bnddist = 100 params.sizemin = 50 params.sizefilt = 30 params.sizemax = 50000 @@ -136,6 +138,7 @@ def make_match_params_from_args(args): ret.bSample = args.bSample if args.bSample else 0 ret.cSample = args.cSample if args.cSample else 0 ret.dup_to_ins = args.dup_to_ins if "dup_to_ins" in args else False + ret.bnddist = args.bnddist if 'bnddist' in args else -1 # filtering properties ret.sizemin = args.sizemin ret.sizefilt = args.sizefilt @@ -149,7 +152,7 @@ def make_match_params_from_args(args): def filter_call(self, entry, base=False): """ - Returns True if the call should be filtered + Returns True if the call should be filtered based on parameters or truvari requirements Base has different filtering requirements, so let the method know """ if self.params.check_monref and (not entry.alts or entry.alts[0] in (None, '*')): # ignore monomorphic reference @@ -159,19 +162,9 @@ def filter_call(self, entry, base=False): raise ValueError( f"Cannot compare multi-allelic records. Please split\nline {str(entry)}") - # Don't compare BNDs, ever - #if entry.alleles_variant_types[1] == 'BND': - # return True - if self.params.passonly and truvari.entry_is_filtered(entry): return True - size = truvari.entry_size(entry) - if (size > self.params.sizemax) \ - or (base and size < self.params.sizemin) \ - or (not base and size < self.params.sizefilt): - return True - prefix = 'b' if base else 'c' if self.params.no_ref in ["a", prefix] or self.params.pick == 'ac': samp = self.params.bSample if base else self.params.cSample @@ -180,6 +173,27 @@ def filter_call(self, entry, base=False): return False + def size_filter(self, entry, base=False): + """ + Returns True if entry should be filtered due to its size + """ + size = truvari.entry_size(entry) + return (size > self.params.sizemax) \ + or (base and size < self.params.sizemin) \ + or (not base and size < self.params.sizefilt) + + def compare_gts(self, match, base, comp): + """ + Given a MatchResult, populate the genotype specific comparisons in place + """ + if "GT" in base.samples[self.params.bSample]: + match.base_gt = base.samples[self.params.bSample]["GT"] + match.base_gt_count = sum(1 for _ in match.base_gt if _ == 1) + if "GT" in comp.samples[self.params.cSample]: + match.comp_gt = comp.samples[self.params.cSample]["GT"] + match.comp_gt_count = sum(1 for _ in match.comp_gt if _ == 1) + match.gt_match = abs(match.base_gt_count - match.comp_gt_count) + def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False): """ Build a MatchResult @@ -218,13 +232,7 @@ def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False return ret if not skip_gt: - if "GT" in base.samples[self.params.bSample]: - ret.base_gt = base.samples[self.params.bSample]["GT"] - ret.base_gt_count = sum(1 for _ in ret.base_gt if _ == 1) - if "GT" in comp.samples[self.params.cSample]: - ret.comp_gt = comp.samples[self.params.cSample]["GT"] - ret.comp_gt_count = sum(1 for _ in ret.comp_gt if _ == 1) - ret.gt_match = abs(ret.base_gt_count - ret.comp_gt_count) + self.compare_gts(ret, base, comp) ret.ovlpct = truvari.entry_reciprocal_overlap(base, comp) if ret.ovlpct < self.params.pctovl: @@ -235,7 +243,8 @@ def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False return ret if self.params.pctseq > 0: - ret.seqsim = truvari.entry_seq_similarity(base, comp, self.params.no_roll) + ret.seqsim = truvari.entry_seq_similarity( + base, comp, self.params.no_roll) if ret.seqsim < self.params.pctseq: logging.debug("%s and %s sequence similarity is too low (%.3ff)", str(base), str(comp), ret.seqsim) @@ -250,6 +259,37 @@ def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False return ret + def bnd_build_match(self, base, comp, matid=None, *_args, **_kwargs): + """ + Build a MatchResult for bnds + """ + ret = truvari.MatchResult() + ret.base = base + ret.comp = comp + + ret.matid = matid + ret.state = base.chrom == comp.chrom + + ret.st_dist = base.pos - comp.pos + ret.state &= abs(ret.st_dist) < self.params.bnddist + b_bnd = bnd_direction_strand(base.alts[0]) + c_bnd = bnd_direction_strand(comp.alts[0]) + ret.state &= b_bnd == c_bnd + + b_pos2 = bnd_position(base.alts[0]) + c_pos2 = bnd_position(comp.alts[0]) + ret.ed_dist = b_pos2[1] - c_pos2[1] + ret.state &= b_pos2[0] == c_pos2[0] + ret.state &= ret.ed_dist < self.params.bnddist + + self.compare_gts(ret, base, comp) + + # Score is percent of allowed distance needed to find this match + ret.score = (1 - ((abs(ret.st_dist) + abs(ret.ed_dist)) / + 2) / self.params.bnddist) * 100 + # I think I'm missing GT stuff here + return ret + ############################ # Parsing and set building # ############################ @@ -313,8 +353,15 @@ def chunker(matcher, *files): call_counts['__filtered'] += 1 continue + is_bnd = entry.alleles_variant_types[1] == 'BND' + + if not is_bnd and matcher.size_filter(entry, key == 'base'): + cur_chunk['__filtered'].append(entry) + call_counts['__filtered'] += 1 + continue + # check symbolic, resolve if needed/possible - if matcher.params.pctseq != 0 and (entry.alleles_variant_types[-1] == 'BND' or entry.alts[0].startswith('<')): + if not is_bnd and matcher.params.pctseq != 0 and entry.alts[0].startswith('<'): was_resolved = resolve_sv(entry, matcher.reference, matcher.params.dup_to_ins) @@ -338,7 +385,11 @@ def chunker(matcher, *files): cur_chrom = entry.chrom cur_end = max(entry.stop, cur_end) - cur_chunk[key].append(entry) + + if is_bnd: + cur_chunk[f'{key}_BND'].append(entry) + else: + cur_chunk[key].append(entry) call_counts[key] += 1 chunk_count += 1 @@ -346,9 +397,14 @@ def chunker(matcher, *files): sum(call_counts.values()), call_counts) yield cur_chunk, chunk_count +#################### +# Helper functions # +#################### + RC = str.maketrans("ATCG", "TAGC") + def resolve_sv(entry, ref, dup_to_ins=False): """ Attempts to resolve an SV's REF/ALT sequences @@ -356,9 +412,12 @@ def resolve_sv(entry, ref, dup_to_ins=False): if ref is None or entry.alts[0] in ['', ''] or entry.start > ref.get_reference_length(entry.chrom): return False - if entry.alleles_variant_types[1] == 'BND' and "SVTYPE" in entry.info \ - and truvari.entry_variant_type(entry) == truvari.SV.DEL: - entry.alts = [''] + # BNDs which describe a deletion can be resolved + if entry.alleles_variant_types[1] == 'BND': + if "SVTYPE" in entry.info and entry.info['SVTYPE'] == 'DEL': + entry.alts = [''] + else: + return False seq = ref.fetch(entry.chrom, entry.start, entry.stop) if entry.alts[0] == '': @@ -375,3 +434,60 @@ def resolve_sv(entry, ref, dup_to_ins=False): return False return True + + +def bnd_direction_strand(bnd: str) -> tuple: + """ + Parses a BND ALT string to determine its direction and strand. + ALT Meaning + t[p[ piece extending to the right of p is joined after t + t]p] reverse comp piece extending left of p is joined after t + ]p]t piece extending to the left of p is joined before t + [p[t reverse comp piece extending right of p is joined before t + + Note that direction of 'left' means the piece is anchored on the left of the breakpoint + + Args: + bnd (str): The BND ALT string. + + Returns: + tuple: A tuple containing the direction ("left" or "right") and strand ("direct" or "complement"). + """ + if bnd.startswith('[') or bnd.endswith('['): + direction = "left" + elif bnd.startswith(']') or bnd.endswith(']'): + direction = "right" + else: + raise ValueError(f"Invalid BND ALT format: {bnd}") + + # Determine strand based on the position of the base letter + if bnd[0] not in '[]': # Base letter is at the start (before brackets) + strand = "direct" + elif bnd[-1] not in '[]': # Base letter is at the end (after brackets) + strand = "complement" + else: + raise ValueError(f"Invalid BND ALT format: {bnd}") + + return direction, strand + + +def bnd_position(bnd): + """ + Extracts the chromosome and position from a BND ALT string. + + Args: + bnd (str): The BND ALT string. + + Returns: + tuple: A tuple containing the chromosome (str) and position (int). + """ + + # Regular expression to match the BND format and extract chrom:pos + match = re.search(r'[\[\]]([^\[\]:]+):(\d+)[\[\]]', bnd) + if not match: + raise ValueError(f"Invalid BND ALT format: {bnd}") + + chrom = match.group(1) # Extract the chromosome + pos = int(match.group(2)) # Extract the position as an integer + + return chrom, pos