diff --git a/imgs/coverage.svg b/imgs/coverage.svg
index a9915353..ea71b13a 100644
--- a/imgs/coverage.svg
+++ b/imgs/coverage.svg
@@ -17,7 +17,7 @@
coverage
- 89%
- 89%
+ 90%
+ 90%
diff --git a/repo_utils/answer_key/bench/bench_bnd/candidate.refine.bed b/repo_utils/answer_key/bench/bench_bnd/candidate.refine.bed
new file mode 100644
index 00000000..c5259f82
--- /dev/null
+++ b/repo_utils/answer_key/bench/bench_bnd/candidate.refine.bed
@@ -0,0 +1,64 @@
+chr1 181135 181156
+chr1 6005049 6006670
+chr1 224013761 224013782
+chr10 1238267 1238288
+chr10 39478703 39478724
+chr10 132505641 132506422
+chr11 964817 964838
+chr11 44662925 44662946
+chr11 48880455 48880476
+chr11 48888289 48888310
+chr11 68232223 68232244
+chr11 80583401 80583422
+chr11 105902622 105902643
+chr12 2572354 2572375
+chr12 10428997 10444441
+chr12 26566549 26566649
+chr12 106703356 106718437
+chr12 132242311 132242332
+chr13 56380894 56381040
+chr13 114073358 114074122
+chr14 102919538 102919632
+chr15 74894096 74894117
+chr16 1025231 1025252
+chr16 49626994 49627100
+chr16 89536459 89536480
+chr17 31964205 32004138
+chr17 77850848 77850938
+chr17 83120464 83120485
+chr17 83127668 83127689
+chr18 70465189 70465210
+chr18 78390326 78390347
+chr19 395626 395835
+chr19 3477781 3477868
+chr19 29439299 29439424
+chr19 38981201 38983536
+chr19 55068010 55068465
+chr2 9737589 9737610
+chr2 138335007 138335083
+chr2 157917136 157917157
+chr2 238119351 238119427
+chr20 16410045 16410066
+chr20 38428231 38428252
+chr20 62755268 62755289
+chr20 63443033 63443054
+chr22 31903282 31903303
+chr22 45328219 45328240
+chr22 47732364 47732476
+chr22 49494347 49494368
+chr3 112164423 112164444
+chr3 184714589 184714610
+chr4 1429347 1434609
+chr4 8631479 8637158
+chr5 71009525 71015235
+chr5 176031036 176031057
+chr7 23502979 23503000
+chr7 68922678 68950323
+chr7 75988853 75988924
+chr7 158151440 158151461
+chr8 48193098 48193205
+chr8 77023596 77040442
+chr8 143669349 143669370
+chr9 104414084 104414105
+chr9 136535716 136535737
+chrX 125901244 125901387
\ No newline at end of file
diff --git a/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz b/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz
new file mode 100644
index 00000000..1dcb017f
Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz differ
diff --git a/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz.tbi
new file mode 100644
index 00000000..819c3953
Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz.tbi differ
diff --git a/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz b/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz
new file mode 100644
index 00000000..9b8a4e53
Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz differ
diff --git a/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz.tbi
new file mode 100644
index 00000000..2179aaeb
Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz.tbi differ
diff --git a/repo_utils/answer_key/bench/bench_bnd/log.txt b/repo_utils/answer_key/bench/bench_bnd/log.txt
new file mode 100644
index 00000000..258b41b8
--- /dev/null
+++ b/repo_utils/answer_key/bench/bench_bnd/log.txt
@@ -0,0 +1,61 @@
+2025-01-05 07:08:37,421 [INFO] Truvari v5.0.0
+2025-01-05 07:08:37,422 [INFO] Command /data/truvari/__main__.py bench -b repo_utils/test_files/variants/bnd.base.vcf.gz -c repo_utils/test_files/variants/bnd.comp.vcf.gz -p 0 -o test_results/bench_bnd/
+2025-01-05 07:08:37,423 [INFO] Params:
+{
+ "base": "/data/repo_utils/test_files/variants/bnd.base.vcf.gz",
+ "comp": "/data/repo_utils/test_files/variants/bnd.comp.vcf.gz",
+ "output": "test_results/bench_bnd/",
+ "includebed": null,
+ "extend": 0,
+ "debug": false,
+ "reference": null,
+ "refdist": 500,
+ "pctseq": 0.0,
+ "pctsize": 0.7,
+ "pctovl": 0.0,
+ "typeignore": false,
+ "no_roll": true,
+ "chunksize": 1000,
+ "bSample": "HG008-T",
+ "cSample": "HG008_T_hiphase.haplotagged",
+ "dup_to_ins": false,
+ "bnddist": 100,
+ "sizemin": 50,
+ "sizefilt": 30,
+ "sizemax": 50000,
+ "passonly": false,
+ "no_ref": false,
+ "pick": "single",
+ "check_monref": true,
+ "check_multi": true
+}
+2025-01-05 07:08:37,460 [WARNING] Excluding 193 contigs present in comparison calls header but not baseline calls.
+2025-01-05 07:08:37,552 [INFO] Zipped 418 variants Counter({'comp': 243, 'base': 175})
+2025-01-05 07:08:37,553 [INFO] 185 chunks of 418 variants Counter({'comp': 190, '__filtered': 115, 'base': 113})
+2025-01-05 07:08:37,638 [INFO] Stats: {
+ "TP-base": 85,
+ "TP-comp": 85,
+ "FP": 81,
+ "FN": 28,
+ "precision": 0.5120481927710844,
+ "recall": 0.7522123893805309,
+ "f1": 0.6093189964157706,
+ "base cnt": 113,
+ "comp cnt": 166,
+ "TP-comp_TP-gt": 68,
+ "TP-comp_FP-gt": 17,
+ "TP-base_TP-gt": 68,
+ "TP-base_FP-gt": 17,
+ "gt_concordance": 0.8,
+ "gt_matrix": {
+ "(0, 1)": {
+ "(0, 1)": 65,
+ "(0, 0)": 16,
+ "(1, 1)": 1
+ },
+ "(1, 0)": {
+ "(0, 1)": 3
+ }
+ }
+}
+2025-01-05 07:08:37,638 [INFO] Finished bench
diff --git a/repo_utils/answer_key/bench/bench_bnd/params.json b/repo_utils/answer_key/bench/bench_bnd/params.json
new file mode 100644
index 00000000..cdafda38
--- /dev/null
+++ b/repo_utils/answer_key/bench/bench_bnd/params.json
@@ -0,0 +1 @@
+{"base": "/data/repo_utils/test_files/variants/bnd.base.vcf.gz", "comp": "/data/repo_utils/test_files/variants/bnd.comp.vcf.gz", "output": "test_results/bench_bnd/", "includebed": null, "extend": 0, "debug": false, "reference": null, "refdist": 500, "pctseq": 0.0, "pctsize": 0.7, "pctovl": 0.0, "typeignore": false, "no_roll": true, "chunksize": 1000, "bSample": "HG008-T", "cSample": "HG008_T_hiphase.haplotagged", "dup_to_ins": false, "bnddist": 100, "sizemin": 50, "sizefilt": 30, "sizemax": 50000, "passonly": false, "no_ref": false, "pick": "single", "check_monref": true, "check_multi": true}
\ No newline at end of file
diff --git a/repo_utils/answer_key/bench/bench_bnd/summary.json b/repo_utils/answer_key/bench/bench_bnd/summary.json
new file mode 100644
index 00000000..f23027ac
--- /dev/null
+++ b/repo_utils/answer_key/bench/bench_bnd/summary.json
@@ -0,0 +1,26 @@
+{
+ "TP-base": 85,
+ "TP-comp": 85,
+ "FP": 81,
+ "FN": 28,
+ "precision": 0.5120481927710844,
+ "recall": 0.7522123893805309,
+ "f1": 0.6093189964157706,
+ "base cnt": 113,
+ "comp cnt": 166,
+ "TP-comp_TP-gt": 68,
+ "TP-comp_FP-gt": 17,
+ "TP-base_TP-gt": 68,
+ "TP-base_FP-gt": 17,
+ "gt_concordance": 0.8,
+ "gt_matrix": {
+ "(0, 1)": {
+ "(0, 1)": 65,
+ "(0, 0)": 16,
+ "(1, 1)": 1
+ },
+ "(1, 0)": {
+ "(0, 1)": 3
+ }
+ }
+}
\ No newline at end of file
diff --git a/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz b/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz
new file mode 100644
index 00000000..f9bba69e
Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz differ
diff --git a/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz.tbi
new file mode 100644
index 00000000..a6ae62ba
Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz.tbi differ
diff --git a/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz b/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz
new file mode 100644
index 00000000..a7c2e365
Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz differ
diff --git a/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz.tbi b/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz.tbi
new file mode 100644
index 00000000..b77233a1
Binary files /dev/null and b/repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz.tbi differ
diff --git a/repo_utils/sub_tests/bench.sh b/repo_utils/sub_tests/bench.sh
index 2605ccae..97319f83 100644
--- a/repo_utils/sub_tests/bench.sh
+++ b/repo_utils/sub_tests/bench.sh
@@ -86,3 +86,10 @@ run test_bench_starallele $truv bench -b $INDIR/variants/star.base.vcf.gz \
if [ $test_bench_starallele ]; then
bench_assert _starallele
fi
+
+run test_bench_bnd $truv bench -b $INDIR/variants/bnd.base.vcf.gz \
+ -c $INDIR/variants/bnd.comp.vcf.gz \
+ -p 0 -o $OD/bench_bnd/
+if [ $test_bench_bnd ]; then
+ bench_assert _bnd
+fi
diff --git a/repo_utils/test_files/variants/bnd.base.vcf.gz b/repo_utils/test_files/variants/bnd.base.vcf.gz
new file mode 100644
index 00000000..c76be640
Binary files /dev/null and b/repo_utils/test_files/variants/bnd.base.vcf.gz differ
diff --git a/repo_utils/test_files/variants/bnd.base.vcf.gz.tbi b/repo_utils/test_files/variants/bnd.base.vcf.gz.tbi
new file mode 100644
index 00000000..316860d0
Binary files /dev/null and b/repo_utils/test_files/variants/bnd.base.vcf.gz.tbi differ
diff --git a/repo_utils/test_files/variants/bnd.comp.vcf.gz b/repo_utils/test_files/variants/bnd.comp.vcf.gz
new file mode 100644
index 00000000..9538ceee
Binary files /dev/null and b/repo_utils/test_files/variants/bnd.comp.vcf.gz differ
diff --git a/repo_utils/test_files/variants/bnd.comp.vcf.gz.tbi b/repo_utils/test_files/variants/bnd.comp.vcf.gz.tbi
new file mode 100644
index 00000000..51a529b8
Binary files /dev/null and b/repo_utils/test_files/variants/bnd.comp.vcf.gz.tbi differ
diff --git a/truvari/bench.py b/truvari/bench.py
index aba26022..065a7cf7 100644
--- a/truvari/bench.py
+++ b/truvari/bench.py
@@ -59,6 +59,8 @@ def parse_args(args):
help="Number of matches reported per-call (%(default)s)")
thresg.add_argument("--dup-to-ins", action="store_true",
help="Assume DUP svtypes are INS (%(default)s)")
+ thresg.add_argument("-B", "--bnddist", type=int, default=defaults.bnddist,
+ help="Maximum distance allowed between BNDs (%(default)s; -1=off)")
thresg.add_argument("-C", "--chunksize", type=truvari.restricted_int, default=defaults.chunksize,
help="Max reference distance to compare calls (%(default)s)")
@@ -497,7 +499,7 @@ def run(self):
if (self.extend
and (match.comp is not None)
and not match.state
- and not truvari.entry_within_tree(match.comp, region_tree)):
+ and not truvari.entry_within_tree(match.comp, region_tree)):
match.comp = None
output.write_match(match)
@@ -516,9 +518,13 @@ def compare_chunk(self, chunk):
result = self.compare_calls(
chunk_dict["base"], chunk_dict["comp"], chunk_id)
self.check_refine_candidate(result)
+ # Check BNDs separately
+ if self.matcher.params.bnddist != -1 and (chunk_dict['base_BND'] or chunk_dict['comp_BND']):
+ result.extend(self.compare_calls(chunk_dict['base_BND'],
+ chunk_dict['comp_BND'], chunk_id, True))
return result
- def compare_calls(self, base_variants, comp_variants, chunk_id=0):
+ def compare_calls(self, base_variants, comp_variants, chunk_id=0, is_bnds=False):
"""
Builds MatchResults, returns them as a numpy matrix if there's at least one base and one comp variant.
Otherwise, returns a list of the variants placed in MatchResults
@@ -558,19 +564,20 @@ def compare_calls(self, base_variants, comp_variants, chunk_id=0):
cnt += 1
pos.extend(truvari.entry_boundaries(i))
chrom = i.chrom
- logging.warning("Skipping region %s:%d-%d with %d variants", chrom, min(*pos), max(*pos), cnt)
+ logging.warning("Skipping region %s:%d-%d with %d variants",
+ chrom, min(*pos), max(*pos), cnt)
return []
- match_matrix = self.build_matrix(
- base_variants, comp_variants, chunk_id)
+ match_matrix = self.build_matrix(base_variants, comp_variants, chunk_id, is_bnds=is_bnds)
if isinstance(match_matrix, list):
return match_matrix
return PICKERS[self.matcher.params.pick](match_matrix)
- def build_matrix(self, base_variants, comp_variants, chunk_id=0, skip_gt=False):
+ def build_matrix(self, base_variants, comp_variants, chunk_id=0, skip_gt=False, is_bnds=False):
"""
Builds MatchResults, returns them as a numpy matrix
"""
+ matcher = self.matcher.build_match if not is_bnds else self.matcher.bnd_build_match
if not base_variants or not comp_variants:
raise RuntimeError(
"Expected at least one base and one comp variant")
@@ -578,9 +585,8 @@ def build_matrix(self, base_variants, comp_variants, chunk_id=0, skip_gt=False):
for bid, b in enumerate(base_variants):
base_matches = []
for cid, c in enumerate(comp_variants):
- mat = self.matcher.build_match(
- b, c, [f"{chunk_id}.{bid}", f"{chunk_id}.{cid}"],
- skip_gt, self.short_circuit)
+ mat = matcher(b, c, [f"{chunk_id}.{bid}", f"{chunk_id}.{cid}"],
+ skip_gt, self.short_circuit)
logging.debug("Made mat -> %s", mat)
base_matches.append(mat)
match_matrix.append(base_matches)
@@ -603,14 +609,17 @@ def check_refine_candidate(self, result):
chrom = match.comp.chrom
pos.extend(truvari.entry_boundaries(match.comp))
if has_unmatched and pos:
- buf = 10 # min(10, self.matcher.params.chunksize) need to make sure the refine covers the region
+ # min(10, self.matcher.params.chunksize) need to make sure the refine covers the region
+ buf = 10
start = max(0, min(*pos) - buf)
- self.refine_candidates.append(f"{chrom}\t{start}\t{max(*pos) + buf}")
-
+ self.refine_candidates.append(
+ f"{chrom}\t{start}\t{max(*pos) + buf}")
#################
# Match Pickers #
#################
+
+
def pick_multi_matches(match_matrix):
"""
Given a numpy array of MatchResults
diff --git a/truvari/matching.py b/truvari/matching.py
index e92d2988..070b6be5 100644
--- a/truvari/matching.py
+++ b/truvari/matching.py
@@ -1,6 +1,7 @@
"""
Comparison engine
"""
+import re
import types
import logging
from collections import Counter, defaultdict
@@ -108,6 +109,7 @@ def make_match_params():
params.bSample = 0
params.cSample = 0
params.dup_to_ins = False
+ params.bnddist = 100
params.sizemin = 50
params.sizefilt = 30
params.sizemax = 50000
@@ -136,6 +138,7 @@ def make_match_params_from_args(args):
ret.bSample = args.bSample if args.bSample else 0
ret.cSample = args.cSample if args.cSample else 0
ret.dup_to_ins = args.dup_to_ins if "dup_to_ins" in args else False
+ ret.bnddist = args.bnddist if 'bnddist' in args else -1
# filtering properties
ret.sizemin = args.sizemin
ret.sizefilt = args.sizefilt
@@ -149,7 +152,7 @@ def make_match_params_from_args(args):
def filter_call(self, entry, base=False):
"""
- Returns True if the call should be filtered
+ Returns True if the call should be filtered based on parameters or truvari requirements
Base has different filtering requirements, so let the method know
"""
if self.params.check_monref and (not entry.alts or entry.alts[0] in (None, '*')): # ignore monomorphic reference
@@ -159,19 +162,9 @@ def filter_call(self, entry, base=False):
raise ValueError(
f"Cannot compare multi-allelic records. Please split\nline {str(entry)}")
- # Don't compare BNDs, ever
- #if entry.alleles_variant_types[1] == 'BND':
- # return True
-
if self.params.passonly and truvari.entry_is_filtered(entry):
return True
- size = truvari.entry_size(entry)
- if (size > self.params.sizemax) \
- or (base and size < self.params.sizemin) \
- or (not base and size < self.params.sizefilt):
- return True
-
prefix = 'b' if base else 'c'
if self.params.no_ref in ["a", prefix] or self.params.pick == 'ac':
samp = self.params.bSample if base else self.params.cSample
@@ -180,6 +173,27 @@ def filter_call(self, entry, base=False):
return False
+ def size_filter(self, entry, base=False):
+ """
+ Returns True if entry should be filtered due to its size
+ """
+ size = truvari.entry_size(entry)
+ return (size > self.params.sizemax) \
+ or (base and size < self.params.sizemin) \
+ or (not base and size < self.params.sizefilt)
+
+ def compare_gts(self, match, base, comp):
+ """
+ Given a MatchResult, populate the genotype specific comparisons in place
+ """
+ if "GT" in base.samples[self.params.bSample]:
+ match.base_gt = base.samples[self.params.bSample]["GT"]
+ match.base_gt_count = sum(1 for _ in match.base_gt if _ == 1)
+ if "GT" in comp.samples[self.params.cSample]:
+ match.comp_gt = comp.samples[self.params.cSample]["GT"]
+ match.comp_gt_count = sum(1 for _ in match.comp_gt if _ == 1)
+ match.gt_match = abs(match.base_gt_count - match.comp_gt_count)
+
def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False):
"""
Build a MatchResult
@@ -218,13 +232,7 @@ def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False
return ret
if not skip_gt:
- if "GT" in base.samples[self.params.bSample]:
- ret.base_gt = base.samples[self.params.bSample]["GT"]
- ret.base_gt_count = sum(1 for _ in ret.base_gt if _ == 1)
- if "GT" in comp.samples[self.params.cSample]:
- ret.comp_gt = comp.samples[self.params.cSample]["GT"]
- ret.comp_gt_count = sum(1 for _ in ret.comp_gt if _ == 1)
- ret.gt_match = abs(ret.base_gt_count - ret.comp_gt_count)
+ self.compare_gts(ret, base, comp)
ret.ovlpct = truvari.entry_reciprocal_overlap(base, comp)
if ret.ovlpct < self.params.pctovl:
@@ -235,7 +243,8 @@ def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False
return ret
if self.params.pctseq > 0:
- ret.seqsim = truvari.entry_seq_similarity(base, comp, self.params.no_roll)
+ ret.seqsim = truvari.entry_seq_similarity(
+ base, comp, self.params.no_roll)
if ret.seqsim < self.params.pctseq:
logging.debug("%s and %s sequence similarity is too low (%.3ff)",
str(base), str(comp), ret.seqsim)
@@ -250,6 +259,37 @@ def build_match(self, base, comp, matid=None, skip_gt=False, short_circuit=False
return ret
+ def bnd_build_match(self, base, comp, matid=None, *_args, **_kwargs):
+ """
+ Build a MatchResult for bnds
+ """
+ ret = truvari.MatchResult()
+ ret.base = base
+ ret.comp = comp
+
+ ret.matid = matid
+ ret.state = base.chrom == comp.chrom
+
+ ret.st_dist = base.pos - comp.pos
+ ret.state &= abs(ret.st_dist) < self.params.bnddist
+ b_bnd = bnd_direction_strand(base.alts[0])
+ c_bnd = bnd_direction_strand(comp.alts[0])
+ ret.state &= b_bnd == c_bnd
+
+ b_pos2 = bnd_position(base.alts[0])
+ c_pos2 = bnd_position(comp.alts[0])
+ ret.ed_dist = b_pos2[1] - c_pos2[1]
+ ret.state &= b_pos2[0] == c_pos2[0]
+ ret.state &= ret.ed_dist < self.params.bnddist
+
+ self.compare_gts(ret, base, comp)
+
+ # Score is percent of allowed distance needed to find this match
+ ret.score = (1 - ((abs(ret.st_dist) + abs(ret.ed_dist)) /
+ 2) / self.params.bnddist) * 100
+ # I think I'm missing GT stuff here
+ return ret
+
############################
# Parsing and set building #
############################
@@ -313,8 +353,15 @@ def chunker(matcher, *files):
call_counts['__filtered'] += 1
continue
+ is_bnd = entry.alleles_variant_types[1] == 'BND'
+
+ if not is_bnd and matcher.size_filter(entry, key == 'base'):
+ cur_chunk['__filtered'].append(entry)
+ call_counts['__filtered'] += 1
+ continue
+
# check symbolic, resolve if needed/possible
- if matcher.params.pctseq != 0 and (entry.alleles_variant_types[-1] == 'BND' or entry.alts[0].startswith('<')):
+ if not is_bnd and matcher.params.pctseq != 0 and entry.alts[0].startswith('<'):
was_resolved = resolve_sv(entry,
matcher.reference,
matcher.params.dup_to_ins)
@@ -338,7 +385,11 @@ def chunker(matcher, *files):
cur_chrom = entry.chrom
cur_end = max(entry.stop, cur_end)
- cur_chunk[key].append(entry)
+
+ if is_bnd:
+ cur_chunk[f'{key}_BND'].append(entry)
+ else:
+ cur_chunk[key].append(entry)
call_counts[key] += 1
chunk_count += 1
@@ -346,9 +397,14 @@ def chunker(matcher, *files):
sum(call_counts.values()), call_counts)
yield cur_chunk, chunk_count
+####################
+# Helper functions #
+####################
+
RC = str.maketrans("ATCG", "TAGC")
+
def resolve_sv(entry, ref, dup_to_ins=False):
"""
Attempts to resolve an SV's REF/ALT sequences
@@ -356,9 +412,12 @@ def resolve_sv(entry, ref, dup_to_ins=False):
if ref is None or entry.alts[0] in ['', ''] or entry.start > ref.get_reference_length(entry.chrom):
return False
- if entry.alleles_variant_types[1] == 'BND' and "SVTYPE" in entry.info \
- and truvari.entry_variant_type(entry) == truvari.SV.DEL:
- entry.alts = ['']
+ # BNDs which describe a deletion can be resolved
+ if entry.alleles_variant_types[1] == 'BND':
+ if "SVTYPE" in entry.info and entry.info['SVTYPE'] == 'DEL':
+ entry.alts = ['']
+ else:
+ return False
seq = ref.fetch(entry.chrom, entry.start, entry.stop)
if entry.alts[0] == '':
@@ -375,3 +434,60 @@ def resolve_sv(entry, ref, dup_to_ins=False):
return False
return True
+
+
+def bnd_direction_strand(bnd: str) -> tuple:
+ """
+ Parses a BND ALT string to determine its direction and strand.
+ ALT Meaning
+ t[p[ piece extending to the right of p is joined after t
+ t]p] reverse comp piece extending left of p is joined after t
+ ]p]t piece extending to the left of p is joined before t
+ [p[t reverse comp piece extending right of p is joined before t
+
+ Note that direction of 'left' means the piece is anchored on the left of the breakpoint
+
+ Args:
+ bnd (str): The BND ALT string.
+
+ Returns:
+ tuple: A tuple containing the direction ("left" or "right") and strand ("direct" or "complement").
+ """
+ if bnd.startswith('[') or bnd.endswith('['):
+ direction = "left"
+ elif bnd.startswith(']') or bnd.endswith(']'):
+ direction = "right"
+ else:
+ raise ValueError(f"Invalid BND ALT format: {bnd}")
+
+ # Determine strand based on the position of the base letter
+ if bnd[0] not in '[]': # Base letter is at the start (before brackets)
+ strand = "direct"
+ elif bnd[-1] not in '[]': # Base letter is at the end (after brackets)
+ strand = "complement"
+ else:
+ raise ValueError(f"Invalid BND ALT format: {bnd}")
+
+ return direction, strand
+
+
+def bnd_position(bnd):
+ """
+ Extracts the chromosome and position from a BND ALT string.
+
+ Args:
+ bnd (str): The BND ALT string.
+
+ Returns:
+ tuple: A tuple containing the chromosome (str) and position (int).
+ """
+
+ # Regular expression to match the BND format and extract chrom:pos
+ match = re.search(r'[\[\]]([^\[\]:]+):(\d+)[\[\]]', bnd)
+ if not match:
+ raise ValueError(f"Invalid BND ALT format: {bnd}")
+
+ chrom = match.group(1) # Extract the chromosome
+ pos = int(match.group(2)) # Extract the position as an integer
+
+ return chrom, pos