Skip to content

Commit

Permalink
Symbolic<>Resolved Comparison. No sizemax.
Browse files Browse the repository at this point in the history
Symbolic SVs can now be compared to sequence resolved SVs without
specifying `--pctsize 0`. Sizemax now default to -1, meaning there's no
cap on SV lengths.
  • Loading branch information
ACEnglish committed Jan 9, 2025
1 parent 807c2d7 commit 751a207
Show file tree
Hide file tree
Showing 23 changed files with 118 additions and 104 deletions.
43 changes: 23 additions & 20 deletions repo_utils/answer_key/bench/bench_bnd/candidate.refine.bed
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
chr1 181135 181156
chr1 297945 297966
chr1 6005049 6006670
chr1 122627325 122627346
chr1 124860930 124860951
chr1 224013761 224013782
chr10 1238267 1238288
chr10 39478703 39478724
chr10 67470500 67470521
chr10 68211488 68211509
chr10 67470500 68211509
chr10 93279144 94722597
chr10 132505641 132506422
chr11 964817 964838
chr11 33648565 33648586
Expand All @@ -19,24 +21,24 @@ chr11 125661462 125661483
chr11 135076371 135076392
chr12 2572354 2572375
chr12 10428997 10444441
chr12 24759747 24759768
chr12 26566549 26566649
chr12 26706109 26706130
chr12 12760646 12864984
chr12 24759747 26706130
chr12 106703356 106718439
chr12 132242311 132242332
chr13 56380894 56381040
chr13 114073358 114074122
chr14 102919538 102919632
chr15 31496974 31496995
chr15 31637303 31637324
chr15 31496974 31637324
chr15 40450778 40450800
chr15 44117194 44117215
chr15 44248462 44248483
chr15 44117191 44248483
chr15 74894096 74894117
chr15 90787610 90889113
chr15 98348337 98809237
chr16 1025231 1025252
chr16 49626994 49627100
chr16 89536459 89536480
chr17 31964205 32004138
chr17 32305433 32305454
chr17 32670386 32670407
chr17 77850848 77850938
chr17 83120464 83120485
Expand All @@ -49,7 +51,7 @@ chr19 395626 395835
chr19 3477781 3477868
chr19 11196125 11218966
chr19 29439299 29439424
chr19 44991097 44991118
chr19 39706573 50554260
chr19 53185919 53185940
chr19 55068010 55068465
chr2 9737589 9737610
Expand All @@ -63,8 +65,7 @@ chr20 16410045 16410066
chr20 25499107 25499128
chr20 38428231 38428252
chr20 38430124 38430145
chr20 62257534 62257555
chr20 62378058 62378079
chr20 62257534 62378079
chr20 62755268 62755289
chr20 63443033 63443054
chr22 31903282 31903303
Expand All @@ -74,32 +75,34 @@ chr22 49494347 49494368
chr3 11399 11532
chr3 112164423 112164444
chr3 138126516 138126537
chr3 139998683 139998704
chr3 184714589 184714610
chr3 192188379 192188404
chr4 1429347 1434609
chr4 8631479 8637158
chr5 36023169 36023190
chr5 39360014 39360035
chr4 37407608 37553796
chr5 20023760 20023791
chr5 36023169 39360035
chr5 71009525 71015235
chr5 172418408 172418429
chr5 172626365 172626386
chr5 172418408 172626386
chr5 176031036 176031057
chr6 51982887 100746170
chr6 131707570 131707591
chr6 157972782 157972814
chr7 4824638 4824659
chr7 7802502 7802523
chr7 361589 361610
chr7 4824637 7802523
chr7 23502979 23503000
chr7 68922678 68922699
chr7 75988853 75988924
chr7 144186929 144186950
chr7 158151440 158151461
chr8 48193098 48193205
chr8 77023596 77040442
chr8 99361739 99486153
chr8 143669349 143669370
chr9 104414084 104414105
chr9 136535716 136535737
chrX 90627638 90627659
chrX 95207229 95207250
chrX 90627637 95207250
chrX 95810791 95810812
chrX 95812857 95812878
chrX 125901244 125901387
Expand Down
Binary file modified repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd/fn.vcf.gz.tbi
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd/fp.vcf.gz.tbi
Binary file not shown.
57 changes: 29 additions & 28 deletions repo_utils/answer_key/bench/bench_bnd/log.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
2025-01-08 20:16:32,128 [INFO] Truvari v5.0.0
2025-01-08 20:16:32,129 [INFO] Command /Users/english/code/truvari/truvari/__main__.py bench -b repo_utils/test_files/variants/bnd.base.vcf.gz -c repo_utils/test_files/variants/bnd.comp.vcf.gz -p 0 -o test_results/bench_bnd/ --no-decompose
2025-01-08 20:16:32,129 [INFO] Params:
2025-01-09 21:35:26,687 [INFO] Truvari v5.0.0
2025-01-09 21:35:26,688 [INFO] Command /data/truvari/__main__.py bench -b repo_utils/test_files/variants/bnd.base.vcf.gz -c repo_utils/test_files/variants/bnd.comp.vcf.gz -p 0 -o test_results/bench_bnd/ --no-decompose
2025-01-09 21:35:26,688 [INFO] Params:
{
"base": "/Users/english/code/truvari/repo_utils/test_files/variants/bnd.base.vcf.gz",
"comp": "/Users/english/code/truvari/repo_utils/test_files/variants/bnd.comp.vcf.gz",
"base": "/data/repo_utils/test_files/variants/bnd.base.vcf.gz",
"comp": "/data/repo_utils/test_files/variants/bnd.comp.vcf.gz",
"output": "test_results/bench_bnd/",
"includebed": null,
"extend": 0,
Expand All @@ -22,7 +22,7 @@
"bnddist": 100,
"sizemin": 50,
"sizefilt": 30,
"sizemax": 50000,
"sizemax": -1,
"passonly": false,
"no_ref": false,
"pick": "single",
Expand All @@ -33,30 +33,31 @@
"write_resolved": false,
"decompose": false,
"short_circuit": false,
"skip_gt": false
"skip_gt": false,
"max_resolve": 25000
}
2025-01-08 20:16:32,143 [WARNING] 193 contigs present in comparison VCF header are not in baseline VCF.
2025-01-08 20:16:32,220 [INFO] Zipped 439 variants Counter({'comp': 243, 'base': 196})
2025-01-08 20:16:32,220 [INFO] 202 chunks of 439 variants Counter({'comp': 190, 'base': 135, '__filtered': 114})
2025-01-08 20:16:32,268 [INFO] Stats: {
"TP-base": 80,
"TP-comp": 80,
"FP": 88,
"FN": 55,
"precision": 0.47619047619047616,
"recall": 0.5925925925925926,
"f1": 0.528052805280528,
"base cnt": 135,
"comp cnt": 168,
"TP-comp_TP-gt": 64,
"TP-comp_FP-gt": 16,
"TP-base_TP-gt": 64,
"TP-base_FP-gt": 16,
"gt_concordance": 0.8,
2025-01-09 21:35:26,723 [WARNING] 193 contigs present in comparison VCF header are not in baseline VCF.
2025-01-09 21:35:26,898 [INFO] Zipped 439 variants Counter({'comp': 243, 'base': 196})
2025-01-09 21:35:26,899 [INFO] 223 chunks of 439 variants Counter({'comp': 243, 'base': 186, '__filtered': 10})
2025-01-09 21:35:26,974 [INFO] Stats: {
"TP-base": 119,
"TP-comp": 119,
"FP": 102,
"FN": 67,
"precision": 0.5384615384615384,
"recall": 0.6397849462365591,
"f1": 0.5847665847665847,
"base cnt": 186,
"comp cnt": 221,
"TP-comp_TP-gt": 99,
"TP-comp_FP-gt": 20,
"TP-base_TP-gt": 99,
"TP-base_FP-gt": 20,
"gt_concordance": 0.8319327731092437,
"gt_matrix": {
"(0, 1)": {
"(0, 1)": 59,
"(0, 0)": 15,
"(0, 1)": 94,
"(0, 0)": 19,
"(1, 1)": 1
},
"(1, 0)": {
Expand All @@ -67,4 +68,4 @@
}
}
}
2025-01-08 20:16:32,268 [INFO] Finished bench
2025-01-09 21:35:26,974 [INFO] Finished bench
2 changes: 1 addition & 1 deletion repo_utils/answer_key/bench/bench_bnd/params.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"base": "/Users/english/code/truvari/repo_utils/test_files/variants/bnd.base.vcf.gz", "comp": "/Users/english/code/truvari/repo_utils/test_files/variants/bnd.comp.vcf.gz", "output": "test_results/bench_bnd/", "includebed": null, "extend": 0, "debug": false, "reference": null, "refdist": 500, "pctseq": 0.0, "pctsize": 0.7, "pctovl": 0.0, "typeignore": false, "no_roll": true, "chunksize": 1000, "bSample": "HG008-T", "cSample": "HG008_T_hiphase.haplotagged", "dup_to_ins": false, "bnddist": 100, "sizemin": 50, "sizefilt": 30, "sizemax": 50000, "passonly": false, "no_ref": false, "pick": "single", "ignore_monref": true, "check_multi": true, "check_monref": true, "no_single_bnd": true, "write_resolved": false, "decompose": false, "short_circuit": false, "skip_gt": false}
{"base": "/data/repo_utils/test_files/variants/bnd.base.vcf.gz", "comp": "/data/repo_utils/test_files/variants/bnd.comp.vcf.gz", "output": "test_results/bench_bnd/", "includebed": null, "extend": 0, "debug": false, "reference": null, "refdist": 500, "pctseq": 0.0, "pctsize": 0.7, "pctovl": 0.0, "typeignore": false, "no_roll": true, "chunksize": 1000, "bSample": "HG008-T", "cSample": "HG008_T_hiphase.haplotagged", "dup_to_ins": false, "bnddist": 100, "sizemin": 50, "sizefilt": 30, "sizemax": -1, "passonly": false, "no_ref": false, "pick": "single", "ignore_monref": true, "check_multi": true, "check_monref": true, "no_single_bnd": true, "write_resolved": false, "decompose": false, "short_circuit": false, "skip_gt": false, "max_resolve": 25000}
32 changes: 16 additions & 16 deletions repo_utils/answer_key/bench/bench_bnd/summary.json
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
{
"TP-base": 80,
"TP-comp": 80,
"FP": 88,
"FN": 55,
"precision": 0.47619047619047616,
"recall": 0.5925925925925926,
"f1": 0.528052805280528,
"base cnt": 135,
"comp cnt": 168,
"TP-comp_TP-gt": 64,
"TP-comp_FP-gt": 16,
"TP-base_TP-gt": 64,
"TP-base_FP-gt": 16,
"gt_concordance": 0.8,
"TP-base": 119,
"TP-comp": 119,
"FP": 102,
"FN": 67,
"precision": 0.5384615384615384,
"recall": 0.6397849462365591,
"f1": 0.5847665847665847,
"base cnt": 186,
"comp cnt": 221,
"TP-comp_TP-gt": 99,
"TP-comp_FP-gt": 20,
"TP-base_TP-gt": 99,
"TP-base_FP-gt": 20,
"gt_concordance": 0.8319327731092437,
"gt_matrix": {
"(0, 1)": {
"(0, 1)": 59,
"(0, 0)": 15,
"(0, 1)": 94,
"(0, 0)": 19,
"(1, 1)": 1
},
"(1, 0)": {
Expand Down
Binary file modified repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd/tp-base.vcf.gz.tbi
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd/tp-comp.vcf.gz.tbi
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/fn.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/fn.vcf.gz.tbi
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/fp.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/fp.vcf.gz.tbi
Binary file not shown.
12 changes: 7 additions & 5 deletions truvari/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,20 @@ def parse_args(args):
help="Min reciprocal overlap (%(default)s)")
thresg.add_argument("-t", "--typeignore", action="store_true", default=defaults.typeignore,
help="Don't compare variant types (%(default)s)")
thresg.add_argument("--no-roll", action="store_false",
thresg.add_argument("-n", "--no-roll", action="store_false",
help="Turn off rolling sequence similarity")
thresg.add_argument("--pick", type=str, default=defaults.pick, choices=PICKERS.keys(),
help="Number of matches reported per-call (%(default)s)")
thresg.add_argument("--dup-to-ins", action="store_true",
thresg.add_argument("-d", "--dup-to-ins", action="store_true",
help="Assume DUP svtypes are INS (%(default)s)")
thresg.add_argument("-B", "--bnddist", type=int, default=defaults.bnddist,
help="Maximum distance allowed between BNDs (%(default)s; -1=off)")
thresg.add_argument("-C", "--chunksize", type=truvari.restricted_int, default=defaults.chunksize,
help="Max reference distance to compare calls (%(default)s)")
thresg.add_argument("--no-decompose", action="store_false",
thresg.add_argument("-N", "--no-decompose", action="store_false",
help="Disallow symbolic variant decomposition to BNDs")
thresg.add_argument("-m", "--max-resolve", type=int, default=defaults.max_resolve,
help="Maximum size of variant to attempt to sequence resolve ($(default)s)")

genoty = parser.add_argument_group("Genotype Comparison Arguments")
genoty.add_argument("--bSample", type=str, default=None,
Expand All @@ -80,8 +82,8 @@ def parse_args(args):
help="Minimum variant size to consider from --comp (%(default)s)")
filteg.add_argument("-S", "--sizefilt", type=truvari.restricted_int, default=None,
help="Minimum variant size to consider from --base (30)")
filteg.add_argument("--sizemax", type=truvari.restricted_int, default=defaults.sizemax,
help="Maximum variant size to consider (%(default)s)")
filteg.add_argument("--sizemax", type=int, default=defaults.sizemax,
help="Maximum variant size to consider (-1 = off)")
filteg.add_argument("--no-ref", default=defaults.no_ref, choices=['a', 'b', 'c'],
help="Exclude 0/0 or ./. GT calls from all (a), base (b), or comp (c) vcfs (%(default)s)")
filteg.add_argument("--includebed", type=str, default=None,
Expand Down
6 changes: 5 additions & 1 deletion truvari/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,8 +509,12 @@ def parse_args(args):
help="Min pct reciprocal overlap (%(default)s) for DEL events")
thresg.add_argument("-t", "--typeignore", action="store_true", default=False,
help="Variant types don't need to match to compare (%(default)s)")
thresg.add_argument("--no-roll", action="store_false",
thresg.add_argument("-n", "--no-roll", action="store_false",
help="Turn off rolling sequence similarity")
thresg.add_argument("-m", "--max-resolve", type=truvari.restricted_int, default=25000,
help="Maximum size of variant to attempt to sequence resolve ($(default)s)")
thresg.add_argument("-D", "--decompose", action="store_true",
help="Allow decomposition for SV to BND comparison (%(default)s)")

parser.add_argument("--hap", action="store_true", default=False,
help="Collapsing a single individual's haplotype resolved calls (%(default)s)")
Expand Down
12 changes: 2 additions & 10 deletions truvari/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,6 @@ def chunker(params, *files):
cur_chrom = None
cur_end = 0
cur_chunk = defaultdict(list)
unresolved_warned = False
reference = pysam.FastaFile(params.reference) if params.reference is not None else None

for key, entry in file_zipper(*files):
Expand All @@ -182,15 +181,8 @@ def chunker(params, *files):
continue

# check symbolic, resolve if needed/possible
if params.pctseq != 0 and entry.alts[0].startswith('<'):
was_resolved = entry.resolve(reference)
if not was_resolved:
if not unresolved_warned:
logging.warning("Some symbolic SVs couldn't be resolved")
unresolved_warned = True
cur_chunk['__filtered'].append(entry)
call_counts['__filtered'] += 1
continue
if entry.alts[0].startswith('<'):
entry.resolve(reference)

new_chrom = cur_chrom and entry.chrom != cur_chrom
new_chunk = cur_end and cur_end + params.chunksize < entry.start
Expand Down
6 changes: 5 additions & 1 deletion truvari/phab.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,11 +139,12 @@ def make_consensus(data, ref_fn, passonly=True, max_size=50000):
vcf_i = vcf.fetch_regions(tree, with_region=True)

# Don't make haplotypes of non-sequence resolved, non-pass (sometimes), too big variants
# Note: max_size == -1 now that we're not capping by variant size.
# pylint: disable=unnecessary-lambda-assignment
entry_filter = lambda e: \
e.is_resolved() \
and (not passonly or not e.is_filtered()) \
and (e.var_size() <= max_size)
and (max_size == -1 or e.var_size() <= max_size)
# pylint: enable=unnecessary-lambda-assignment

cur_key = None
Expand Down Expand Up @@ -347,6 +348,9 @@ def phab(var_regions, base_vcf, ref_fn, output_fn, bSamples=None, buffer=100,
:param `method`: Alignment method to use mafft or wfa
:type `method`: :class:`str`
"""
if max_size == -1 or max_size >= 50000:
logging.warning("Harmonizing variants ≥50kbp is not recommended")

logging.info("Preparing regions")
region_fn = merged_region_file(var_regions, buffer)

Expand Down
4 changes: 2 additions & 2 deletions truvari/variant_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, filename, *args, params=None, **kwargs):
:param args: Additional positional arguments to pass to pysam.VariantFile.
:param kwargs: Additional keyword arguments to pass to pysam.VariantFile.
"""
self.params = params
self.params = params if params else truvari.VariantParams()
self._vcf = pysam.VariantFile(filename, *args, **kwargs)

def __enter__(self):
Expand Down Expand Up @@ -121,7 +121,7 @@ def write(self, record):
:type record: `truvari.VariantRecord`
"""
out = record.get_record()
if self.params and self.params.write_resolved:
if self.params.write_resolved:
out.ref = record.get_ref()
out.alts = (record.get_alt(),)
self._vcf.write(out)
7 changes: 5 additions & 2 deletions truvari/variant_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class VariantParams():
* - `sizefilt`
- Minimum size filter for comparison in the "comparison" dataset. Default: 30.
* - `sizemax`
- Maximum variant size to consider. Default: 50000.
- Maximum variant size to consider. Default: -1, off.
* - `passonly`
- Whether to only consider variants with a "PASS" filter status. Default: `False`.
* - `no_ref`
Expand All @@ -63,6 +63,8 @@ class VariantParams():
- Whether to enable short-circuit logic for early exits in comparisons. Default: `False`.
* - `skip_gt`
- Whether to skip genotype comparisons. Default: `False`.
* - `max_resolve`
- Maximum size of a variant to attempt sequence resolving. Default: 25000.
"""

Expand All @@ -81,7 +83,7 @@ class VariantParams():
"bnddist": 100,
"sizemin": 50,
"sizefilt": 30,
"sizemax": 50000,
"sizemax": -1,
"passonly": False,
"no_ref": False,
"pick": "single",
Expand All @@ -93,6 +95,7 @@ class VariantParams():
"decompose": True,
"short_circuit": False,
"skip_gt": False,
"max_resolve": 25000,
}

def __init__(self, args=None, **kwargs):
Expand Down
Loading

0 comments on commit 751a207

Please sign in to comment.