Skip to content

Commit

Permalink
subchunker
Browse files Browse the repository at this point in the history
Lower the number of comparisons collapse has to do by pre-clustering
chunks by their sizesimilarity min/max in an intervaltree
  • Loading branch information
ACEnglish committed Dec 23, 2023
1 parent 7134c2f commit 564c6fe
Show file tree
Hide file tree
Showing 45 changed files with 1,364 additions and 1,344 deletions.
Binary file modified repo_utils/answer_key/bench/bench_unroll/fn.vcf.gz
Binary file not shown.
26 changes: 13 additions & 13 deletions repo_utils/answer_key/bench/bench_unroll/log.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
2023-11-01 14:02:05,211 [INFO] Truvari v4.1.1.dev0+a08d9a7.uc
2023-11-01 14:02:05,212 [INFO] Command /data/truvari/__main__.py bench -b repo_utils/test_files/variants/real_small_base.vcf.gz -c repo_utils/test_files/variants/real_small_comp.vcf.gz -o test_results/bench_unroll/
2023-11-01 14:02:05,214 [INFO] Params:
2023-12-23 20:12:24,236 [INFO] Truvari v4.2.0rc1
2023-12-23 20:12:24,237 [INFO] Command /data/truvari/__main__.py bench -b repo_utils/test_files/variants/real_small_base.vcf.gz -c repo_utils/test_files/variants/real_small_comp.vcf.gz -o test_results/bench_unroll/
2023-12-23 20:12:24,239 [INFO] Params:
{
"base": "/data/repo_utils/test_files/variants/real_small_base.vcf.gz",
"comp": "/data/repo_utils/test_files/variants/real_small_comp.vcf.gz",
Expand Down Expand Up @@ -28,9 +28,9 @@
"check_monref": true,
"check_multi": true
}
2023-11-01 14:02:05,282 [INFO] Zipped 71 variants Counter({'base': 43, 'comp': 28})
2023-11-01 14:02:05,283 [INFO] 45 chunks of 71 variants Counter({'base': 43, 'comp': 28})
2023-11-01 14:02:05,396 [INFO] Stats: {
2023-12-23 20:12:24,328 [INFO] Zipped 71 variants Counter({'base': 43, 'comp': 28})
2023-12-23 20:12:24,330 [INFO] 45 chunks of 71 variants Counter({'base': 43, 'comp': 28})
2023-12-23 20:12:24,466 [INFO] Stats: {
"TP-base": 18,
"TP-comp": 18,
"FP": 1,
Expand Down Expand Up @@ -58,12 +58,12 @@
},
"weighted": {
"sequence": {
"TP": 18.180799916386604,
"FP": 2.3630000948905945,
"FN": 24.819200083613396,
"precision": 0.8849774582310252,
"recall": 0.42280930038108383,
"f1": 0.572228916531905,
"TP": 18.588599994778633,
"FP": 1.9593999981880188,
"FN": 24.411400005221367,
"precision": 0.9046427876747761,
"recall": 0.43229302313438683,
"f1": 0.5850254924414924,
"total": 44
},
"size": {
Expand All @@ -77,4 +77,4 @@
}
}
}
2023-11-01 14:02:05,397 [INFO] Finished bench
2023-12-23 20:12:24,467 [INFO] Finished bench
12 changes: 6 additions & 6 deletions repo_utils/answer_key/bench/bench_unroll/summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@
},
"weighted": {
"sequence": {
"TP": 18.180799916386604,
"FP": 2.3630000948905945,
"FN": 24.819200083613396,
"precision": 0.8849774582310252,
"recall": 0.42280930038108383,
"f1": 0.572228916531905,
"TP": 18.588599994778633,
"FP": 1.9593999981880188,
"FN": 24.411400005221367,
"precision": 0.9046427876747761,
"recall": 0.43229302313438683,
"f1": 0.5850254924414924,
"total": 44
},
"size": {
Expand Down
Binary file modified repo_utils/answer_key/bench/bench_unroll/tp-base.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_unroll/tp-base.vcf.gz.tbi
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_unroll/tp-comp.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_unroll/tp-comp.vcf.gz.tbi
Binary file not shown.
4 changes: 2 additions & 2 deletions repo_utils/answer_key/collapse/input1_collapsed.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -885,7 +885,7 @@ chr20 419860 . AGTGACCCTGCACCTGGCT A 60 . QNAME=HG002-S9-H2-000001F;QSTART=37411
chr20 420228 . A C 60 . QNAME=HG002-S9-H2-000001F;QSTART=374466;QSTRAND=+ GT:PL:DP 0|1:2,3,6:24,29
chr20 420465 . A AG 60 . QNAME=HG002-S9-H2-000001F;QSTART=374704;QSTRAND=+ GT:PL:DP 0|1:6,10,1:32,35
chr20 420561 . A T 60 . QNAME=HG002-S9-H1-000001F;QSTART=399939;QSTRAND=+ GT:PL:DP 1|1:5,6,7:36,12
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=4.0 GT:PL 0/1:.
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=6.0 GT:PL 0/1:.
chr20 421409 . G A 60 . QNAME=HG002-S9-H1-000001F;QSTART=401013;QSTRAND=+ GT:PL:DP 1|1:2,3,7:14,41
chr20 421527 . T C 60 . QNAME=HG002-S9-H2-000001F;QSTART=375993;QSTRAND=+ GT:PL:DP 0|1:3,8,4:49,42
chr20 422066 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=401670;QSTRAND=+ GT:PL:DP 1|1:3,7,8:9,39
Expand Down Expand Up @@ -1257,7 +1257,7 @@ chr20 639104 . A AT 60 . QNAME=HG002-S9-H1-000001F;QSTART=618555;QSTRAND=+ GT:PL
chr20 640046 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619497;QSTRAND=+ GT:PL:DP 1|0:10,10,9:11,48
chr20 640049 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619500;QSTRAND=+ GT:PL:DP 1|1:2,3,4:13,44
chr20 641878 . C G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621329;QSTRAND=+ GT:PL:DP 1|0:8,1,7:7,13
chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66;NumCollapsed=1;NumConsolidated=0;CollapseId=6.0 GT:PL:DP 1/0:7,5,7:34,13
chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66;NumCollapsed=1;NumConsolidated=0;CollapseId=10.0 GT:PL:DP 1/0:7,5,7:34,13
chr20 641944 . GGA G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621462;QSTRAND=+ GT:PL:DP 1|0:6,6,6:17,7
chr20 642012 . GGT G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621528;QSTRAND=+ GT:PL:DP 1|0:5,7,1:6,23
chr20 642037 . T TG 60 . QNAME=HG002-S9-H1-000001F;QSTART=621551;QSTRAND=+ GT:PL:DP 1|0:4,10,10:14,47
Expand Down
4 changes: 2 additions & 2 deletions repo_utils/answer_key/collapse/input1_median_collapsed.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -888,7 +888,7 @@ chr20 419860 . AGTGACCCTGCACCTGGCT A 60 . QNAME=HG002-S9-H2-000001F;QSTART=37411
chr20 420228 . A C 60 . QNAME=HG002-S9-H2-000001F;QSTART=374466;QSTRAND=+ GT:PL:DP 0|1:2,3,6:24,29
chr20 420465 . A AG 60 . QNAME=HG002-S9-H2-000001F;QSTART=374704;QSTRAND=+ GT:PL:DP 0|1:6,10,1:32,35
chr20 420561 . A T 60 . QNAME=HG002-S9-H1-000001F;QSTART=399939;QSTRAND=+ GT:PL:DP 1|1:5,6,7:36,12
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=4.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:.
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCCATCCCCCGTCCGCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 61 . QNAME=HG002-S9-H2-000001F;QSTART=374905;QSTRAND=+;SVTYPE=INS;SVLEN=227;NumCollapsed=1;NumConsolidated=0;CollapseId=6.0;CollapseStart=420664;CollapseEnd=420665;CollapseSize=226 GT:PL 0/1:.
chr20 421409 . G A 60 . QNAME=HG002-S9-H1-000001F;QSTART=401013;QSTRAND=+ GT:PL:DP 1|1:2,3,7:14,41
chr20 421527 . T C 60 . QNAME=HG002-S9-H2-000001F;QSTART=375993;QSTRAND=+ GT:PL:DP 0|1:3,8,4:49,42
chr20 422066 . A G 60 . QNAME=HG002-S9-H1-000001F;QSTART=401670;QSTRAND=+ GT:PL:DP 1|1:3,7,8:9,39
Expand Down Expand Up @@ -1260,7 +1260,7 @@ chr20 639104 . A AT 60 . QNAME=HG002-S9-H1-000001F;QSTART=618555;QSTRAND=+ GT:PL
chr20 640046 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619497;QSTRAND=+ GT:PL:DP 1|0:10,10,9:11,48
chr20 640049 . C T 60 . QNAME=HG002-S9-H1-000001F;QSTART=619500;QSTRAND=+ GT:PL:DP 1|1:2,3,4:13,44
chr20 641878 . C G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621329;QSTRAND=+ GT:PL:DP 1|0:8,1,7:7,13
chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66;NumCollapsed=1;NumConsolidated=0;CollapseId=6.0;CollapseStart=642120;CollapseEnd=642121;CollapseSize=66 GT:PL:DP 1/0:7,5,7:34,13
chr20 641913 . G GGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGGGCCCAGCGGGGGTGGAGTTGCCTGGGGGGGGA 60 . QNAME=HG002-S9-H1-000001F;QSTART=621365;QSTRAND=+;SVTYPE=INS;SVLEN=66;NumCollapsed=1;NumConsolidated=0;CollapseId=10.0;CollapseStart=642120;CollapseEnd=642121;CollapseSize=66 GT:PL:DP 1/0:7,5,7:34,13
chr20 641944 . GGA G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621462;QSTRAND=+ GT:PL:DP 1|0:6,6,6:17,7
chr20 642012 . GGT G 60 . QNAME=HG002-S9-H1-000001F;QSTART=621528;QSTRAND=+ GT:PL:DP 1|0:5,7,1:6,23
chr20 642037 . T TG 60 . QNAME=HG002-S9-H1-000001F;QSTART=621551;QSTRAND=+ GT:PL:DP 1|0:4,10,10:14,47
Expand Down
4 changes: 2 additions & 2 deletions repo_utils/answer_key/collapse/input1_median_removed.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,5 @@
##INFO=<ID=MatchId,Number=1,Type=String,Description="Id to help tie base/comp calls together {chunkid}.{baseid}.{compid}">
##INFO=<ID=Multi,Number=0,Type=Flag,Description="Call is false due to non-multimatching">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;GTMatch=.;TruScore=99;MatchId=4.0 GT:PL:DP 1/0:4,8,6:32,9
chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66;PctSeqSimilarity=0.9576;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-417;EndDistance=-417;GTMatch=.;TruScore=65;MatchId=6.0 GT:PL:DP 0/1:6,1,9:20,25
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;GTMatch=.;TruScore=99;MatchId=6.0 GT:PL:DP 1/0:4,8,6:32,9
chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66;PctSeqSimilarity=0.9576;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-417;EndDistance=-417;GTMatch=.;TruScore=65;MatchId=10.0 GT:PL:DP 0/1:6,1,9:20,25
4 changes: 2 additions & 2 deletions repo_utils/answer_key/collapse/input1_removed.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,5 @@
##INFO=<ID=MatchId,Number=1,Type=String,Description="Id to help tie base/comp calls together {chunkid}.{baseid}.{compid}">
##INFO=<ID=Multi,Number=0,Type=Flag,Description="Call is false due to non-multimatching">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA24385
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;GTMatch=.;TruScore=99;MatchId=4.0 GT:PL:DP 1/0:4,8,6:32,9
chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66;PctSeqSimilarity=0.9576;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-417;EndDistance=-417;GTMatch=.;TruScore=65;MatchId=6.0 GT:PL:DP 0/1:6,1,9:20,25
chr20 420665 . G GCCCACCCCATCCCCCGTCCCCATCCCCCATCCCCCGTCCCCCGTCCCCATCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCATCCCCCGTCCCCCGTCCCCCATCCCATCCCCCACCCCCATCCCCCGTCCCCCGTCCCCATCCCCCATCCCCCATCCCCATCCCCCGTCCCCCGTCCCCCATCTCCTGTCCCCCGTCCCCCATCCCCCGTCCCCATCCCCCACC 60 . QNAME=HG002-S9-H1-000001F;QSTART=400044;QSTRAND=+;SVTYPE=INS;SVLEN=226;PctSeqSimilarity=0.9956;PctSizeSimilarity=0.9956;PctRecOverlap=1;SizeDiff=1;StartDistance=0;EndDistance=0;GTMatch=.;TruScore=99;MatchId=6.0 GT:PL:DP 1/0:4,8,6:32,9
chr20 642330 . G GGCCCAGCGGGGGTGGAGTTGCCTGTGGTGGGGGGCCCAGCGGGGGTGGAGTTGCCTGTGGGGGGGC 60 . QNAME=HG002-S9-H2-000001F;QSTART=597257;QSTRAND=+;SVTYPE=INS;SVLEN=66;PctSeqSimilarity=0.9576;PctSizeSimilarity=1;PctRecOverlap=0;SizeDiff=0;StartDistance=-417;EndDistance=-417;GTMatch=.;TruScore=65;MatchId=10.0 GT:PL:DP 0/1:6,1,9:20,25
Loading

0 comments on commit 564c6fe

Please sign in to comment.