Skip to content

Commit

Permalink
More API improvements
Browse files Browse the repository at this point in the history
Matcher methods make more sense inside VariantRecords.
Flattened Matcher so that matcher.params.abc becomes matcher.abc.
VariantFile can be sent a Matcher so that users don't have to tote it
around.
Some progress on documentation
  • Loading branch information
ACEnglish committed Jan 7, 2025
1 parent e4e6716 commit 07dd27b
Show file tree
Hide file tree
Showing 8 changed files with 330 additions and 199 deletions.
40 changes: 40 additions & 0 deletions docs/api/truvari.examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,45 @@ statements).
print(entry.info['SVTYPE']) # pysam access to the variant's INFO fields
print(entry.allele_freq_annos()) # truvari calculation of a variant's allele frequency
# pysam GT access
if 'GT' in entry.samples['SAMPLE']:
print(entry.samples[0]['GT'])
# truvari GT access
print(entry.gt('SAMPLE'))
Details of all available functions are in :ref:`package documentation <variant_handling>`

Besides some helpful accession functions, the `truvari.VariantRecord` also makes comparing two VCF entries easy.

.. code-block:: python
# Given two `truvari.VariantRecords`, entry1 and entry2
match = entry1.match(entry2)
print("Entries' Sequence Similarity:", match.seqsim)
print("Entries' Size Similarity:", match.sizesim)
print("Is the match above thresholds:", match.state)
This returns a `truvari.MatchResult`. We can customize the thresholds for matching by giving the `truvari.VariantFile` a
`truvari.Matcher`.

.. code-block:: python
# Turn off sequence and size similarity. Turn on reciprocal overlap
matcher = truvari.Matcher(seqsim=0, sizesim=0, recovl=0.5, ...)
vcf = truvari.VariantFile("input.vcf.gz", matcher=matcher)
entry1 = next(vcf)
entry2 = next(vcf)
match = entry1.match(entry2)
Another useful function is a quick way filter variants. The `truvari.Matcher` has parameters for e.g. minimum or maximum
size of SVs one wants to analyze which can be leveraged via:

.. code-block:: python
matcher = truvari.Matcher(sizemin=200, sizemax=500)
vcf = truvari.VariantFile("input.vcf.gz", matcher=matcher)
# Grab all of the variant records between sizemin and sizemax
results = [entry for entry in vcf if not entry.size_filter()]
Additional filtering for things such as monomorphic reference sites or single-end BNDs are available by calling `entry.filter_call()`
9 changes: 3 additions & 6 deletions repo_utils/run_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,13 +75,10 @@
"""
Filtering logic
"""
vcf = truvari.VariantFile("repo_utils/test_files/variants/filter.vcf")
matcher = truvari.Matcher()
matcher.params.sizemin = 0
matcher.params.sizefilt = 0
matcher.params.passonly = True
matcher = truvari.Matcher(sizemin=0, sizefilt=0, passonly=True)
vcf = truvari.VariantFile("repo_utils/test_files/variants/filter.vcf", matcher=matcher)
for entry in vcf:
try:
assert matcher.filter_call(entry), f"Didn't filter {str(entry)}"
assert entry.filter_call(), f"Didn't filter {str(entry)}"
except ValueError as e:
assert e.args[0].startswith("Cannot compare multi-allelic"), f"Unknown exception {str(entry)}"
8 changes: 1 addition & 7 deletions truvari/annotations/chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,7 @@ def chunks_main(args):
"""
args = parse_args(args)
v = truvari.VariantFile(args.input)
m = truvari.Matcher()
m.params.pctseq = 0
m.params.sizemin = args.sizemin
m.params.sizefilt = args.sizemin
m.params.sizemax = args.sizemax
m.params.chunksize = args.chunksize
m.params.refdist = args.chunksize
m = truvari.Matcher(args=args, pctseq=0)
if args.bed:
v = v.bed_fetch(args.bed)
c = truvari.chunker(m, ('base', v))
Expand Down
51 changes: 22 additions & 29 deletions truvari/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ def __init__(self, bench, matcher):

os.mkdir(self.m_bench.outdir)
param_dict = self.m_bench.param_dict()
param_dict.update(vars(self.m_matcher.params))
param_dict.update(vars(self.m_matcher))

if self.m_bench.do_logging:
truvari.setup_logging(self.m_bench.debug, truvari.LogFileStderr(
Expand All @@ -333,8 +333,8 @@ def __init__(self, bench, matcher):
with open(os.path.join(self.m_bench.outdir, 'params.json'), 'w') as fout:
json.dump(param_dict, fout)

b_vcf = truvari.VariantFile(self.m_bench.base_vcf)
c_vcf = truvari.VariantFile(self.m_bench.comp_vcf)
b_vcf = truvari.VariantFile(self.m_bench.base_vcf, matcher=self.m_matcher)
c_vcf = truvari.VariantFile(self.m_bench.comp_vcf, matcher=self.m_matcher)
self.n_headers = {'b': edit_header(b_vcf),
'c': edit_header(c_vcf)}

Expand All @@ -345,14 +345,14 @@ def __init__(self, bench, matcher):
self.out_vcfs = {}
for key in ['tpb', 'fn']:
self.out_vcfs[key] = truvari.VariantFile(
self.vcf_filenames[key], mode='w', header=self.n_headers['b'])
self.vcf_filenames[key], mode='w', header=self.n_headers['b'], matcher=self.m_matcher)
for key in ['tpc', 'fp']:
self.out_vcfs[key] = truvari.VariantFile(
self.vcf_filenames[key], mode='w', header=self.n_headers['c'])
self.vcf_filenames[key], mode='w', header=self.n_headers['c'], matcher=self.m_matcher)

self.stats_box = StatsBox()

def write_match(self, match, resolved=False):
def write_match(self, match):
"""
Annotate a MatchResults' entries then write to the apppropriate file
and do the stats counting.
Expand All @@ -368,30 +368,30 @@ def write_match(self, match, resolved=False):
box["gt_matrix"][gtBase][gtComp] += 1

box["TP-base"] += 1
self.out_vcfs["tpb"].write(match.base, resolved)
self.out_vcfs["tpb"].write(match.base)
if match.gt_match == 0:
box["TP-base_TP-gt"] += 1
else:
box["TP-base_FP-gt"] += 1
else:
box["FN"] += 1
self.out_vcfs["fn"].write(match.base, resolved)
self.out_vcfs["fn"].write(match.base)

if match.comp:
annotate_entry(match.comp, match, self.n_headers['c'])
if match.state:
box["comp cnt"] += 1
box["TP-comp"] += 1
self.out_vcfs["tpc"].write(match.comp, resolved)
self.out_vcfs["tpc"].write(match.comp)
if match.gt_match == 0:
box["TP-comp_TP-gt"] += 1
else:
box["TP-comp_FP-gt"] += 1
elif match.comp.size() >= self.m_matcher.params.sizemin:
elif match.comp.size() >= self.m_matcher.sizemin:
# The if is because we don't count FPs between sizefilt-sizemin
box["comp cnt"] += 1
box["FP"] += 1
self.out_vcfs["fp"].write(match.comp, resolved)
self.out_vcfs["fp"].write(match.comp)

def close_outputs(self):
"""
Expand Down Expand Up @@ -422,8 +422,7 @@ class Bench():
.. code-block:: python
matcher = truvari.Matcher()
matcher.params.pctseq = 0.50
matcher = truvari.Matcher(pctseq=0.50)
m_bench = truvari.Bench(matcher)
To run on a chunk of :class:`truvari.VariantRecord` already loaded, pass them in as lists to:
Expand Down Expand Up @@ -451,10 +450,8 @@ class Bench():
However, the returned `BenchOutput` has attributes pointing to all the results.
"""

#pylint: disable=too-many-arguments
def __init__(self, matcher=None, base_vcf=None, comp_vcf=None, outdir=None,
includebed=None, extend=0, debug=False, do_logging=False, short_circuit=False,
write_resolved=False):
includebed=None, extend=0, debug=False, do_logging=False, short_circuit=False):
"""
Initilize
"""
Expand All @@ -468,8 +465,6 @@ def __init__(self, matcher=None, base_vcf=None, comp_vcf=None, outdir=None,
self.do_logging = do_logging
self.short_circuit = short_circuit
self.refine_candidates = []
self.write_resolved = write_resolved
#pylint: enable=too-many-arguments

def param_dict(self):
"""
Expand All @@ -492,8 +487,8 @@ def run(self):

output = BenchOutput(self, self.matcher)

base = truvari.VariantFile(self.base_vcf)
comp = truvari.VariantFile(self.comp_vcf)
base = truvari.VariantFile(self.base_vcf, matcher=self.matcher)
comp = truvari.VariantFile(self.comp_vcf, matcher=self.matcher)

region_tree = truvari.build_region_tree(base, comp, self.includebed)
truvari.merge_region_tree_overlaps(region_tree)
Expand All @@ -513,7 +508,7 @@ def run(self):
and not match.state
and not match.comp.within_tree(region_tree)):
match.comp = None
output.write_match(match, self.write_resolved)
output.write_match(match)

with open(os.path.join(self.outdir, 'candidate.refine.bed'), 'w') as fout:
fout.write("\n".join(self.refine_candidates))
Expand All @@ -531,7 +526,7 @@ def compare_chunk(self, chunk):
chunk_dict["base"], chunk_dict["comp"], chunk_id)
self.check_refine_candidate(result)
# Check BNDs separately
if self.matcher.params.bnddist != -1 and (chunk_dict['base_BND'] or chunk_dict['comp_BND']):
if self.matcher.bnddist != -1 and (chunk_dict['base_BND'] or chunk_dict['comp_BND']):
result.extend(self.compare_calls(chunk_dict['base_BND'],
chunk_dict['comp_BND'], chunk_id))
return result
Expand Down Expand Up @@ -584,7 +579,7 @@ def compare_calls(self, base_variants, comp_variants, chunk_id=0):
base_variants, comp_variants, chunk_id)
if isinstance(match_matrix, list):
return match_matrix
return PICKERS[self.matcher.params.pick](match_matrix)
return PICKERS[self.matcher.pick](match_matrix)

def build_matrix(self, base_variants, comp_variants, chunk_id=0, skip_gt=False):
"""
Expand All @@ -597,8 +592,7 @@ def build_matrix(self, base_variants, comp_variants, chunk_id=0, skip_gt=False):
for bid, base in enumerate(base_variants):
base_matches = []
for cid, comp in enumerate(comp_variants):
mat = base.match(comp, matcher=self.matcher,
skip_gt=skip_gt, short_circuit=self.short_circuit)
mat = base.match(comp, skip_gt=skip_gt, short_circuit=self.short_circuit)
mat.matid = [f"{chunk_id}.{bid}", f"{chunk_id}.{cid}"]
logging.debug("Made mat -> %s", mat)
base_matches.append(mat)
Expand All @@ -615,14 +609,14 @@ def check_refine_candidate(self, result):
chrom = None
for match in result:
has_unmatched |= not match.state
if match.base is not None and match.base.size() >= self.matcher.params.sizemin:
if match.base is not None and match.base.size() >= self.matcher.sizemin:
chrom = match.base.chrom
pos.extend(match.base.boundaries())
if match.comp is not None:
chrom = match.comp.chrom
pos.extend(match.comp.boundaries())
if has_unmatched and pos:
# min(10, self.matcher.params.chunksize) need to make sure the refine covers the region
# min(10, self.matcher.chunksize) need to make sure the refine covers the region
buf = 10
start = max(0, min(*pos) - buf)
self.refine_candidates.append(
Expand Down Expand Up @@ -783,8 +777,7 @@ def bench_main(cmdargs):
extend=args.extend,
debug=args.debug,
do_logging=True,
short_circuit=args.short,
write_resolved=args.write_resolved)
short_circuit=args.short)
output = m_bench.run()

logging.info("Stats: %s", json.dumps(output.stats_box, indent=4))
Expand Down
Loading

0 comments on commit 07dd27b

Please sign in to comment.