From c42ff8a8a1b6b425d9a25b4115051ec833ca40d0 Mon Sep 17 00:00:00 2001 From: Adam English Date: Tue, 7 Jan 2025 00:03:01 -0500 Subject: [PATCH] code clean VariantParams better reflects its use. Split apart variants.py as the code was getting way too long --- docs/api/conf.py | 2 +- docs/api/truvari.examples.rst | 11 +- docs/api/truvari.package.rst | 6 +- repo_utils/run_doctests.py | 6 +- repo_utils/run_unittest.py | 4 +- truvari/__init__.py | 10 +- truvari/annotations/bpovl.py | 2 +- truvari/annotations/chunks.py | 2 +- truvari/annotations/hompct.py | 4 +- truvari/annotations/numneigh.py | 2 +- truvari/annotations/remap.py | 2 +- truvari/annotations/repmask.py | 4 +- truvari/annotations/svinfo.py | 2 +- truvari/annotations/trf.py | 6 +- truvari/bench.py | 53 +-- truvari/collapse.py | 86 ++-- truvari/matching.py | 94 +--- truvari/phab.py | 2 +- truvari/refine.py | 4 +- truvari/variant_file.py | 117 +++++ truvari/variant_params.py | 86 ++++ truvari/{variants.py => variant_record.py} | 476 +++++++++------------ truvari/vcf2df.py | 2 +- 23 files changed, 508 insertions(+), 475 deletions(-) create mode 100644 truvari/variant_file.py create mode 100644 truvari/variant_params.py rename truvari/{variants.py => variant_record.py} (80%) diff --git a/docs/api/conf.py b/docs/api/conf.py index 9384a268..67a39caa 100644 --- a/docs/api/conf.py +++ b/docs/api/conf.py @@ -17,7 +17,7 @@ # -- Project information ----------------------------------------------------- project = 'Truvari' -copyright = '2023, Adam English' +copyright = '2025, Adam English' author = 'Adam English' diff --git a/docs/api/truvari.examples.rst b/docs/api/truvari.examples.rst index 04886f4e..bba2bd4f 100644 --- a/docs/api/truvari.examples.rst +++ b/docs/api/truvari.examples.rst @@ -32,12 +32,12 @@ The `truvari.VariantRecord` simplifies comparing two VCF entries. print("Entries' Size Similarity:", match.sizesim) print("Is the match above thresholds:", match.state) -This returns a `truvari.MatchResult`. You can customize matching thresholds by providing a `truvari.Matcher` to the `truvari.VariantFile`. +This returns a `truvari.MatchResult`. You can customize matching thresholds by providing a `truvari.VariantParams` to the `truvari.VariantFile`. .. code-block:: python # Disable sequence and size similarity; enable reciprocal overlap - matcher = truvari.Matcher(seqsim=0, sizesim=0, recovl=0.5) + matcher = truvari.VariantParams(seqsim=0, sizesim=0, recovl=0.5) vcf = truvari.VariantFile("input.vcf.gz", matcher=matcher) entry1 = next(vcf) entry2 = next(vcf) @@ -46,11 +46,11 @@ This returns a `truvari.MatchResult`. You can customize matching thresholds by p Filtering Variants ------------------ -The `truvari.Matcher` provides parameters for filtering variants, such as minimum or maximum SV sizes. +The `truvari.VariantParams` provides parameters for filtering variants, such as minimum or maximum SV sizes. .. code-block:: python - matcher = truvari.Matcher(sizemin=200, sizemax=500) + matcher = truvari.VariantParams(sizemin=200, sizemax=500) vcf = truvari.VariantFile("input.vcf.gz", matcher=matcher) # Retrieve all variant records within sizemin and sizemax results = [entry for entry in vcf if not entry.size_filter()] @@ -65,7 +65,8 @@ To subset a VCF to regions specified in a BED file, use: .. code-block:: python for entry in vcf.bed_fetch("regions.bed"): - print(entry.var_type(), entry.size()) + print("Entry's variant type:", entry.var_type()) + print("Entry's variant size:", entry.var_size()) If your regions of interest are stored in an in-memory object instead of a BED file, use the `.regions_fetch` method: diff --git a/docs/api/truvari.package.rst b/docs/api/truvari.package.rst index 54746570..33eb4b5b 100644 --- a/docs/api/truvari.package.rst +++ b/docs/api/truvari.package.rst @@ -19,6 +19,9 @@ Variant Handling .. autoclass:: VariantRecord :members: +.. autoclass:: VariantParams + :members: + Extra Methods ------------- .. autofunction:: bed_ranges @@ -114,9 +117,6 @@ Objects .. autoclass:: MatchResult :members: -.. autoclass:: Matcher - :members: - .. autoclass:: SV :members: diff --git a/repo_utils/run_doctests.py b/repo_utils/run_doctests.py index 9a75c6df..42944685 100644 --- a/repo_utils/run_doctests.py +++ b/repo_utils/run_doctests.py @@ -13,7 +13,8 @@ comparisons, msatovcf, utils, - variants, + variant_file, + variant_record, vcf2df, ) @@ -27,7 +28,8 @@ def tester(module): return ret.failed fails = 0 -fails += tester(variants) +fails += tester(variant_file) +fails += tester(variant_record) fails += tester(comparisons) fails += tester(utils) fails += tester(vcf2df) diff --git a/repo_utils/run_unittest.py b/repo_utils/run_unittest.py index fdb96d11..3acacd44 100644 --- a/repo_utils/run_unittest.py +++ b/repo_utils/run_unittest.py @@ -75,8 +75,8 @@ """ Filtering logic """ -matcher = truvari.Matcher(sizemin=0, sizefilt=0, passonly=True) -vcf = truvari.VariantFile("repo_utils/test_files/variants/filter.vcf", matcher=matcher) +p = truvari.VariantParams(sizemin=0, sizefilt=0, passonly=True) +vcf = truvari.VariantFile("repo_utils/test_files/variants/filter.vcf", params=p) for entry in vcf: try: assert entry.filter_call(), f"Didn't filter {str(entry)}" diff --git a/truvari/__init__.py b/truvari/__init__.py index 07289e56..585a0c8b 100644 --- a/truvari/__init__.py +++ b/truvari/__init__.py @@ -10,11 +10,11 @@ :class:`GT` :class:`LogFileStderr` :class:`MatchResult` -:class:`Matcher` :class:`StatsBox` :class:`SV` :class:`VariantFile` :class:`VariantRecord` +:class:`VariantParams` Extra methods: @@ -89,7 +89,6 @@ from truvari.matching import ( MatchResult, - Matcher, chunker, file_zipper ) @@ -132,10 +131,9 @@ vcf_ranges, ) -from truvari.variants import ( - VariantFile, - VariantRecord, -) +from truvari.variant_file import VariantFile +from truvari.variant_params import VariantParams +from truvari.variant_record import VariantRecord from truvari.vcf2df import ( GT, diff --git a/truvari/annotations/bpovl.py b/truvari/annotations/bpovl.py index d12c827e..6384af97 100644 --- a/truvari/annotations/bpovl.py +++ b/truvari/annotations/bpovl.py @@ -75,7 +75,7 @@ def _transform(): if span > args.spanmax: continue - if entry.size() < args.sizemin: + if entry.var_size() < args.sizemin: continue key = entry.to_hash() diff --git a/truvari/annotations/chunks.py b/truvari/annotations/chunks.py index a964ab53..4c9e3508 100644 --- a/truvari/annotations/chunks.py +++ b/truvari/annotations/chunks.py @@ -49,7 +49,7 @@ def chunks_main(args): """ args = parse_args(args) v = truvari.VariantFile(args.input) - m = truvari.Matcher(args=args, pctseq=0) + m = truvari.VariantParams(args=args, pctseq=0) if args.bed: v = v.bed_fetch(args.bed) c = truvari.chunker(m, ('base', v)) diff --git a/truvari/annotations/hompct.py b/truvari/annotations/hompct.py index b2320890..d41b090e 100644 --- a/truvari/annotations/hompct.py +++ b/truvari/annotations/hompct.py @@ -45,7 +45,7 @@ def get_pct(chrom, start, end): tot = 0 homs = 0 for entry in v.fetch(chrom, max(0, start - args.buffer), min(v.header.contigs[chrom].length, end + args.buffer)): - if entry.size() > args.maxgt: + if entry.var_size() > args.maxgt: continue if truvari.get_gt(entry.samples[0]["GT"]).name == "HOM": homs += 1 @@ -66,7 +66,7 @@ def get_pct(chrom, start, end): out = truvari.VariantFile(args.output, 'w', header=header) v2 = truvari.VariantFile(args.input) for entry in v2: - if entry.size() >= args.minanno: + if entry.var_size() >= args.minanno: entry.translate(header) anno = get_pct(entry.chrom, *entry.boundaries()) if anno is not None: diff --git a/truvari/annotations/numneigh.py b/truvari/annotations/numneigh.py index 9392c7b2..2dbb79e9 100644 --- a/truvari/annotations/numneigh.py +++ b/truvari/annotations/numneigh.py @@ -132,7 +132,7 @@ def run(self): """ last_pos = None for entry in self.in_vcf: - size = entry.size() + size = entry.var_size() if not last_pos: last_pos = [entry.chrom, entry.start] diff --git a/truvari/annotations/remap.py b/truvari/annotations/remap.py index 5955fa1e..e07b5f81 100644 --- a/truvari/annotations/remap.py +++ b/truvari/annotations/remap.py @@ -117,7 +117,7 @@ def annotate_entry(self, entry): """ Annotates entries in the vcf and writes to new vcf """ - if entry.size() >= self.min_length: + if entry.var_size() >= self.min_length: entry.translate(self.n_header) remap, hits = self.remap_entry(entry) entry.info["REMAP"] = remap diff --git a/truvari/annotations/repmask.py b/truvari/annotations/repmask.py index bc4eecbf..3a70bf29 100644 --- a/truvari/annotations/repmask.py +++ b/truvari/annotations/repmask.py @@ -73,7 +73,7 @@ def extract_seqs(self): fh = truvari.VariantFile(self.in_vcf) for pos, entry in enumerate(fh): tot_cnt += 1 - entry_size = entry.size() + entry_size = entry.var_size() if self.min_length <= entry_size <= self.max_length: cnt += 1 cntbp += entry_size @@ -126,7 +126,7 @@ def annotate_entry(self, entry, hits): """ best_hit_pct = 0 best_hit = None - entry_size = entry.size() + entry_size = entry.var_size() for hit in hits: size_aln = abs(hit["RM_qstart"] - hit["RM_qend"]) + 1 pct = size_aln / entry_size # The TR that covers the most of the sequence diff --git a/truvari/annotations/svinfo.py b/truvari/annotations/svinfo.py index 092a3199..7688abbc 100644 --- a/truvari/annotations/svinfo.py +++ b/truvari/annotations/svinfo.py @@ -42,7 +42,7 @@ def add_svinfo(entry, min_size=0, n_header=None): del entry.info['SVTYPE'] if "SVLEN" in entry.info: del entry.info['SVLEN'] - sz = entry.size() + sz = entry.var_size() if sz < min_size: return if n_header: diff --git a/truvari/annotations/trf.py b/truvari/annotations/trf.py index bb45b1ac..27f4b5c1 100644 --- a/truvari/annotations/trf.py +++ b/truvari/annotations/trf.py @@ -181,7 +181,7 @@ def annotate(self, entry, score_filter=True): Figure out the hit and return """ svtype = entry.var_type() - sz = entry.size() + sz = entry.var_size() repeat = [] if svtype == truvari.SV.DEL: repeat = self.del_annotate(entry, sz, score_filter) @@ -396,7 +396,7 @@ def process_ref_region(region, args): continue svtype = entry.var_type() - svlen = entry.size() + svlen = entry.var_size() if svlen < args.min_length or svtype not in [truvari.SV.DEL, truvari.SV.INS]: out.write(str(edit_entry(entry, None, new_header))) continue @@ -460,7 +460,7 @@ def process_tr_region(region, args): if not (entry.start >= region["start"] and entry.stop < region["end"]): continue svtype = entry.var_type() - svlen = entry.size() + svlen = entry.var_size() if svlen < args.min_length or svtype not in [truvari.SV.DEL, truvari.SV.INS]: out.write(str(edit_entry(entry, None, new_header))) continue diff --git a/truvari/bench.py b/truvari/bench.py index afa4e564..df780bd7 100644 --- a/truvari/bench.py +++ b/truvari/bench.py @@ -25,7 +25,7 @@ def parse_args(args): """ Pull the command line parameters """ - defaults = truvari.Matcher.make_match_params() + defaults = truvari.VariantParams.make_params() parser = argparse.ArgumentParser(prog="bench", description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument("-b", "--base", type=str, required=True, @@ -314,16 +314,16 @@ class BenchOutput(): a :class:`StatsBox`. """ - def __init__(self, bench, matcher): + def __init__(self, bench, params): """ initialize """ self.m_bench = bench - self.m_matcher = matcher + self.m_params = params os.mkdir(self.m_bench.outdir) param_dict = self.m_bench.param_dict() - param_dict.update(vars(self.m_matcher)) + param_dict.update(vars(self.m_params)) if self.m_bench.do_logging: truvari.setup_logging(self.m_bench.debug, truvari.LogFileStderr( @@ -333,8 +333,8 @@ def __init__(self, bench, matcher): with open(os.path.join(self.m_bench.outdir, 'params.json'), 'w') as fout: json.dump(param_dict, fout) - b_vcf = truvari.VariantFile(self.m_bench.base_vcf, matcher=self.m_matcher) - c_vcf = truvari.VariantFile(self.m_bench.comp_vcf, matcher=self.m_matcher) + b_vcf = truvari.VariantFile(self.m_bench.base_vcf, params=self.m_params) + c_vcf = truvari.VariantFile(self.m_bench.comp_vcf, params=self.m_params) self.n_headers = {'b': edit_header(b_vcf), 'c': edit_header(c_vcf)} @@ -345,10 +345,10 @@ def __init__(self, bench, matcher): self.out_vcfs = {} for key in ['tpb', 'fn']: self.out_vcfs[key] = truvari.VariantFile( - self.vcf_filenames[key], mode='w', header=self.n_headers['b'], matcher=self.m_matcher) + self.vcf_filenames[key], mode='w', header=self.n_headers['b'], params=self.m_params) for key in ['tpc', 'fp']: self.out_vcfs[key] = truvari.VariantFile( - self.vcf_filenames[key], mode='w', header=self.n_headers['c'], matcher=self.m_matcher) + self.vcf_filenames[key], mode='w', header=self.n_headers['c'], params=self.m_params) self.stats_box = StatsBox() @@ -387,7 +387,7 @@ def write_match(self, match): box["TP-comp_TP-gt"] += 1 else: box["TP-comp_FP-gt"] += 1 - elif match.comp.size() >= self.m_matcher.sizemin: + elif match.comp.var_size() >= self.m_params.sizemin: # The if is because we don't count FPs between sizefilt-sizemin box["comp cnt"] += 1 box["FP"] += 1 @@ -417,12 +417,12 @@ class Bench(): m_bench = truvari.Bench() - If you'd like to customize the parameters, build and edit a :class:`Matcher` and pass it to the Bench init + If you'd like to customize the parameters, build and edit a :class:`VariantParams` and pass it to the Bench init .. code-block:: python - matcher = truvari.Matcher(pctseq=0.50) - m_bench = truvari.Bench(matcher) + params = truvari.VariantParams(pctseq=0.50) + m_bench = truvari.Bench(params) To run on a chunk of :class:`truvari.VariantRecord` already loaded, pass them in as lists to: @@ -442,7 +442,7 @@ class Bench(): .. code-block:: python - m_bench = truvari.Bench(matcher, base_vcf, comp_vcf, outdir) + m_bench = truvari.Bench(params, base_vcf, comp_vcf, outdir) output = m_bench.run() .. note:: @@ -450,12 +450,12 @@ class Bench(): However, the returned `BenchOutput` has attributes pointing to all the results. """ - def __init__(self, matcher=None, base_vcf=None, comp_vcf=None, outdir=None, + def __init__(self, params=None, base_vcf=None, comp_vcf=None, outdir=None, includebed=None, extend=0, debug=False, do_logging=False): """ Initilize """ - self.matcher = matcher if matcher is not None else truvari.Matcher() + self.params = params if params is not None else truvari.VariantParams() self.base_vcf = base_vcf self.comp_vcf = comp_vcf self.outdir = outdir @@ -484,10 +484,10 @@ def run(self): raise RuntimeError( "Cannot call Bench.run without base/comp vcf filenames and outdir") - output = BenchOutput(self, self.matcher) + output = BenchOutput(self, self.params) - base = truvari.VariantFile(self.base_vcf, matcher=self.matcher) - comp = truvari.VariantFile(self.comp_vcf, matcher=self.matcher) + base = truvari.VariantFile(self.base_vcf, params=self.params) + comp = truvari.VariantFile(self.comp_vcf, params=self.params) region_tree = truvari.build_region_tree(base, comp, self.includebed) truvari.merge_region_tree_overlaps(region_tree) @@ -497,8 +497,9 @@ def run(self): base_i = base.regions_fetch(region_tree) comp_i = comp.regions_fetch(regions_extended) - chunks = truvari.chunker( - self.matcher, ('base', base_i), ('comp', comp_i)) + chunks = truvari.chunker(self.params, + ('base', base_i), + ('comp', comp_i)) for match in itertools.chain.from_iterable(map(self.compare_chunk, chunks)): # setting non-matched comp variants that are not fully contained in the original regions to None # These don't count as FP or TP and don't appear in the output vcf files @@ -525,7 +526,7 @@ def compare_chunk(self, chunk): chunk_dict["base"], chunk_dict["comp"], chunk_id) self.check_refine_candidate(result) # Check BNDs separately - if self.matcher.bnddist != -1 and (chunk_dict['base_BND'] or chunk_dict['comp_BND']): + if self.params.bnddist != -1 and (chunk_dict['base_BND'] or chunk_dict['comp_BND']): result.extend(self.compare_calls(chunk_dict['base_BND'], chunk_dict['comp_BND'], chunk_id)) return result @@ -558,7 +559,7 @@ def compare_calls(self, base_variants, comp_variants, chunk_id=0): return fns # 5k variants takes too long - if self.matcher.short_circuit and (len(base_variants) + len(comp_variants)) > 5000: + if self.params.short_circuit and (len(base_variants) + len(comp_variants)) > 5000: pos = [] cnt = 0 chrom = None @@ -578,7 +579,7 @@ def compare_calls(self, base_variants, comp_variants, chunk_id=0): base_variants, comp_variants, chunk_id) if isinstance(match_matrix, list): return match_matrix - return PICKERS[self.matcher.pick](match_matrix) + return PICKERS[self.params.pick](match_matrix) def build_matrix(self, base_variants, comp_variants, chunk_id=0): """ @@ -608,7 +609,7 @@ def check_refine_candidate(self, result): chrom = None for match in result: has_unmatched |= not match.state - if match.base is not None and match.base.size() >= self.matcher.sizemin: + if match.base is not None and match.base.var_size() >= self.params.sizemin: chrom = match.base.chrom pos.extend(match.base.boundaries()) if match.comp is not None: @@ -766,9 +767,9 @@ def bench_main(cmdargs): sys.stderr.write("Couldn't run Truvari. Please fix parameters\n") sys.exit(100) - matcher = truvari.Matcher(args, short_circuit=args.short) + params = truvari.VariantParams(args, short_circuit=args.short) - m_bench = Bench(matcher=matcher, + m_bench = Bench(params=params, base_vcf=args.base, comp_vcf=args.comp, outdir=args.output, diff --git a/truvari/collapse.py b/truvari/collapse.py index af0dda48..b5f4e4d4 100644 --- a/truvari/collapse.py +++ b/truvari/collapse.py @@ -36,13 +36,13 @@ def calc_median_sizepos(self): Given a set of variants, calculate the median start, end, and size """ st, en = self.entry.boundaries() - sz = self.entry.size() + sz = self.entry.var_size() starts = [st] ends = [en] sizes = [sz] for i in self.matches: st, en = i.comp.boundaries() - sz = i.comp.size() + sz = i.comp.var_size() starts.append(st) ends.append(en) sizes.append(sz) @@ -114,14 +114,14 @@ def chain_collapse(cur_collapse, all_collapse): return False # You'll have to add it to your set of collapsed calls -def collapse_chunk(chunk, matcher): +def collapse_chunk(chunk, params): """ Returns a list of lists with [keep entry, collap matches, match_id, gt_consolidate_count] """ chunk_dict, chunk_id = chunk - remaining_calls = sorted(chunk_dict['base'], key=matcher.sorter) + remaining_calls = sorted(chunk_dict['base'], key=params.sorter) - remaining_calls.sort(key=matcher.sorter) + remaining_calls.sort(key=params.sorter) call_id = -1 ret = [] # list of Collapses while remaining_calls: @@ -130,29 +130,29 @@ def collapse_chunk(chunk, matcher): f'{chunk_id}.{call_id}') # quicker genotype comparison - needs to be refactored m_collap.genotype_mask = m_collap.make_genotype_mask( - m_collap.entry, matcher.gt) + m_collap.entry, params.gt) # Sort based on size difference to current call for candidate in sorted(remaining_calls, key=partial(relative_size_sorter, m_collap.entry)): mat = m_collap.entry.match(candidate) mat.matid = m_collap.match_id - if matcher.hap and not hap_resolve(m_collap.entry, candidate): + if params.hap and not hap_resolve(m_collap.entry, candidate): mat.state = False - if mat.state and m_collap.gt_conflict(candidate, matcher.gt): + if mat.state and m_collap.gt_conflict(candidate, params.gt): mat.state = False if mat.state: m_collap.matches.append(mat) - elif mat.sizesim is not None and mat.sizesim < matcher.pctsize: + elif mat.sizesim is not None and mat.sizesim < params.pctsize: # Can we do this? The sort tells us that we're going through most->least # similar size. So the next one will only be worse... break # Does this collap need to go into a previous collap? - if not matcher.chain or not chain_collapse(m_collap, ret): + if not params.chain or not chain_collapse(m_collap, ret): ret.append(m_collap) # If hap, only allow the best match - if matcher.hap and m_collap.matches: + if params.hap and m_collap.matches: mats = sorted(m_collap.matches, reverse=True) m_collap.matches = [mats.pop(0)] @@ -160,14 +160,14 @@ def collapse_chunk(chunk, matcher): to_rm = [_.comp for _ in m_collap.matches] remaining_calls = [_ for _ in remaining_calls if _ not in to_rm] - if matcher.no_consolidate: + if params.no_consolidate: for val in ret: - if matcher.gt != 'off': + if params.gt != 'off': edited_entry, collapse_cnt = gt_aware_consolidate( val.entry, val.matches) else: edited_entry, collapse_cnt = collapse_into_entry( - val.entry, val.matches, matcher.hap) + val.entry, val.matches, params.hap) val.entry = edited_entry val.gt_consolidate_count = collapse_cnt @@ -181,7 +181,7 @@ def relative_size_sorter(base, comp): """ Sort calls based on the absolute size difference of base and comp """ - return abs(base.size() - comp.size()) + return abs(base.var_size() - comp.var_size()) def collapse_into_entry(entry, others, hap_mode=False): @@ -389,8 +389,8 @@ def sort_length(b1, b2): """ Order entries from longest to shortest SVLEN, ties are by alphanumeric of REF """ - s1 = b1.size() - s2 = b2.size() + s1 = b1.var_size() + s2 = b2.var_size() if s1 < s2: return 1 if s1 > s2: @@ -639,7 +639,7 @@ def write(self, entry): Writes header (str) or entries (VariantRecords) """ if isinstance(entry, truvari.VariantRecord): - entry = self.consolidate(entry, entry.matcher.write_resolved) + entry = self.consolidate(entry, entry.params.write_resolved) if self.isgz: entry = entry.encode() self.fh.write(entry) @@ -656,14 +656,14 @@ class CollapseOutput(dict): Output writer for collapse """ - def __init__(self, args, matcher): + def __init__(self, args, params): """ Makes all of the output files for collapse """ super().__init__() logging.info("Params:\n%s", json.dumps(vars(args), indent=4)) - in_vcf = truvari.VariantFile(args.input, matcher=matcher) + in_vcf = truvari.VariantFile(args.input, params=params) self["o_header"] = edit_header(in_vcf, args.median_info) self["c_header"] = trubench.edit_header(in_vcf) num_samps = len(self["o_header"].samples) @@ -676,9 +676,9 @@ def __init__(self, args, matcher): args.output, self["o_header"]) else: self["output_vcf"] = truvari.VariantFile(args.output, 'w', - header=self["o_header"], matcher=matcher) + header=self["o_header"], params=params) self["collap_vcf"] = truvari.VariantFile(args.removed_output, 'w', - header=self["c_header"], matcher=matcher) + header=self["c_header"], params=params) self["stats_box"] = {"collap_cnt": 0, "kept_cnt": 0, "out_cnt": 0, "consol_cnt": 0} @@ -794,7 +794,7 @@ def merge_intervals(intervals): return merged -def tree_size_chunker(matcher, chunks): +def tree_size_chunker(params, chunks): """ To reduce the number of variants in a chunk try to sub-chunk by size-similarity before hand Needs to return the same thing as a chunker @@ -810,9 +810,9 @@ def tree_size_chunker(matcher, chunks): to_add = [] for entry in chunk['base']: # How much smaller/larger would be in sizesim? - sz = entry.size() - diff = sz * (1 - matcher.pctsize) - if not matcher.typeignore: + sz = entry.var_size() + diff = sz * (1 - params.pctsize) + if not params.typeignore: sz *= -1 if entry.var_type() == truvari.SV.DEL else 1 to_add.append((sz - diff, sz + diff, LinkedList(entry))) tree = merge_intervals(to_add) @@ -821,7 +821,7 @@ def tree_size_chunker(matcher, chunks): yield {'base': intv[2].to_list(), '__filtered': []}, chunk_count -def tree_dist_chunker(matcher, chunks): +def tree_dist_chunker(params, chunks): """ To reduce the number of variants in a chunk try to sub-chunk by reference distance before hand Needs to return the same thing as a chunker @@ -838,8 +838,8 @@ def tree_dist_chunker(matcher, chunks): to_add = [] for entry in chunk['base']: st, ed = entry.boundaries() - st -= matcher.refdist - ed += matcher.refdist + st -= params.refdist + ed += params.refdist to_add.append((st, ed, LinkedList(entry))) tree = merge_intervals(to_add) for intv in tree: @@ -858,30 +858,30 @@ def collapse_main(args): sys.stderr.write("Couldn't run Truvari. Please fix parameters\n") sys.exit(100) - matcher = truvari.Matcher(args=args, + params = truvari.VariantParams(args=args, short_circuit=True, skip_gt=True, sizefilt=args.sizemin, chunksize=args.refdist) # Extra attributes needed for collapse - matcher.keep = args.keep - matcher.hap = args.hap - matcher.gt = args.gt - matcher.chain = args.chain - matcher.sorter = SORTS[args.keep] - matcher.no_consolidate = args.no_consolidate - - base = truvari.VariantFile(args.input, matcher=matcher) + params.keep = args.keep + params.hap = args.hap + params.gt = args.gt + params.chain = args.chain + params.sorter = SORTS[args.keep] + params.no_consolidate = args.no_consolidate + + base = truvari.VariantFile(args.input, params=params) regions = truvari.build_region_tree(base, includebed=args.bed) truvari.merge_region_tree_overlaps(regions) base_i = base.regions_fetch(regions) - chunks = truvari.chunker(matcher, ('base', base_i)) - smaller_chunks = tree_size_chunker(matcher, chunks) - even_smaller_chunks = tree_dist_chunker(matcher, smaller_chunks) + chunks = truvari.chunker(params, ('base', base_i)) + smaller_chunks = tree_size_chunker(params, chunks) + even_smaller_chunks = tree_dist_chunker(params, smaller_chunks) - outputs = CollapseOutput(args, matcher) - m_collap = partial(collapse_chunk, matcher=matcher) + outputs = CollapseOutput(args, params) + m_collap = partial(collapse_chunk, params=params) for call in itertools.chain.from_iterable(map(m_collap, even_smaller_chunks)): outputs.write(call, args.median_info) diff --git a/truvari/matching.py b/truvari/matching.py index 6bf510a0..b71b90bf 100644 --- a/truvari/matching.py +++ b/truvari/matching.py @@ -1,7 +1,6 @@ """ Comparison engine """ -import types import logging from collections import Counter, defaultdict from functools import total_ordering @@ -62,87 +61,6 @@ def __repr__(self): return f'' -class Matcher(): - """ - Holds matching parameters. Allows calls to be checked for filtering and matches to be made - - Example - >>> import truvari - >>> mat = truvari.Matcher(pctseq=0) - >>> v = truvari.VariantFile('repo_utils/test_files/variants/input1.vcf.gz') - >>> one = next(v); two = next(v) - >>> one.match(two, matcher=mat) - - - Look at `Matcher.make_match_params()` for a list of all params and their defaults - """ - - def __init__(self, args=None, **kwargs): - """ - Initalize. args is a Namespace from argparse - """ - if args is not None: - params = self.make_match_params_from_args(args) - else: - params = self.make_match_params() - - # Override parameters with those provided in kwargs - for key, value in kwargs.items(): - if hasattr(params, key): - setattr(params, key, value) - else: - raise ValueError(f"Invalid parameter: {key}") - - for key, value in params.__dict__.items(): - setattr(self, key, value) - - @staticmethod - def make_match_params(): - """ - Makes a simple namespace of matching parameters. Holds defaults - """ - params = types.SimpleNamespace() - params.reference = None - params.refdist = 500 - params.pctseq = 0.70 - params.pctsize = 0.70 - params.pctovl = 0.0 - params.typeignore = False - params.no_roll = False - params.chunksize = 1000 - params.bSample = 0 - params.cSample = 0 - params.dup_to_ins = False - params.bnddist = 100 - params.sizemin = 50 - params.sizefilt = 30 - params.sizemax = 50000 - params.passonly = False - params.no_ref = False - params.pick = 'single' - params.ignore_monref = True - params.check_multi = True - params.check_monref = True - params.no_single_bnd = True - params.write_resolved = False - params.short_circuit = False - params.skip_gt = False - return params - - @staticmethod - def make_match_params_from_args(args): - """ - Makes a simple namespace of matching parameters. - Populates defaults from make_match_params, then updates with values from args. - """ - ret = Matcher.make_match_params() - - for key in vars(ret): - if hasattr(args, key): - setattr(ret, key, getattr(args, key)) - - return ret - ############################ # Parsing and set building # ############################ @@ -188,9 +106,9 @@ def file_zipper(*start_files): file_counts) -def chunker(matcher, *files): +def chunker(params, *files): """ - Given a Matcher and multiple files, zip them and create chunks + Given a `truvari.VariantParams` and multiple files, zip them and create chunks Yields tuple of the chunk of calls, and an identifier of the chunk """ @@ -200,7 +118,7 @@ def chunker(matcher, *files): cur_end = 0 cur_chunk = defaultdict(list) unresolved_warned = False - reference = pysam.FastaFile(matcher.reference) if matcher.reference is not None else None + reference = pysam.FastaFile(params.reference) if params.reference is not None else None for key, entry in file_zipper(*files): if entry.filter_call(key == 'base'): @@ -208,13 +126,13 @@ def chunker(matcher, *files): call_counts['__filtered'] += 1 continue - if not entry.is_bnd() and entry.size_filter(key == 'base'): + if not entry.is_bnd() and entry.filter_size(key == 'base'): cur_chunk['__filtered'].append(entry) call_counts['__filtered'] += 1 continue # check symbolic, resolve if needed/possible - if matcher.pctseq != 0 and entry.alts[0].startswith('<'): + if params.pctseq != 0 and entry.alts[0].startswith('<'): was_resolved = entry.resolve(reference) if not was_resolved: if not unresolved_warned: @@ -225,7 +143,7 @@ def chunker(matcher, *files): continue new_chrom = cur_chrom and entry.chrom != cur_chrom - new_chunk = cur_end and cur_end + matcher.chunksize < entry.start + new_chunk = cur_end and cur_end + params.chunksize < entry.start if new_chunk or new_chrom: chunk_count += 1 yield cur_chunk, chunk_count diff --git a/truvari/phab.py b/truvari/phab.py index f732cfa4..142550dc 100644 --- a/truvari/phab.py +++ b/truvari/phab.py @@ -143,7 +143,7 @@ def make_consensus(data, ref_fn, passonly=True, max_size=50000): entry_filter = lambda e: \ e.is_resolved() \ and (not passonly or not e.is_filtered()) \ - and (e.size() <= max_size) + and (e.var_size() <= max_size) # pylint: enable=unnecessary-lambda-assignment cur_key = None diff --git a/truvari/refine.py b/truvari/refine.py index 58608d77..970e0a28 100644 --- a/truvari/refine.py +++ b/truvari/refine.py @@ -420,9 +420,9 @@ def refine_main(cmdargs): # Now run bench on the phab harmonized variants logging.info("Running bench") - matcher = truvari.Matcher(params, no_ref='a', short_circuit=True) + var_params = truvari.VariantParams(params, no_ref='a', short_circuit=True) outdir = os.path.join(args.benchdir, "phab_bench") - m_bench = truvari.Bench(matcher=matcher, base_vcf=phab_vcf, comp_vcf=phab_vcf, outdir=outdir, + m_bench = truvari.Bench(params=var_params, base_vcf=phab_vcf, comp_vcf=phab_vcf, outdir=outdir, includebed=reeval_bed) m_bench.run() diff --git a/truvari/variant_file.py b/truvari/variant_file.py new file mode 100644 index 00000000..a838edcf --- /dev/null +++ b/truvari/variant_file.py @@ -0,0 +1,117 @@ +""" +Truvari class wrapping pysam VariantFile +""" +import pysam + +import truvari +from truvari.region_vcf_iter import region_filter + + +class VariantFile: + """ + Wrapper around pysam.VariantFile with helper functions for iteration. + + .. note:: + The context manager functionality of pysam.VariantFile is not available with truvari.VariantFile. + """ + + def __init__(self, filename, *args, params=None, **kwargs): + """ + Initialize the VariantFile wrapper. + + :param filename: Path to the VCF file to be opened. + :type filename: str + :param params: Parameters to apply to all VariantRecords + :type params: `truvari.VariantParams` + :param args: Additional positional arguments to pass to pysam.VariantFile. + :param kwargs: Additional keyword arguments to pass to pysam.VariantFile. + """ + self.params = params + self._vcf = pysam.VariantFile(filename, *args, **kwargs) + + def __getattr__(self, name): + """ + Delegate attribute access to the original VariantFile. + + :param name: Attribute name to access from the underlying pysam.VariantFile object. + :type name: str + :return: The requested attribute from the underlying pysam.VariantFile. + :rtype: Any + """ + return getattr(self._vcf, name) + + def __iter__(self): + """ + Iterate over the `pysam.VariantFile`, wrapping entries into `truvari.VariantRecord`. + + :return: Iterator of truvari.VariantRecord objects. + :rtype: iterator + """ + for i in self._vcf: + yield truvari.VariantRecord(i, self.params) + + def __next__(self): + """ + Return the next truvari.VariantRecord in the VariantFile. + + :return: Next truvari.VariantRecord. + :rtype: truvari.VariantRecord + """ + return truvari.VariantRecord(next(self._vcf), self.params) + + def fetch(self, *args, **kwargs): + """ + Fetch variants from the `pysam.VariantFile`, wrapping them into `truvari.VariantRecords`. + + :param args: Positional arguments for the pysam.VariantFile.fetch method. + :param kwargs: Keyword arguments for the pysam.VariantFile.fetch method. + :return: Iterator of truvari.VariantRecord objects. + :rtype: iterator + """ + + for i in self._vcf.fetch(*args, **kwargs): + yield truvari.VariantRecord(i, self.params) + + def regions_fetch(self, tree, inside=True, with_region=False): + """ + Fetch variants from the VCF based on regions defined in a tree of chrom:IntervalTree. + + :param tree: Tree of chrom:IntervalTree defining regions of interest. + :type tree: dict + :param inside: If True, fetch variants inside the regions. If False, fetch variants outside the regions. + :type inside: bool + :param with_region: If True, return tuples of (`truvari.VariantRecord`, region). Defaults to False. + :type with_region: bool + :return: Iterator of truvari.VariantRecord objects or tuples of (`truvari.VariantRecord`, region). + :rtype: iterator + """ + return region_filter(self, tree, inside, with_region) + + def bed_fetch(self, bed_fn, inside=True, with_region=False): + """ + Fetch variants from the VCF based on regions defined in a BED file. + + :param bed_fn: Path to the BED file defining regions of interest. + :type bed_fn: str + :param inside: If True, fetch variants inside the regions. If False, fetch variants outside the regions. + :type inside: bool + :param with_region: If True, return tuples of (`truvari.VariantRecord`, region). Defaults to False. + :type with_region: bool + :return: Iterator of truvari.VariantRecord objects or tuples of (`truvari.VariantRecord`, region). + :rtype: iterator + """ + tree = truvari.read_bed_tree(bed_fn) + return self.regions_fetch(tree, inside, with_region) + + def write(self, record): + """ + Write a `truvari.VariantRecord` to the `pysam.VariantFile`. + + :param record: The truvari.VariantRecord to be written. + :type record: `truvari.VariantRecord` + """ + out = record.get_record() + if self.params and self.params.write_resolved: + out.ref = record.get_ref() + out.alts = (record.get_alt(),) + self._vcf.write(out) diff --git a/truvari/variant_params.py b/truvari/variant_params.py new file mode 100644 index 00000000..527f6e8f --- /dev/null +++ b/truvari/variant_params.py @@ -0,0 +1,86 @@ +""" +Truvari main parameters +""" +import types + + +class VariantParams(): + """ + Holds variant parsing and matching parameters. + + Example + >>> import truvari + >>> p = truvari.VariantParams(pctseq=0) + >>> v = truvari.VariantFile('repo_utils/test_files/variants/input1.vcf.gz', params=p) + >>> one = next(v); two = next(v) + >>> one.match(two) + + + Look at `VariantParams.make_params()` for a list of all params and their defaults + """ + + def __init__(self, args=None, **kwargs): + """ + Initalize. args is a Namespace from argparse + """ + if args is not None: + params = self.make_params_from_args(args) + else: + params = self.make_params() + + # Override parameters with those provided in kwargs + for key, value in kwargs.items(): + if hasattr(params, key): + setattr(params, key, value) + else: + raise ValueError(f"Invalid parameter: {key}") + + for key, value in params.__dict__.items(): + setattr(self, key, value) + + @staticmethod + def make_params(): + """ + Makes a simple namespace of matching parameters. Holds defaults + """ + params = types.SimpleNamespace() + params.reference = None + params.refdist = 500 + params.pctseq = 0.70 + params.pctsize = 0.70 + params.pctovl = 0.0 + params.typeignore = False + params.no_roll = False + params.chunksize = 1000 + params.bSample = 0 + params.cSample = 0 + params.dup_to_ins = False + params.bnddist = 100 + params.sizemin = 50 + params.sizefilt = 30 + params.sizemax = 50000 + params.passonly = False + params.no_ref = False + params.pick = 'single' + params.ignore_monref = True + params.check_multi = True + params.check_monref = True + params.no_single_bnd = True + params.write_resolved = False + params.short_circuit = False + params.skip_gt = False + return params + + @staticmethod + def make_params_from_args(args): + """ + Makes a simple namespace of matching parameters. + Populates defaults from make_params, then updates with values from args. + """ + ret = VariantParams.make_params() + + for key in vars(ret): + if hasattr(args, key): + setattr(ret, key, getattr(args, key)) + + return ret diff --git a/truvari/variants.py b/truvari/variant_record.py similarity index 80% rename from truvari/variants.py rename to truvari/variant_record.py index f329383b..c07eeab5 100644 --- a/truvari/variants.py +++ b/truvari/variant_record.py @@ -1,134 +1,22 @@ """ -Truvari functions wrapping pysam VariantFile and VariantRecords +Truvari class wrapping pysam VariantRecord """ import re import hashlib import logging -import pysam import truvari -from truvari.region_vcf_iter import region_filter from truvari.annotations.af_calc import allele_freq_annos RC = str.maketrans("ATCG", "TAGC") -class VariantFile: - """ - Wrapper around pysam.VariantFile with helper functions for iteration. - - .. note:: - The context manager functionality of pysam.VariantFile is not available with truvari.VariantFile. - """ - - def __init__(self, filename, *args, matcher=None, **kwargs): - """ - Initialize the VariantFile wrapper. - - :param filename: Path to the VCF file to be opened. - :type filename: str - :param matcher: Matcher to apply to all VariantRecords - :type matcher: `truvari.Matcher` - :param args: Additional positional arguments to pass to pysam.VariantFile. - :param kwargs: Additional keyword arguments to pass to pysam.VariantFile. - """ - self.matcher = matcher - self._vcf = pysam.VariantFile(filename, *args, **kwargs) - - def __getattr__(self, name): - """ - Delegate attribute access to the original VariantFile. - - :param name: Attribute name to access from the underlying pysam.VariantFile object. - :type name: str - :return: The requested attribute from the underlying pysam.VariantFile. - :rtype: Any - """ - return getattr(self._vcf, name) - - def __iter__(self): - """ - Iterate over the `pysam.VariantFile`, wrapping entries into `truvari.VariantRecord`. - - :return: Iterator of truvari.VariantRecord objects. - :rtype: iterator - """ - for i in self._vcf: - yield VariantRecord(i, self.matcher) - - def __next__(self): - """ - Return the next truvari.VariantRecord in the VariantFile. - - :return: Next truvari.VariantRecord. - :rtype: truvari.VariantRecord - """ - return VariantRecord(next(self._vcf), self.matcher) - - def fetch(self, *args, **kwargs): - """ - Fetch variants from the `pysam.VariantFile`, wrapping them into `truvari.VariantRecords`. - - :param args: Positional arguments for the pysam.VariantFile.fetch method. - :param kwargs: Keyword arguments for the pysam.VariantFile.fetch method. - :return: Iterator of truvari.VariantRecord objects. - :rtype: iterator - """ - - for i in self._vcf.fetch(*args, **kwargs): - yield truvari.VariantRecord(i, self.matcher) - - def regions_fetch(self, tree, inside=True, with_region=False): - """ - Fetch variants from the VCF based on regions defined in a tree of chrom:IntervalTree. - - :param tree: Tree of chrom:IntervalTree defining regions of interest. - :type tree: dict - :param inside: If True, fetch variants inside the regions. If False, fetch variants outside the regions. - :type inside: bool - :param with_region: If True, return tuples of (`truvari.VariantRecord`, region). Defaults to False. - :type with_region: bool - :return: Iterator of truvari.VariantRecord objects or tuples of (`truvari.VariantRecord`, region). - :rtype: iterator - """ - return region_filter(self, tree, inside, with_region) - - def bed_fetch(self, bed_fn, inside=True, with_region=False): - """ - Fetch variants from the VCF based on regions defined in a BED file. - - :param bed_fn: Path to the BED file defining regions of interest. - :type bed_fn: str - :param inside: If True, fetch variants inside the regions. If False, fetch variants outside the regions. - :type inside: bool - :param with_region: If True, return tuples of (`truvari.VariantRecord`, region). Defaults to False. - :type with_region: bool - :return: Iterator of truvari.VariantRecord objects or tuples of (`truvari.VariantRecord`, region). - :rtype: iterator - """ - tree = truvari.read_bed_tree(bed_fn) - return self.regions_fetch(tree, inside, with_region) - - def write(self, record): - """ - Write a `truvari.VariantRecord` to the `pysam.VariantFile`. - - :param record: The truvari.VariantRecord to be written. - :type record: `truvari.VariantRecord` - """ - out = record.get_record() - if self.matcher and self.matcher.write_resolved: - out.ref = record.get_ref() - out.alts = (record.get_alt(),) - self._vcf.write(out) - - class VariantRecord: """ Wrapper around pysam.VariantRecords with helper functions of variant properties and basic comparisons """ - def __init__(self, record, matcher=None): + def __init__(self, record, params=None): """ Initialize with just the internal record """ @@ -136,10 +24,10 @@ def __init__(self, record, matcher=None): self.resolved_ref = None self.resolved_alt = None self.end = record.stop - if matcher is None: - self.matcher = truvari.Matcher() + if params is None: + self.params = truvari.VariantParams() else: - self.matcher = matcher + self.params = params def __getattr__(self, name): """ @@ -242,8 +130,8 @@ def bounds(entry, pos, key): """ Inflate a bnd position based on CIPOS. """ - start = pos - self.matcher.bnddist - end = pos + self.matcher.bnddist + start = pos - self.params.bnddist + end = pos + self.params.bnddist key = 'CI' + key idx = 0 if key == 'POS' else 1 @@ -297,9 +185,9 @@ def bounds(entry, pos, key): self.compare_gts(other, ret) # Score is percent of allowed distance needed to find this match - if self.matcher.bnddist > 0: + if self.params.bnddist > 0: ret.score = max(0, (1 - ((abs(ret.st_dist) + abs(ret.ed_dist)) / 2) - / self.matcher.bnddist) * 100) + / self.params.bnddist) * 100) else: ret.score = int(ret.state) * 100 @@ -320,11 +208,41 @@ def boundaries(self, ins_inflate=False): start = self.start end = self.end if ins_inflate and self.var_type() == truvari.SV.INS: - size = self.size() + size = self.var_size() start -= size // 2 end += size // 2 return start, end + def compare_gts(self, other, match): + """ + Populates genotype-specific comparison details in a `MatchResult`. + + This method compares the genotypes of a "base" (reference) variant and a "comparison" variant and updates + the provided `MatchResult` object with the results. It computes the genotype counts and determines the + difference between the base and comparison genotypes. + + :param other: The other variant entry. + :type other: `truvari.VariantRecord` + :param match: The `MatchResult` object to update with genotype comparison details. + :type match: truvari.MatchResult + + Updates the `MatchResult` object with the following attributes: + - `base_gt`: The genotype of the base sample. + - `base_gt_count`: The count of the reference allele (1) in the base genotype. + - `comp_gt`: The genotype of the comparison sample. + - `comp_gt_count`: The count of the reference allele (1) in the comparison genotype. + - `gt_match`: The absolute difference between `base_gt_count` and `comp_gt_count`. + """ + b_gt = self.gt(self.params.bSample) + c_gt = other.gt(self.params.cSample) + if b_gt: + match.base_gt = b_gt + match.base_gt_count = sum(1 for _ in match.base_gt if _ == 1) + if c_gt: + match.comp_gt = c_gt + match.comp_gt_count = sum(1 for _ in match.comp_gt if _ == 1) + match.gt_match = abs(match.base_gt_count - match.comp_gt_count) + def distance(self, other): """ Calculate the start and end distances of the pair. Negative distances @@ -422,7 +340,6 @@ def is_present(self, sample=0, allow_missing=True): Example >>> import truvari - >>> import pysam >>> v = truvari.VariantFile('repo_utils/test_files/variants/input1.vcf.gz') >>> e = next(v) >>> e.is_present() @@ -452,6 +369,78 @@ def is_resolved(self): ('<' not in self.alts[0]) and (self.alts[0] not in (None, '*')) \ and self.alleles_variant_types[1] != 'BND' + def filter_call(self, base=False): + """ + Determines whether a variant call should be filtered based on Truvari parameters or specific requirements. + + This method evaluates a variant entry (`entry`) and checks if it should be excluded from further processing + based on filtering criteria such as monomorphic reference, multi-allelic records, filtering status, + sample presence, or unsupported single-end BNDs. + + :param entry: The variant entry to evaluate. + :type entry: truvari.VariantRecord + :param base: A flag indicating whether the entry is the "base" (reference) call or the "comparison" call. + Filtering behavior may differ based on this flag. + :type base: bool, optional + + :return: `True` if the variant should be filtered (excluded), otherwise `False`. + :rtype: bool + + :raises ValueError: If the entry is multi-allelic and `check_multi` is enabled in the Truvari parameters. + + Filtering Logic: + - **Monomorphic Reference:** If `check_monref` is enabled and the entry is a monomorphic reference, it is filtered. + - **Multi-Allelic Records:** If `check_multi` is enabled and the entry is multi-allelic, an error is raised. + - **Filtered Variants:** If `passonly` is enabled and the entry is flagged as filtered, it is excluded. + - **Sample Presence:** If `no_ref` is set to include the entry's type (base or comparison) or `pick == 'ac'`, the sample must be present in the entry. + - **Single-End BNDs:** Single-end BNDs are always excluded. + """ + if self.params.check_monref and self.is_monrefstar(): + return True + + if self.params.check_multi and self.is_multi(): + raise ValueError( + f"Cannot compare multi-allelic records. Please split\nline {str(self)}") + + if self.params.passonly and self.is_filtered(): + return True + + prefix = 'b' if base else 'c' + if self.params.no_ref in ["a", prefix] or self.params.pick == 'ac': + samp = self.params.bSample if base else self.params.cSample + if not self.is_present(samp): + return True + + if self.params.no_single_bnd and self.is_single_bnd(): + return True + return False + + def filter_size(self, base=False): + """ + Determines whether a variant entry should be filtered based on its size. + + This method evaluates the size of a variant and checks if it falls outside the specified size thresholds. + Filtering criteria depend on whether the entry is a "base" (reference) or "comparison" call. + + :param entry: The variant entry to evaluate. + :type entry: truvari.VariantRecord + :param base: A flag indicating whether the entry is the "base" (reference) call. If `True`, the `sizemin` + parameter is used as the minimum size threshold. Otherwise, the `sizefilt` parameter is used. + :type base: bool, optional + + :return: `True` if the variant should be filtered due to its size, otherwise `False`. + :rtype: bool + + Filtering Logic: + - **Maximum Size:** If the size exceeds `sizemax`, the variant is filtered. + - **Minimum Size (Base):** If `base=True` and the size is less than `sizemin`, the variant is filtered. + - **Minimum Size (Comparison):** If `base=False` and the size is less than `sizefilt`, the variant is filtered. + """ + size = self.var_size() + return (size > self.params.sizemax) \ + or (base and size < self.params.sizemin) \ + or (not base and size < self.params.sizefilt) + def get_record(self): """ Return internal pysam.VariantRecord @@ -464,13 +453,13 @@ def match(self, other): If self and other are non-bnd, calls VariantRecord.var_match, If self and other are bnd, calls VariantRecord.bnd_match Otherwise, raises a TypeError. - If no matcher is provided, builds one from defaults """ if not self.is_bnd() and not other.is_bnd(): return self.var_match(other) if self.is_bnd() and other.is_bnd(): return self.bnd_match(other) - raise TypeError("Incompatible Variants (BND and !BND) can't be matched") + raise TypeError( + "Incompatible Variants (BND and !BND) can't be matched") def move_record(self, out_vcf, sample=None): """ @@ -498,7 +487,6 @@ def recovl(self, other, ins_inflate=True): Example >>> import truvari - >>> import pysam >>> v = truvari.VariantFile('repo_utils/test_files/variants/input1.vcf.gz') >>> a = next(v) >>> b = next(v) @@ -511,7 +499,32 @@ def recovl(self, other, ins_inflate=True): def resolve(self, ref): """ - Attempts to resolve an SV's REF/ALT sequences. Stores in self.resolved_ref, self.resolved_alt, and self.end + Attempts to resolve the REF/ALT sequences of a structural variant (SV) and updates the instance attributes. + + This method tries to reconstruct the REF and ALT sequences for specific types of structural variants + (e.g., deletions, inversions, duplications) based on the provided reference genome sequence. If successful, + it stores the resolved sequences in `self.resolved_ref` and `self.resolved_alt`, and may adjust `self.end`. + + :param ref: The reference genome used to resolve the structural variant. + Typically a `pysam.FastaFile` or equivalent object providing access to reference sequences. + :type ref: pysam.FastaFile + + :return: `True` if the REF/ALT sequences are successfully resolved, otherwise `False`. + :rtype: bool + + Updates: + - `self.resolved_ref`: The resolved REF sequence. + - `self.resolved_alt`: The resolved ALT sequence. + - `self.end`: May be adjusted for specific variant types (e.g., duplications). + + Resolution Logic: + - If `ref` is `None`, the ALT is `` or ``, or the start position is invalid, the method returns `False`. + - The method fetches the sequence from the reference genome using the variant's `chrom`, `start`, and `end`. + - The resolution depends on the variant type: + - **Deletion (`DEL`)**: The REF sequence is the fetched reference sequence, and the ALT is the first base. + - **Inversion (`INV`)**: The REF sequence is the fetched reference sequence, and the ALT is the reverse complement of the REF. + - **Duplication (`DUP`)** (when `self.params.dup_to_ins` is enabled): The REF is the first base of the fetched sequence, and the ALT is the full sequence. The `end` is adjusted to be `start + 1`. + - Returns `False` if the variant type is unsupported or cannot be resolved. """ if ref is None or self.alts[0] in ['', ''] or self.start > ref.get_reference_length(self.chrom): return False @@ -524,7 +537,7 @@ def resolve(self, ref): elif svtype == truvari.SV.INV: self.resolved_ref = seq self.resolved_alt = seq.translate(RC)[::-1] - elif svtype == truvari.SV.DUP and self.matcher.dup_to_ins: + elif svtype == truvari.SV.DUP and self.params.dup_to_ins: self.resolved_ref = seq[0] self.resolved_alt = seq self.end = self.start + 1 @@ -596,43 +609,6 @@ def seqsim(self, other, roll=True): # Whichever is highest is how similar these sequences can be return truvari.best_seqsim(a_seq, b_seq, st_dist) - def size(self): - """ - Determine the size of the variant. - - .. note:: How size is determined - - - Starts by trying to use INFO/SVLEN - - If SVLEN is unavailable and ALT field is an SV (e.g. , , etc), \ - use abs(vcf.start - vcf.end). The INFO/END tag needs to be available, \ - especially for INS. - - If len of vcf.REF and vcf.ALT[0] are equal, return len of vcf.REF - - Otherwise, return the size difference of the sequence resolved call using \ - abs(len(vcf.REF) - len(str(vcf.ALT[0]))) - - :return: the entry's size - :rtype: int - """ - if "SVLEN" in self.info: - if type(self.info["SVLEN"]) in [list, tuple]: - size = abs(self.info["SVLEN"][0]) - else: - size = abs(self.info["SVLEN"]) - elif self.alts is not None and self.alts[0].count("<"): - start, end = self.boundaries() - size = end - start - else: - r_len = len(self.ref) - a_len = len(self.alts[0]) if self.alts is not None else 0 - if r_len == a_len: - if r_len == 1: - size = 0 # SNPs are special - else: - size = r_len - else: - size = abs(r_len - a_len) - return size - def sizesim(self, other): """ Calculate the size similarity and difference for two entries @@ -645,14 +621,13 @@ def sizesim(self, other): Example >>> import truvari - >>> import pysam >>> v = truvari.VariantFile('repo_utils/test_files/variants/input1.vcf.gz') >>> a = next(v) >>> b = next(v) >>> a.sizesim(b) (0.07142857142857142, 13) """ - return truvari.sizesim(self.size(), other.size()) + return truvari.sizesim(self.var_size(), other.var_size()) def var_match(self, other): """ @@ -664,48 +639,48 @@ def var_match(self, other): ret.state = True - if not self.matcher.typeignore and not self.same_type(other): + if not self.params.typeignore and not self.same_type(other): logging.debug("%s and %s are not the same SVTYPE", str(self), str(other)) ret.state = False - if self.matcher.short_circuit: + if self.params.short_circuit: return ret bstart, bend = self.boundaries() cstart, cend = other.boundaries() - if not truvari.overlaps(bstart - self.matcher.refdist, bend + self.matcher.refdist, cstart, cend): + if not truvari.overlaps(bstart - self.params.refdist, bend + self.params.refdist, cstart, cend): logging.debug("%s and %s are not within REFDIST", str(self), str(other)) ret.state = False - if self.matcher.short_circuit: + if self.params.short_circuit: return ret ret.sizesim, ret.sizediff = self.sizesim(other) - if ret.sizesim < self.matcher.pctsize: + if ret.sizesim < self.params.pctsize: logging.debug("%s and %s size similarity is too low (%.3f)", str(self), str(other), ret.sizesim) ret.state = False - if self.matcher.short_circuit: + if self.params.short_circuit: return ret - if not self.matcher.skip_gt: + if not self.params.skip_gt: self.compare_gts(other, ret) ret.ovlpct = self.recovl(other) - if ret.ovlpct < self.matcher.pctovl: + if ret.ovlpct < self.params.pctovl: logging.debug("%s and %s overlap percent is too low (%.3f)", str(self), str(other), ret.ovlpct) ret.state = False - if self.matcher.short_circuit: + if self.params.short_circuit: return ret - if self.matcher.pctseq > 0: + if self.params.pctseq > 0: ret.seqsim = self.seqsim(other) - if ret.seqsim < self.matcher.pctseq: + if ret.seqsim < self.params.pctseq: logging.debug("%s and %s sequence similarity is too low (%.3ff)", str(self), str(other), ret.seqsim) ret.state = False - if self.matcher.short_circuit: + if self.params.short_circuit: return ret else: ret.seqsim = 0 @@ -715,6 +690,43 @@ def var_match(self, other): return ret + def var_size(self): + """ + Determine the size of the variant. + + .. note:: How size is determined + + - Starts by trying to use INFO/SVLEN + - If SVLEN is unavailable and ALT field is an SV (e.g. , , etc), \ + use abs(vcf.start - vcf.end). The INFO/END tag needs to be available, \ + especially for INS. + - If len of vcf.REF and vcf.ALT[0] are equal, return len of vcf.REF + - Otherwise, return the size difference of the sequence resolved call using \ + abs(len(vcf.REF) - len(str(vcf.ALT[0]))) + + :return: the entry's size + :rtype: int + """ + if "SVLEN" in self.info: + if type(self.info["SVLEN"]) in [list, tuple]: + size = abs(self.info["SVLEN"][0]) + else: + size = abs(self.info["SVLEN"]) + elif self.alts is not None and self.alts[0].count("<"): + start, end = self.boundaries() + size = end - start + else: + r_len = len(self.ref) + a_len = len(self.alts[0]) if self.alts is not None else 0 + if r_len == a_len: + if r_len == 1: + size = 0 # SNPs are special + else: + size = r_len + else: + size = abs(r_len - a_len) + return size + def var_type(self): """ Return entry's SVTYPE @@ -800,7 +812,7 @@ def to_key(self, prefix="", bounds=False): def within_tree(self, tree): """ - Extract entry and tree boundaries to call `coords_within` + Extract entry and tree boundaries to call `truvari.coords_within` """ qstart, qend = self.boundaries() m_ovl = tree[self.chrom].overlap(qstart, qend) @@ -813,110 +825,8 @@ def within_tree(self, tree): def within(self, rstart, rend): """ - Extract entry boundaries and type to call `coords_within` + Extract entry boundaries and type to call `truvari.coords_within` """ qstart, qend = self.boundaries() end_within = self.var_type() != truvari.SV.INS return truvari.coords_within(qstart, qend, rstart, rend, end_within) - - def filter_call(self, base=False): - """ - Determines whether a variant call should be filtered based on Truvari parameters or specific requirements. - - This method evaluates a variant entry (`entry`) and checks if it should be excluded from further processing - based on filtering criteria such as monomorphic reference, multi-allelic records, filtering status, - sample presence, or unsupported single-end BNDs. - - :param entry: The variant entry to evaluate. - :type entry: truvari.VariantRecord - :param base: A flag indicating whether the entry is the "base" (reference) call or the "comparison" call. - Filtering behavior may differ based on this flag. - :type base: bool, optional - - :return: `True` if the variant should be filtered (excluded), otherwise `False`. - :rtype: bool - - :raises ValueError: If the entry is multi-allelic and `check_multi` is enabled in the Truvari parameters. - - Filtering Logic: - - **Monomorphic Reference:** If `check_monref` is enabled and the entry is a monomorphic reference, it is filtered. - - **Multi-Allelic Records:** If `check_multi` is enabled and the entry is multi-allelic, an error is raised. - - **Filtered Variants:** If `passonly` is enabled and the entry is flagged as filtered, it is excluded. - - **Sample Presence:** If `no_ref` is set to include the entry's type (base or comparison) or `pick == 'ac'`, the sample must be present in the entry. - - **Single-End BNDs:** Single-end BNDs are always excluded. - """ - if self.matcher.check_monref and self.is_monrefstar(): - return True - - if self.matcher.check_multi and self.is_multi(): - raise ValueError( - f"Cannot compare multi-allelic records. Please split\nline {str(self)}") - - if self.matcher.passonly and self.is_filtered(): - return True - - prefix = 'b' if base else 'c' - if self.matcher.no_ref in ["a", prefix] or self.matcher.pick == 'ac': - samp = self.matcher.bSample if base else self.matcher.cSample - if not self.is_present(samp): - return True - - if self.matcher.no_single_bnd and self.is_single_bnd(): - return True - return False - - def size_filter(self, base=False): - """ - Determines whether a variant entry should be filtered based on its size. - - This method evaluates the size of a variant and checks if it falls outside the specified size thresholds. - Filtering criteria depend on whether the entry is a "base" (reference) or "comparison" call. - - :param entry: The variant entry to evaluate. - :type entry: truvari.VariantRecord - :param base: A flag indicating whether the entry is the "base" (reference) call. If `True`, the `sizemin` - parameter is used as the minimum size threshold. Otherwise, the `sizefilt` parameter is used. - :type base: bool, optional - - :return: `True` if the variant should be filtered due to its size, otherwise `False`. - :rtype: bool - - Filtering Logic: - - **Maximum Size:** If the size exceeds `sizemax`, the variant is filtered. - - **Minimum Size (Base):** If `base=True` and the size is less than `sizemin`, the variant is filtered. - - **Minimum Size (Comparison):** If `base=False` and the size is less than `sizefilt`, the variant is filtered. - """ - size = self.size() - return (size > self.matcher.sizemax) \ - or (base and size < self.matcher.sizemin) \ - or (not base and size < self.matcher.sizefilt) - - def compare_gts(self, other, match): - """ - Populates genotype-specific comparison details in a `MatchResult`. - - This method compares the genotypes of a "base" (reference) variant and a "comparison" variant and updates - the provided `MatchResult` object with the results. It computes the genotype counts and determines the - difference between the base and comparison genotypes. - - :param other: The other variant entry. - :type other: `truvari.VariantRecord` - :param match: The `MatchResult` object to update with genotype comparison details. - :type match: truvari.MatchResult - - Updates the `MatchResult` object with the following attributes: - - `base_gt`: The genotype of the base sample. - - `base_gt_count`: The count of the reference allele (1) in the base genotype. - - `comp_gt`: The genotype of the comparison sample. - - `comp_gt_count`: The count of the reference allele (1) in the comparison genotype. - - `gt_match`: The absolute difference between `base_gt_count` and `comp_gt_count`. - """ - b_gt = self.gt(self.matcher.bSample) - c_gt = other.gt(self.matcher.cSample) - if b_gt: - match.base_gt = b_gt - match.base_gt_count = sum(1 for _ in match.base_gt if _ == 1) - if c_gt: - match.comp_gt = c_gt - match.comp_gt_count = sum(1 for _ in match.comp_gt if _ == 1) - match.gt_match = abs(match.base_gt_count - match.comp_gt_count) diff --git a/truvari/vcf2df.py b/truvari/vcf2df.py index 9f13ce6b..81d7e869 100644 --- a/truvari/vcf2df.py +++ b/truvari/vcf2df.py @@ -314,7 +314,7 @@ def _transform(): Yields the rows """ for entry in v: - varsize = entry.size() + varsize = entry.var_size() cur_row = [entry.to_hash(), entry.chrom, entry.start,