Skip to content

Commit

Permalink
new anno chunks command
Browse files Browse the repository at this point in the history
  • Loading branch information
ACEnglish committed Sep 7, 2024
1 parent ee167cc commit 82330d4
Show file tree
Hide file tree
Showing 6 changed files with 179 additions and 10 deletions.
11 changes: 11 additions & 0 deletions repo_utils/answer_key/anno/chunks.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
chr20 149012 149095 3 3 3
chr20 278929 279098 7 7 7
chr20 280210 280275 1 1 1
chr20 306267 306268 2 2 2
chr20 380877 380878 1 1 1
chr20 420664 420665 2 2 2
chr20 613782 613837 1 1 1
chr20 641905 642391 11 11 11
chr20 709758 709852 2 2 2
chr20 764441 764537 3 3 3
chr20 949515 949619 1 1 1
10 changes: 9 additions & 1 deletion repo_utils/sub_tests/anno.sh
Original file line number Diff line number Diff line change
Expand Up @@ -171,10 +171,18 @@ if [ $test_anno_grpaf_subset ]; then
assert_equal $(fn_md5 $ANSDIR/anno/anno_grpaf.subtags.vcf) $(fn_md5 $OD/anno_grpaf.subtags.vcf)
fi

#
# anno addid: assign IDs to variants, compare output md5 against the answer key
run test_anno_addid \
    $truv anno addid $INDIR/variants/multi.vcf.gz -o $OD/addid.vcf
if [ $test_anno_addid ]; then
    assert_exit_code 0
    assert_equal $(fn_md5 $ANSDIR/anno/addid.vcf) $(fn_md5 $OD/addid.vcf)
fi


# anno chunks: write chunk-boundary BED, compare output md5 against the answer key
run test_anno_chunks $truv anno chunks $INDIR/variants/multi.vcf.gz -o $OD/chunks.bed
if [ $test_anno_chunks ]; then
    assert_exit_code 0
    assert_equal $(fn_md5 $ANSDIR/anno/chunks.bed) $(fn_md5 $OD/chunks.bed)
fi
1 change: 1 addition & 0 deletions truvari/anno.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ANNOS = {
"addid": ("Set ID field", tannos.addid_main),
"bpovl": ("Annotation Intersection", tannos.bpovl_main),
"chunks": ("Chunk Boundaries and Variant Counts", tannos.chunks_main),
"density": ("Variant Density", tannos.density_main),
"dpcnt": ("Call Depth Counts", tannos.dpcnt_main),
"gcpct": ("GC Percent", tannos.gcpct_main),
Expand Down
2 changes: 2 additions & 0 deletions truvari/annotations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
""" annotation modules """
from truvari.annotations.addid import addid_main
from truvari.annotations.bpovl import bpovl_main
from truvari.annotations.chunks import chunks_main
from truvari.annotations.density import density_main
from truvari.annotations.dpcnt import dpcnt_main
from truvari.annotations.gccontent import gcpct_main
Expand All @@ -18,6 +19,7 @@
__all__ = [
'addid_main',
'bpovl_main',
'chunks_main',
'density_main',
'dpcnt_main',
'gcpct_main',
Expand Down
77 changes: 77 additions & 0 deletions truvari/annotations/chunks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Count the number of variants in each chunk
Column 3: total number of variants
Column 4: comma-delimited number of sub-chunks after accounting for size
Column 5: comma-delimited number of sub-chunks after accounting for size and then distance
"""
import sys
import argparse
import pysam

import truvari
from truvari.collapse import tree_size_chunker, tree_dist_chunker

def parse_args(args):
    """
    Parse command line arguments for `anno chunks`.

    :param args: list of argument strings (e.g. sys.argv[1:])
    :return: argparse.Namespace with input, output, chunksize, sizemin, sizemax
    """
    parser = argparse.ArgumentParser(prog="chunks", description=__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("input", type=str,
                        help="Input VCF")
    parser.add_argument("-o", "--output", type=str, default="/dev/stdout",
                        help="Output name")
    parser.add_argument("-c", "--chunksize", type=int, default=500,
                        help="Distance between variants to split chunks (%(default)s)")
    # Show defaults for size bounds, consistent with --chunksize above
    parser.add_argument("-s", "--sizemin", type=int, default=50,
                        help="Minimum SV length (%(default)s)")
    parser.add_argument("-S", "--sizemax", type=int, default=50000,
                        help="Maximum SV length (%(default)s)")
    args = parser.parse_args(args)
    truvari.setup_logging(show_version=True)
    return args

def get_bounds(cnk):
    """
    Min start and max end of variants

    :param cnk: iterable of variant records exposing .start and .stop
    :return: (lowest start, highest stop) tuple
    """
    lo, hi = sys.maxsize, 0
    for variant in cnk:
        if variant.start < lo:
            lo = variant.start
        if variant.stop > hi:
            hi = variant.stop
    return lo, hi

def chunks_main(args):
    """
    Main entry point for `anno chunks`.

    Writes one BED-like line per chunk produced by truvari.chunker:
    chrom, min start, max end, total variant count, then comma-delimited
    counts of sub-chunks after size-based re-chunking and after the
    subsequent distance-based re-chunking.
    """
    args = parse_args(args)
    v = pysam.VariantFile(args.input)
    m = truvari.Matcher()
    m.params.pctseq = 0  # sequence comparison not needed for counting
    m.params.sizemin = args.sizemin
    m.params.sizefilt = args.sizemin
    m.params.sizemax = args.sizemax
    m.params.chunksize = args.chunksize
    m.params.refdist = args.chunksize
    # Single VCF; every entry is tagged 'base'
    c = truvari.chunker(m, ('base', v))

    with open(args.output, 'w') as fout:
        for chunk, _ in c:
            # Chunks holding only filtered entries have no 'base' records
            if not chunk['base']:
                continue
            s, e = get_bounds(chunk['base'])
            chrom = chunk['base'][0].chrom
            num = len(chunk['base'])
            fout.write(f"{chrom}\t{s}\t{e}\t{num}")
            s_cnts = []
            d_cnts = []
            # Re-chunk by size, then re-chunk each size sub-chunk by distance
            for i, _ in tree_size_chunker(m, [(chunk, 0)]):
                if i['base']:
                    s_cnts.append(len(i['base']))
                    for j, _ in tree_dist_chunker(m, [(i, 0)]):
                        # NOTE(review): no `if j['base']` guard here, unlike the
                        # size loop above — tree_dist_chunker yields an
                        # empty-base placeholder for chunks of >= 100 entries,
                        # which would record a 0 count. Confirm intended.
                        d_cnts.append(len(j['base']))
            s_cnts = ",".join(map(str, s_cnts))
            d_cnts = ",".join(map(str, d_cnts))
            fout.write(f"\t{s_cnts}\t{d_cnts}\n")
88 changes: 79 additions & 9 deletions truvari/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

import pysam
import numpy as np
from intervaltree import IntervalTree

import truvari
import truvari.bench as trubench
Expand Down Expand Up @@ -721,6 +720,76 @@ def dump_log(self):
logging.info("%d samples' FORMAT fields consolidated",
self["stats_box"]["consol_cnt"])

class LinkedList:
    """
    Simple singly linked list which should(?) be faster than concatenating a
    bunch of regular lists — concatenation is O(1) instead of copying.

    Nodes are two-slot mutable lists: [data, next_node].
    """

    def __init__(self, data=None):
        """
        Initialize, optionally seeding the list with a single item
        """
        self.head = None  # first node, or None when empty
        self.tail = None  # last node, kept for O(1) append/concatenate
        if data is not None:
            self.append(data)

    def append(self, data):
        """
        Put data onto end of list
        """
        # Node must be a mutable list — a tuple would raise TypeError
        # on the tail re-link below
        new_node = [data, None]
        if not self.head:
            self.head = new_node
            self.tail = new_node
            return
        self.tail[1] = new_node
        self.tail = new_node

    def to_list(self):
        """
        Turn into a regular list
        """
        cur_node = self.head
        ret = []
        while cur_node:
            ret.append(cur_node[0])
            cur_node = cur_node[1]
        return ret

    def concatenate(self, other):
        """
        Combine two linked lists in O(1).

        Returns the combined list; note this is `other` (not self) when
        self is empty, so callers must use the return value.
        """
        if not self.head:  # If the first list is empty
            return other
        self.tail[1] = other.head
        self.tail = other.tail
        return self

def merge_intervals(intervals):
    """
    Merge a list of (start, end, data) tuples pre-sorted by start position.

    Overlapping intervals are combined and their data objects joined via
    data.concatenate(). Touching intervals (next_start == current_end) are
    NOT merged, replicating intervaltree's merge_overlaps behavior.

    :param intervals: sorted list of (start, end, data) tuples
    :return: list of merged (start, end, data) tuples
    """
    if not intervals:
        return []
    merged = []

    current_start, current_end, current_data = intervals[0]
    for next_start, next_end, next_data in intervals[1:]:
        # Should be <=, maybe. but this replicates intervaltree
        if next_start < current_end:
            current_end = max(current_end, next_end)
            # Keep concatenate's return value — it returns `other` (not
            # self) when current_data is empty, which was previously lost
            current_data = current_data.concatenate(next_data)
        else:
            # No overlap
            merged.append((current_start, current_end, current_data))
            current_start, current_end, current_data = next_start, next_end, next_data
    merged.append((current_start, current_end, current_data))
    return merged

def tree_size_chunker(matcher, chunks):
"""
Expand All @@ -733,8 +802,8 @@ def tree_size_chunker(matcher, chunks):
chunk_count += 1
yield chunk, chunk_count
continue
tree = IntervalTree()
yield {'__filtered': chunk['__filtered'], 'base': []}, chunk_count
to_add = []
for entry in chunk['base']:
# How much smaller/larger would be in sizesim?
sz = truvari.entry_size(entry)
Expand All @@ -743,35 +812,36 @@ def tree_size_chunker(matcher, chunks):
sz *= - \
1 if truvari.entry_variant_type(
entry) == truvari.SV.DEL else 1
tree.addi(sz - diff, sz + diff, data=[entry])
tree.merge_overlaps(data_reducer=lambda x, y: x + y)
to_add.append((sz - diff, sz + diff, LinkedList(entry)))
tree = merge_intervals(to_add)
for intv in tree:
chunk_count += 1
yield {'base': intv.data, '__filtered': []}, chunk_count
yield {'base': intv[2].to_list(), '__filtered': []}, chunk_count


def tree_dist_chunker(matcher, chunks):
    """
    To reduce the number of variants in a chunk, try to sub-chunk by
    reference distance beforehand.

    Needs to return the same thing as a chunker: (chunk_dict, chunk_id)
    pairs. Chunks with fewer than 100 base entries pass through unchanged;
    larger ones are split wherever neighboring entries lie farther than
    matcher.params.refdist apart.
    """
    chunk_count = 0
    for chunk, _ in chunks:
        if len(chunk['base']) < 100:  # fewer than 100 is fine
            chunk_count += 1
            yield chunk, chunk_count
            continue
        # Emit the filtered entries once, with an empty base placeholder
        yield {'__filtered': chunk['__filtered'], 'base': []}, chunk_count
        to_add = []
        for entry in chunk['base']:
            st, ed = truvari.entry_boundaries(entry)
            # Pad both ends by refdist so nearby entries' intervals overlap
            st -= matcher.params.refdist
            ed += matcher.params.refdist
            to_add.append((st, ed, LinkedList(entry)))
        # merge_intervals expects intervals sorted by start; entries come
        # coordinate-sorted from the VCF — TODO confirm this always holds
        for intv in merge_intervals(to_add):
            chunk_count += 1
            yield {'base': intv[2].to_list(), '__filtered': []}, chunk_count


def collapse_main(args):
Expand Down

0 comments on commit 82330d4

Please sign in to comment.