Move regenotyping tasks from MergeCohortVcfs to Module04b (#83)
epiercehoffman authored Nov 19, 2020
1 parent 0482a34 commit 7aad694
Showing 4 changed files with 331 additions and 303 deletions.
test/module04/MergeCohortVcfs.test.json: 4 changes (1 addition, 3 deletions)
@@ -2,7 +2,5 @@
"MergeCohortVcfs.sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:mw-gnomad-02-6a66c96",
"MergeCohortVcfs.pesr_vcfs": ["gs://gatk-sv-resources/test/module03/large/output/test_large.filtered_pesr_merged.vcf.gz"],
"MergeCohortVcfs.depth_vcfs": ["gs://gatk-sv-resources/test/module03/large/output/test_large.depth.outliers_removed.vcf.gz"],
"MergeCohortVcfs.cohort": "test_large",
"MergeCohortVcfs.sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:mw-gnomad-02-6a66c96",
"MergeCohortVcfs.contig_list": "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs.list"
"MergeCohortVcfs.cohort": "test_large"
}
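
For reference, after this change the full test input file reduces to the following (reconstructed from the kept and added lines above, assuming only the opening brace precedes the hunk):

{
  "MergeCohortVcfs.sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:mw-gnomad-02-6a66c96",
  "MergeCohortVcfs.pesr_vcfs": ["gs://gatk-sv-resources/test/module03/large/output/test_large.filtered_pesr_merged.vcf.gz"],
  "MergeCohortVcfs.depth_vcfs": ["gs://gatk-sv-resources/test/module03/large/output/test_large.depth.outliers_removed.vcf.gz"],
  "MergeCohortVcfs.cohort": "test_large"
}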
test/module04b/Module04b.test.json: 5 changes (2 additions, 3 deletions)
@@ -6,9 +6,8 @@
"Module04b.n_RdTest_bins": "100000",
"Module04b.n_per_split": "5000",

"Module04b.regeno_sample_ids_lookup": "gs://talkowskicromwellresults/M04/MergeCohortVcfs/190da1da-75b8-485e-82fc-b2c5ff07a8e6/call-MergeDepthVcfs/cluster.combined.bed",
"Module04b.regeno_sample_counts_lookup": "gs://talkowskicromwellresults/M04/MergeCohortVcfs/190da1da-75b8-485e-82fc-b2c5ff07a8e6/call-MergeDepthVcfs/master_cluster_dups.bed",
"Module04b.regeno_raw_combined_depth": "gs://talkowskicromwellresults/M04/MergeCohortVcfs/190da1da-75b8-485e-82fc-b2c5ff07a8e6/call-MergeDepthVcfs/cohort.sort.bed",
"Module04b.cohort": "test",
"Module04b.contig_list": "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs.list",
"Module04b.regeno_coverage_medians": [
"gs://broad-dsde-methods-eph/Module04StressTest/Module04/6d90e729-0cd1-4752-94b3-e0ad128c6dc4/call-GenotypeDepthPart2/GenotypeDepthPart2/2b962a21-897e-49ac-8055-118ba459f7e4/call-RDTestGenotypeOver5kb/shard-0/cacheCopy/gt5kb.aaaaaa.bed.median_geno",
"gs://broad-dsde-methods-eph/Module04StressTest/Module04/6d90e729-0cd1-4752-94b3-e0ad128c6dc4/call-GenotypeDepthPart2/GenotypeDepthPart2/2b962a21-897e-49ac-8055-118ba459f7e4/call-RDTestGenotypeOver5kb/shard-1/cacheCopy/gt5kb.aaaaab.bed.median_geno"
wdl/MergeCohortVcfs.wdl: 281 changes (0 additions, 281 deletions)
@@ -15,20 +15,11 @@ workflow MergeCohortVcfs {
Array[File] depth_vcfs # Filtered depth VCFs across batches
Array[File] pesr_vcfs # Filtered PESR VCFs across batches
String cohort # Cohort name or project prefix for all cohort-level outputs
File contig_list
String sv_pipeline_docker
String sv_base_mini_docker
RuntimeAttr? runtime_attr_merge_pesr
RuntimeAttr? runtime_attr_merge_depth
RuntimeAttr? runtime_attr_regeno_raw_combined_depth
RuntimeAttr? runtime_attr_regeno_merged_depth
RuntimeAttr? runtime_attr_sample_lookup
RuntimeAttr? runtime_attr_concat_samplecountlookup
RuntimeAttr? runtime_attr_concat_sampleidlookup
}
Array[Array[String]] contigs = read_tsv(contig_list)
call MergeVcfs as MergePESRVcfs {
input:
vcfs = pesr_vcfs,
@@ -46,57 +37,9 @@
runtime_attr_override = runtime_attr_merge_depth
}
call MakeRawCombinedBed {
input:
vcfs = depth_vcfs,
cohort = cohort,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_regeno_raw_combined_depth
}
call MakeMergedDepthBeds {
input:
regeno_raw_combined_depth = MakeRawCombinedBed.regeno_raw_combined_depth,
cohort = cohort,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_regeno_merged_depth
}
scatter (contig in contigs) {
call MakeSampleLookupBeds {
input:
regeno_merged_depth = MakeMergedDepthBeds.regeno_merged_depth,
regeno_merged_depth_clustered = MergeDepthVcfs.regeno_merged_depth_clustered,
cohort = cohort,
contig = contig[0],
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_sample_lookup
}
}
call ConcatBed as ConcatSampleCountLookupBed {
input:
bed_shards = MakeSampleLookupBeds.regeno_sample_counts_lookup,
filename = cohort + ".regeno.sample_counts_lookup.bed",
sv_base_mini_docker = sv_base_mini_docker,
runtime_attr_override = runtime_attr_concat_samplecountlookup
}
call ConcatBed as ConcatSampleIdLookupBed {
input:
bed_shards = MakeSampleLookupBeds.regeno_sample_ids_lookup,
filename = cohort + ".regeno.sample_ids_lookup.bed",
sv_base_mini_docker = sv_base_mini_docker,
runtime_attr_override = runtime_attr_concat_sampleidlookup
}
output {
File cohort_pesr_vcf = MergePESRVcfs.merged_vcf
File cohort_depth_vcf = MergeDepthVcfs.merged_vcf
File regeno_merged_depth = MakeMergedDepthBeds.regeno_merged_depth
File regeno_raw_combined_depth = MakeRawCombinedBed.regeno_raw_combined_depth
File regeno_sample_counts_lookup = ConcatSampleCountLookupBed.concat_bed
File regeno_sample_ids_lookup = ConcatSampleIdLookupBed.concat_bed
}
}

@@ -161,120 +104,11 @@ task MergeDepthVcfs {
output {
File merged_vcf = "~{prefix}.vcf.gz"
File regeno_merged_depth_clustered = "~{cohort}.regeno.merged_depth_clustered.bed"
}
command <<<

set -euxo pipefail
/opt/sv-pipeline/04_variant_resolution/scripts/merge_vcfs.py ~{write_lines(vcfs)} ~{prefix}.vcf
rm ~{sep=' ' vcfs}
vcf-sort -c ~{prefix}.vcf | bgzip -c > ~{prefix}.vcf.gz
svtk vcf2bed ~{prefix}.vcf.gz ~{prefix}.vcf.gz.bed # convert the merged VCF to BED (records are non-duplicated)
# split DELs and DUPs into separate, non-duplicated BED files. SVTYPE is 5th column of BED
awk -F "\t" -v OFS="\t" '{ if ($5 == "DEL") { print > "del.bed" } else if ($5 == "DUP") { print > "dup.bed" } }' ~{prefix}.vcf.gz.bed
svtk bedcluster del.bed | cut -f1-7 | awk '{print $0","}' > del.cluster.bed # cluster non-duplicated DELs
svtk bedcluster dup.bed | cut -f1-7 | awk '{print $0","}' > dup.cluster.bed # cluster non-duplicated DUPs
cat del.cluster.bed dup.cluster.bed | sort -k1,1V -k2,2n -k3,3n | fgrep -v "#" > ~{cohort}.regeno.merged_depth_clustered.bed # combine the clustered, non-duplicated DELs and DUPs
>>>
runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_pipeline_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
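
For illustration only, the SVTYPE routing above can be exercised on toy input. The two rows and sample names below are hypothetical; as noted in the comment, SVTYPE is the 5th BED column.

# Toy demonstration of the DEL/DUP split used in MergeDepthVcfs (hypothetical rows).
printf 'chr1\t100\t200\tvar1\tDEL\tsampleA\nchr1\t300\t400\tvar2\tDUP\tsampleB\n' \
  | awk -F "\t" -v OFS="\t" '{ if ($5 == "DEL") { print > "del.bed" } else if ($5 == "DUP") { print > "dup.bed" } }'
# Result: del.bed holds the var1 row and dup.bed holds the var2 row.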
task MakeRawCombinedBed {
input {
Array[File] vcfs
String cohort
String sv_pipeline_docker
RuntimeAttr? runtime_attr_override
}
RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
disk_gb: 10,
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
output {
File regeno_raw_combined_depth = "~{cohort}.regeno.raw_combined_depth.bed"
}
command <<<

set -euxo pipefail
while read vcf; do
local_vcf=$(basename $vcf)
svtk vcf2bed --no-header $vcf $local_vcf.bed # convert each depth VCF to BED (may contain duplicates across batches)
done < ~{write_lines(vcfs)}
rm ~{sep=' ' vcfs}
cat *.bed | sort -k1,1V -k2,2n -k3,3n > ~{cohort}.regeno.raw_combined_depth.bed # concatenate the raw per-batch BEDs (duplicates retained)
>>>
runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_pipeline_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
task MakeMergedDepthBeds {
input {
File regeno_raw_combined_depth
String cohort
String sv_pipeline_docker
RuntimeAttr? runtime_attr_override
}
RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
disk_gb: 10,
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
output {
File regeno_merged_depth="~{cohort}.regeno.merged_depth.bed"
}
command <<<

set -euxo pipefail
python3 <<CODE
varID={}
with open("~{regeno_raw_combined_depth}",'r') as f: # From the depth cohort bed, a dictionary of all duplicate variants and their samples
for line in f:
dat=line.rstrip().split('\t')
samples=dat[-1].split(",")
var=dat[3]
ID=dat[0]+":"+dat[1]+'-'+dat[2]+'_'+dat[4]
if ID not in varID.keys():
varID[ID]={"sample":samples,"varids":[var]}
else:
varID[ID]['sample']=varID[ID]['sample']+samples
varID[ID]['varids'].append(var)
with open("~{cohort}.regeno.merged_depth.bed",'w') as f: # For each unique variant a line with variants and samples
for variant in varID.keys():
CHROM=variant.split(":")[0]
START=variant.split(':')[1].split("-")[0]
END=variant.split(':')[1].split("-")[1].split('_')[0]
varcol=":".join(varID[variant]["varids"])+':'
samplecol=",".join(varID[variant]['sample'])+','
f.write(CHROM+"\t"+START+"\t"+END+"\t"+varcol+"\t"+samplecol+'\n')
CODE
>>>
runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
@@ -286,118 +120,3 @@
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
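
For illustration, a standalone sketch (hypothetical file names, outside the WDL) of the aggregation the heredoc above performs: calls are keyed by chrom:start-end_SVTYPE, so duplicate depth calls across batches collapse into a single line carrying every component variant ID and sample.

from collections import defaultdict

# Sketch of the MakeMergedDepthBeds aggregation; input columns follow svtk vcf2bed:
# chrom, start, end, varID, svtype, ..., samples (comma-separated, last column).
merged = defaultdict(lambda: {"varids": [], "samples": []})
with open("raw_combined_depth.bed") as f:
    for line in f:
        dat = line.rstrip().split("\t")
        key = dat[0] + ":" + dat[1] + "-" + dat[2] + "_" + dat[4]  # same span + SVTYPE -> same variant
        merged[key]["varids"].append(dat[3])
        merged[key]["samples"].extend(dat[-1].split(","))

with open("merged_depth.bed", "w") as out:  # one line per unique variant
    for key, rec in merged.items():
        chrom, span = key.split(":")
        start, rest = span.split("-")
        end = rest.split("_")[0]
        out.write("\t".join([chrom, start, end,
                             ":".join(rec["varids"]) + ":",   # trailing ":" sentinel, as upstream
                             ",".join(rec["samples"]) + ","]) + "\n")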
task MakeSampleLookupBeds {
input {
File regeno_merged_depth
File regeno_merged_depth_clustered
String cohort
String contig
String sv_pipeline_docker
RuntimeAttr? runtime_attr_override
}
RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
disk_gb: 10,
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
output {
File regeno_sample_counts_lookup = "~{contig}.regeno.sample_counts_lookup.bed"
File regeno_sample_ids_lookup = "~{contig}.regeno.sample_ids_lookup.bed"
}
command <<<
set -euxo pipefail
# select rows of BED files pertaining to contig - chrom is 1st column of each BED file
awk -F "\t" -v OFS="\t" '{ if ($1 == "~{contig}") { print > "~{contig}.regeno.merged_depth.bed" } }' ~{regeno_merged_depth}
awk -F "\t" -v OFS="\t" '{ if ($1 == "~{contig}") { print > "~{contig}.regeno.merged_depth_clustered.bed" } }' ~{regeno_merged_depth_clustered}
python3 <<CODE
# map each component variant ID to the (samples, variant IDs) of its de-duplicated variant
varID_data = {}
with open("~{contig}.regeno.merged_depth.bed","r") as f: # for EACH variant ID, a list of duplicate variants and samples
for line in f:
dat=line.split('\t')
varIDs_list = dat[3].split(":")[0:-1]
samples_list = dat[4].split(',')[0:-1]
for varID in varIDs_list:
varID_data[varID] = (samples_list, varIDs_list)
with open("~{contig}.regeno.sample_counts_lookup.bed",'w') as g: # Using clustered merged (de-dupped) depth calls, for each clustered variant get varIDs and samples of the component calls
with open("~{contig}.regeno.merged_depth_clustered.bed","r") as f:
for line in f:
samples=[]
variants=[]
dat=line.rstrip().split("\t")
for varID in dat[6][0:-1].split(','):
samples.extend(varID_data[varID][0]) # samples are first in tuple
samples = list(set(samples))
variants.extend(varID_data[varID][1]) # variant IDs are 2nd in tuple
variants = list(set(variants))
g.write(dat[0]+'\t'+dat[1]+'\t'+dat[2]+'\t'+dat[3]+'\t'+dat[4]+'\t'+dat[5]+'\t'+":".join(variants)+':\t'+str(len(samples))+'\n')
with open("~{contig}.regeno.sample_ids_lookup.bed",'w') as g:
with open("~{contig}.regeno.merged_depth_clustered.bed","r") as f:
for line in f:
samples=[]
variants=[]
dat=line.rstrip().split("\t")
for varID in dat[6][0:-1].split(','):
samples.extend(varID_data[varID][0]) # samples are first in tuple
samples = list(set(samples))
variants.extend(varID_data[varID][1]) # variant IDs are 2nd in tuple
variants = list(set(variants))
g.write(dat[0]+'\t'+dat[1]+'\t'+dat[2]+'\t'+dat[3]+'\t'+dat[4]+'\t'+dat[5]+'\t'+":".join(variants)+':\t'+','.join(samples)+',\n')
CODE
>>>
runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_pipeline_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
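
The two writer loops in the heredoc above are identical except for the final column (a sample count versus a comma-joined sample list). A de-duplicated sketch of that shared logic, with hypothetical function and file names:

# Sketch: both lookup writers factored into one helper (hypothetical names).
def write_lookup(clustered_bed, out_path, varid_data, final_column):
    # final_column turns the de-duplicated sample list into the last field
    with open(clustered_bed) as f, open(out_path, "w") as g:
        for line in f:
            dat = line.rstrip().split("\t")
            samples, variants = [], []
            for var_id in dat[6][:-1].split(","):  # strip the trailing "," sentinel
                samples = list(set(samples + varid_data[var_id][0]))
                variants = list(set(variants + varid_data[var_id][1]))
            g.write("\t".join(dat[0:6] + [":".join(variants) + ":", final_column(samples)]) + "\n")

# Usage mirroring the two original outputs:
# write_lookup(clustered_bed, "sample_counts_lookup.bed", varID_data, lambda s: str(len(s)))
# write_lookup(clustered_bed, "sample_ids_lookup.bed", varID_data, lambda s: ",".join(s) + ",")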
task ConcatBed {
input {
Array[File] bed_shards
String filename
String sv_base_mini_docker
RuntimeAttr? runtime_attr_override
}
RuntimeAttr default_attr = object {
cpu_cores: 1,
mem_gb: 3.75,
disk_gb: 10,
boot_disk_gb: 10,
preemptible_tries: 3,
max_retries: 1
}
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
output {
File concat_bed = "~{filename}"
}
command <<<
set -euxo pipefail
while read bed_shard; do
cat $bed_shard >> ~{filename} # all BED shards are headerless and pre-sorted, so they can simply be concatenated in order
done < ~{write_lines(bed_shards)}
>>>
runtime {
cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
docker: sv_base_mini_docker
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
}
}
[Diff for the fourth changed file (328 additions, 16 deletions) did not load.]