Skip to content

Commit

Permalink
Module04b: parametrize sample overlap and minimum var count outlier t…
Browse files Browse the repository at this point in the history
…hreshold for regeno filtering (#133)
  • Loading branch information
epiercehoffman authored Mar 18, 2021
1 parent 6c8d835 commit cf54ad5
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 4 deletions.
12 changes: 10 additions & 2 deletions wdl/CombineReassess.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ workflow CombineReassess {
File regeno_file
File regeno_sample_ids_lookup
Array[File] vcfs
Int min_var_per_sample_outlier_threshold
Float regeno_sample_overlap
String sv_pipeline_base_docker
String sv_pipeline_docker
RuntimeAttr? runtime_attr_vcf2bed
Expand All @@ -28,6 +30,8 @@ workflow CombineReassess {
regeno_file = regeno_file,
regeno_sample_ids_lookup = regeno_sample_ids_lookup,
samplelist = samplelist,
min_var_per_sample_outlier_threshold = min_var_per_sample_outlier_threshold,
regeno_sample_overlap = regeno_sample_overlap,
runtime_attr_override = runtime_attr_merge_list_creassess,
sv_pipeline_base_docker = sv_pipeline_base_docker
}
Expand Down Expand Up @@ -82,6 +86,8 @@ task MergeList {
File regeno_file
Array[File] nonempty_txt
File regeno_sample_ids_lookup
Int min_var_per_sample_outlier_threshold
Float regeno_sample_overlap
String sv_pipeline_base_docker
RuntimeAttr? runtime_attr_override
}
Expand Down Expand Up @@ -130,7 +136,9 @@ task MergeList {
count(line)
counts=np.array([int(dct[x]) for x in dct.keys()])
def reject_outliers(data, m=3):
return data[abs(data - np.mean(data)) > m * np.std(data)]
deviation_threshold = m * np.std(data)
data_mean = np.mean(data)
return data[np.logical_and(abs(data - data_mean) > deviation_threshold, data > ~{min_var_per_sample_outlier_threshold})]
outliers=reject_outliers(counts)
outlier_samples=set([x for x in dct.keys() if dct[x] in outliers])
with open("reassess_nonzero_overlap.txt",'w') as g, open("reassesss_by_var.txt",'r') as f:
Expand All @@ -146,7 +154,7 @@ task MergeList {
overlap_over_expected=str(len(regeno_in_expected)/len(expected))
g.write(dat[0]+"\t"+",".join(regeno)+'\t'+",".join(expected)+'\t'+overlap_over_regeno+'\t'+overlap_over_expected+"\n")
CODE
awk '{if($4>0.7 && $5>0.7)print $1}' reassess_nonzero_overlap.txt > regeno_var_filtered.txt
awk '{if($4>~{regeno_sample_overlap} && $5>~{regeno_sample_overlap})print $1}' reassess_nonzero_overlap.txt > regeno_var_filtered.txt
# the OR clause below is to ignore return code = 1 because that isn't an error, it just means there were 0 matched lines
# (but don't ignore real error codes > 1)
fgrep -w -f regeno_var_filtered.txt ~{regeno_file}> regeno.filtered.bed || [[ $? == 1 ]]
Expand Down
8 changes: 6 additions & 2 deletions wdl/Module04b.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ workflow Module04b {
String cohort # Cohort name or project prefix for all cohort-level outputs
File contig_list
Array[File] regeno_coverage_medians # one file per batch
Float regeno_max_allele_freq = 0.01
Int regeno_allele_count_threshold = 3
Float regeno_max_allele_freq = 0.01 # Rare variant filter for regenotyping candidates: must be < AF threshold (this parameter) or <= AC threshold (below)
Int regeno_allele_count_threshold = 3 # Rare variant filter for regenotyping candidates: must be < AF threshold (above) or <= AC threshold (this parameter)
Int min_var_per_sample_outlier_threshold = 3 # Regeno SV count per sample at or below which a sample is never considered an outlier (needed when counts are sparse)
Float regeno_sample_overlap = 0.7 # Sample overlap fraction between raw and regenotyped calls that must be exceeded (strictly greater than this value)
RuntimeAttr? runtime_attr_cluster_merged_depth_beds
RuntimeAttr? runtime_attr_regeno_raw_combined_depth
Expand Down Expand Up @@ -188,6 +190,8 @@ workflow Module04b {
regeno_file = MergeList.master_regeno,
regeno_sample_ids_lookup = ConcatSampleIdLookupBed.concat_bed,
vcfs = Genotype_2.genotyped_vcf,
min_var_per_sample_outlier_threshold = min_var_per_sample_outlier_threshold,
regeno_sample_overlap = regeno_sample_overlap,
sv_pipeline_docker = sv_pipeline_docker,
sv_pipeline_base_docker = sv_pipeline_base_docker,
runtime_attr_merge_list_creassess = runtime_attr_merge_list_creassess,
Expand Down

0 comments on commit cf54ad5

Please sign in to comment.