From 445995a5304fecd35eefaa68b74d24b06ab22971 Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 1 Mar 2024 10:43:37 -0500 Subject: [PATCH 001/186] small change --- pipelines/skylab/snM3C/snM3C.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/skylab/snM3C/snM3C.wdl b/pipelines/skylab/snM3C/snM3C.wdl index bcdc71a861..bac72eb68c 100644 --- a/pipelines/skylab/snM3C/snM3C.wdl +++ b/pipelines/skylab/snM3C/snM3C.wdl @@ -23,7 +23,6 @@ workflow snM3C { Int num_downstr_bases = 2 Int compress_level = 5 Int batch_number - } # version of the pipeline From cae0f54dd1480271a273fc40351b1fdb56d5e233 Mon Sep 17 00:00:00 2001 From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Date: Wed, 6 Mar 2024 13:21:43 -0500 Subject: [PATCH 002/186] azurize optimus (#1228) * add logic to choose which docker * fix param_meta and import * add cloud provider to checkinput * handle hard coded white list paths in CheckInputs.wdl * last few dockers * last few dockers * last few dockers * change error msg * use ubuntu image * use ubuntu image * change whitelists * point to azure public whitelists * add sas token * echo whitelist * echo whitelist * testing for coa * testing for coa * change back to terra buckets for whitelists * change whitelists to point at public azure bucket * files to strings * print statemtns to checkinputs * string to files * change to terra bucket paths * strings not files * append sas token * append sas token * append sas and use strings * back to bucket urls * back to bucket urls * use google cloud urls * using public urls * trying to export sas_token * trying to export sas_token * trying to export sas_token * terra on gcp * update azure whitelist files * changelogs * changelogs * changelogs * changelogs * fix some inputs * fix some inputs * fix some inputs * fix some inputs * update optimus dockers * warp_tools_docker_path for staralign * stop using ice lake as default * update pipeline docs * 2 threads * counting mode * changelogs 
--------- Co-authored-by: phendriksen100 <103142505+phendriksen100@users.noreply.github.com> Co-authored-by: kayleemathews --- .../skylab/multiome/Multiome.changelog.md | 5 + pipelines/skylab/multiome/Multiome.wdl | 6 +- pipelines/skylab/multiome/atac.changelog.md | 7 +- pipelines/skylab/multiome/atac.wdl | 2 +- .../Plumbing/10k_pbmc_downsampled.json | 3 +- pipelines/skylab/optimus/Optimus.changelog.md | 5 + pipelines/skylab/optimus/Optimus.wdl | 101 ++++++++++++++---- .../Plumbing/human_v3_example.json | 3 +- .../Plumbing/mouse_v2_example.json | 3 +- .../Plumbing/mouse_v2_snRNA_example.json | 3 +- .../skylab/paired_tag/PairedTag.changelog.md | 4 + pipelines/skylab/paired_tag/PairedTag.wdl | 2 +- .../skylab/slideseq/SlideSeq.changelog.md | 8 ++ pipelines/skylab/slideseq/SlideSeq.wdl | 57 ++++++++-- .../Plumbing/Puck_210817_11.mm10.json | 3 +- ...iSampleSmartSeq2SingleNucleus.changelog.md | 10 ++ .../MultiSampleSmartSeq2SingleNucleus.wdl | 22 +++- .../test_inputs/Plumbing/mouse_example.json | 3 +- pipelines/skylab/snM3C/snM3C.changelog.md | 2 +- pipelines/skylab/snM3C/snM3C.wdl | 1 + tasks/skylab/CheckInputs.wdl | 36 ++++++- tasks/skylab/FastqProcessing.wdl | 7 +- tasks/skylab/H5adUtils.wdl | 8 +- tasks/skylab/MergeSortBam.wdl | 6 +- tasks/skylab/Metrics.wdl | 13 +-- tasks/skylab/RunEmptyDrops.wdl | 4 +- tasks/skylab/StarAlign.wdl | 26 +++-- .../TestMultiSampleSmartSeq2SingleNucleus.wdl | 5 +- verification/test-wdls/TestMultiome.wdl | 4 +- verification/test-wdls/TestOptimus.wdl | 5 +- verification/test-wdls/TestSlideSeq.wdl | 4 +- website/docs/Pipelines/ATAC/README.md | 2 +- .../Pipelines/Multiome_Pipeline/README.md | 5 +- .../docs/Pipelines/Optimus_Pipeline/README.md | 3 +- .../Pipelines/PairedTag_Pipeline/README.md | 2 +- .../Pipelines/SlideSeq_Pipeline/README.md | 3 +- .../README.md | 3 +- .../multi_snss2.methods.md | 4 +- 38 files changed, 301 insertions(+), 89 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.changelog.md 
b/pipelines/skylab/multiome/Multiome.changelog.md index da8bc38753..6a82ca00da 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,3 +1,8 @@ +# 3.2.2 +2024-03-01 (Date of Last Commit) + +* Updated the Optimus.wdl to run on Azure. This change does not affect the Multiome pipeline. + # 3.2.1 2024-02-29 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 1e6bc2edae..64aa671836 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -6,10 +6,11 @@ import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "https://raw.githubusercontent.com/broadinstitute/CellBender/v0.3.0/wdl/cellbender_remove_background.wdl" as CellBender workflow Multiome { - String pipeline_version = "3.2.1" + String pipeline_version = "3.2.2" input { String input_id + String cloud_provider # Optimus Inputs String counting_mode = "sn_rna" @@ -68,7 +69,8 @@ workflow Multiome { ignore_r1_read_length = ignore_r1_read_length, star_strand_mode = star_strand_mode, count_exons = count_exons, - soloMultiMappers = soloMultiMappers + soloMultiMappers = soloMultiMappers, + cloud_provider = cloud_provider } # Call the ATAC workflow diff --git a/pipelines/skylab/multiome/atac.changelog.md b/pipelines/skylab/multiome/atac.changelog.md index 170caa2aed..005a2fb782 100644 --- a/pipelines/skylab/multiome/atac.changelog.md +++ b/pipelines/skylab/multiome/atac.changelog.md @@ -1,4 +1,9 @@ -# 1.1.8 +# 1.1.9 +2024-03-01 (Date of Last Commit) + +* Updated the Optimus.wdl to run on Azure. This change does not affect the ATAC pipeline. 
+ +# 1.1.8 2024-02-07 (Date of Last Commit) * Updated the Metrics tasks to exclude mitochondrial genes from reads_mapped_uniquely, reads_mapped_multiple and reads_mapped_exonic, reads_mapped_exonic_as and reads_mapped_intergenic diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 3dd81d7bf5..0431ba3997 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -41,7 +41,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "1.1.8" + String pipeline_version = "1.1.9" parameter_meta { read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads" diff --git a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json index 7d15111f38..bd9b7a1172 100644 --- a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json +++ b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json @@ -23,5 +23,6 @@ "Multiome.Atac.cpu_platform_bwa":"Intel Cascade Lake", "Multiome.Atac.num_threads_bwa":"16", "Multiome.Atac.mem_size_bwa":"64", - "Multiome.soloMultiMappers":"Uniform" + "Multiome.soloMultiMappers":"Uniform", + "Multiome.cloud_provider":"gcp" } diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index 23098dd7a0..d76bedaed5 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,3 +1,8 @@ +# 6.4.2 +2024-03-01 (Date of Last Commit) +* Updated the Optimus.wdl to run on Azure. 
+ + # 6.4.1 2024-02-29 (Date of Last Commit) * Added mem and disk to inputs of Join Barcodes task of Multiome workflow; does not impact the Optimus workflow diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 159490afbf..ccfa5e35e5 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -7,6 +7,7 @@ import "../../../tasks/skylab/RunEmptyDrops.wdl" as RunEmptyDrops import "../../../tasks/skylab/CheckInputs.wdl" as OptimusInputChecks import "../../../tasks/skylab/MergeSortBam.wdl" as Merge import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils +import "../../../tasks/broad/Utilities.wdl" as utils workflow Optimus { meta { @@ -14,6 +15,8 @@ workflow Optimus { } input { + String cloud_provider + # Mode for counting either "sc_rna" or "sn_rna" String counting_mode = "sc_rna" @@ -45,36 +48,71 @@ workflow Optimus { # Set to true to override input checks and allow pipeline to proceed with invalid input Boolean force_no_check = false - + # Check that tenx_chemistry_version matches the length of the read 1 fastq; # Set to true if you expect that r1_read_length does not match length of UMIs/barcodes for 10x chemistry v2 (26 bp) or v3 (28 bp). Boolean ignore_r1_read_length = false # Set to Forward, Reverse, or Unstranded to account for stranded library preparations (per STARsolo documentation) String star_strand_mode = "Forward" - + # Set to true to count reads aligned to exonic regions in sn_rna mode Boolean count_exons = false # this pipeline does not set any preemptible varibles and only relies on the task-level preemptible settings # you could override the tasklevel preemptible settings by passing it as one of the workflows inputs # for example: `"Optimus.StarAlign.preemptible": 3` will let the StarAlign task, which by default disables the - # usage of preemptible machines, attempt to request for preemptible instance up to 3 times. 
+ # usage of preemptible machines, attempt to request for preemptible instance up to 3 times. } # version of this pipeline - String pipeline_version = "6.4.1" + String pipeline_version = "6.4.2" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays Array[Int] indices = range(length(r1_fastq)) # 10x parameters - File whitelist_v2 = "gs://gcp-public-data--broad-references/RNA/resources/737k-august-2016.txt" - File whitelist_v3 = "gs://gcp-public-data--broad-references/RNA/resources/3M-febrary-2018.txt" + File gcp_whitelist_v2 = "gs://gcp-public-data--broad-references/RNA/resources/737k-august-2016.txt" + File gcp_whitelist_v3 = "gs://gcp-public-data--broad-references/RNA/resources/3M-febrary-2018.txt" + File azure_whitelist_v2 = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/737k-august-2016.txt" + File azure_whitelist_v3 = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/3M-febrary-2018.txt" + # Takes the first read1 FASTQ from the inputs to check for chemistry match File r1_single_fastq = r1_fastq[0] + # docker images + String picard_cloud_docker = "picard-cloud:2.26.10" + String pytools_docker = "pytools:1.0.0-1661263730" + String empty_drops_docker = "empty-drops:1.0.1-4.2" + String star_docker = "star:1.0.1-2.7.11a-1692706072" + String warp_tools_docker_2_0_1 = "warp-tools:2.0.1" + String warp_tools_docker_2_0_2 = "warp-tools:2.0.2-1709308985" + #TODO how do we handle these? 
+ String alpine_docker = "alpine-bash:latest" + String gcp_alpine_docker_prefix = "bashell/" + String acr_alpine_docker_prefix = "dsppipelinedev.azurecr.io/" + String alpine_docker_prefix = if cloud_provider == "gcp" then gcp_alpine_docker_prefix else acr_alpine_docker_prefix + + String ubuntu_docker = "ubuntu_16_0_4:latest" + String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/" + String acr_ubuntu_docker_prefix = "dsppipelinedev.azurecr.io/" + String ubuntu_docker_prefix = if cloud_provider == "gcp" then gcp_ubuntu_docker_prefix else acr_ubuntu_docker_prefix + + String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" + String acr_docker_prefix = "dsppipelinedev.azurecr.io/" + + # choose docker prefix based on cloud provider + String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix + + # make sure either gcp or azr is supplied as cloud_provider input + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'." 
+ } + } + parameter_meta { r1_fastq: "forward read, contains cell barcodes and molecule barcodes" r2_fastq: "reverse read, contains cDNA fragment generated from captured mRNA" @@ -96,16 +134,21 @@ workflow Optimus { force_no_check = force_no_check, counting_mode = counting_mode, count_exons = count_exons, - whitelist_v2 = whitelist_v2, - whitelist_v3 = whitelist_v3, + gcp_whitelist_v2 = gcp_whitelist_v2, + gcp_whitelist_v3 = gcp_whitelist_v3, + azure_whitelist_v2 = azure_whitelist_v2, + azure_whitelist_v3 = azure_whitelist_v3, tenx_chemistry_version = tenx_chemistry_version, r1_fastq = r1_single_fastq, - ignore_r1_read_length = ignore_r1_read_length + ignore_r1_read_length = ignore_r1_read_length, + cloud_provider = cloud_provider, + alpine_docker_path = alpine_docker_prefix + alpine_docker } call StarAlign.STARGenomeRefVersion as ReferenceCheck { input: - tar_star_reference = tar_star_reference + tar_star_reference = tar_star_reference, + ubuntu_docker_path = ubuntu_docker_prefix + ubuntu_docker } call FastqProcessing.FastqProcessing as SplitFastq { @@ -116,7 +159,8 @@ workflow Optimus { whitelist = whitelist, chemistry = tenx_chemistry_version, sample_id = input_id, - read_struct = read_struct + read_struct = read_struct, + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_1 } scatter(idx in range(length(SplitFastq.fastq_R1_output_array))) { @@ -131,21 +175,24 @@ workflow Optimus { counting_mode = counting_mode, count_exons = count_exons, output_bam_basename = output_bam_basename + "_" + idx, - soloMultiMappers = soloMultiMappers + soloMultiMappers = soloMultiMappers, + star_docker_path = docker_prefix + star_docker } } call Merge.MergeSortBamFiles as MergeBam { input: bam_inputs = STARsoloFastq.bam_output, output_bam_filename = output_bam_basename + ".bam", - sort_order = "coordinate" + sort_order = "coordinate", + picard_cloud_docker_path = docker_prefix + picard_cloud_docker } call Metrics.CalculateGeneMetrics as GeneMetrics { input: bam_input = 
MergeBam.output_bam, mt_genes = mt_genes, original_gtf = annotations_gtf, - input_id = input_id + input_id = input_id, + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_1 } call Metrics.CalculateCellMetrics as CellMetrics { @@ -153,7 +200,8 @@ workflow Optimus { bam_input = MergeBam.output_bam, mt_genes = mt_genes, original_gtf = annotations_gtf, - input_id = input_id + input_id = input_id, + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_1 } call StarAlign.MergeStarOutput as MergeStarOutputs { @@ -165,7 +213,9 @@ workflow Optimus { summary = STARsoloFastq.summary, align_features = STARsoloFastq.align_features, umipercell = STARsoloFastq.umipercell, - input_id = input_id + input_id = input_id, + counting_mode = counting_mode, + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_2 } if (counting_mode == "sc_rna"){ call RunEmptyDrops.RunEmptyDrops { @@ -173,7 +223,8 @@ workflow Optimus { sparse_count_matrix = MergeStarOutputs.sparse_counts, row_index = MergeStarOutputs.row_index, col_index = MergeStarOutputs.col_index, - emptydrops_lower = emptydrops_lower + emptydrops_lower = emptydrops_lower, + empty_drops_docker_path = docker_prefix + empty_drops_docker } } @@ -192,7 +243,8 @@ workflow Optimus { gene_id = MergeStarOutputs.col_index, empty_drops_result = RunEmptyDrops.empty_drops_result, counting_mode = counting_mode, - pipeline_version = "Optimus_v~{pipeline_version}" + pipeline_version = "Optimus_v~{pipeline_version}", + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_1 } } if (count_exons && counting_mode=="sn_rna") { @@ -202,7 +254,13 @@ workflow Optimus { features = STARsoloFastq.features_sn_rna, matrix = STARsoloFastq.matrix_sn_rna, cell_reads = STARsoloFastq.cell_reads_sn_rna, - input_id = input_id + input_id = input_id, + counting_mode = "sc_rna", + summary = STARsoloFastq.summary_sn_rna, + align_features = STARsoloFastq.align_features_sn_rna, + umipercell = STARsoloFastq.umipercell_sn_rna, + 
input_id = input_id, + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_2 } call H5adUtils.SingleNucleusOptimusH5adOutput as OptimusH5adGenerationWithExons{ input: @@ -219,7 +277,8 @@ workflow Optimus { sparse_count_matrix_exon = MergeStarOutputsExons.sparse_counts, cell_id_exon = MergeStarOutputsExons.row_index, gene_id_exon = MergeStarOutputsExons.col_index, - pipeline_version = "Optimus_v~{pipeline_version}" + pipeline_version = "Optimus_v~{pipeline_version}", + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_1 } } @@ -238,11 +297,13 @@ workflow Optimus { File gene_metrics = GeneMetrics.gene_metrics File? cell_calls = RunEmptyDrops.empty_drops_result File? aligner_metrics = MergeStarOutputs.cell_reads_out + File? library_metrics = MergeStarOutputs.library_metrics Array[File?] multimappers_EM_matrix = STARsoloFastq.multimappers_EM_matrix Array[File?] multimappers_Uniform_matrix = STARsoloFastq.multimappers_Uniform_matrix Array[File?] multimappers_Rescue_matrix = STARsoloFastq.multimappers_Rescue_matrix Array[File?] 
multimappers_PropUnique_matrix = STARsoloFastq.multimappers_PropUnique_matrix + # h5ad File h5ad_output_file = final_h5ad_output } diff --git a/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json b/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json index 612659d25c..667e632bbd 100644 --- a/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json +++ b/pipelines/skylab/optimus/test_inputs/Plumbing/human_v3_example.json @@ -15,5 +15,6 @@ "Optimus.input_id": "pbmc_human_v3", "Optimus.tenx_chemistry_version": "3", "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf", - "Optimus.star_strand_mode": "Forward" + "Optimus.star_strand_mode": "Forward", + "Optimus.cloud_provider": "gcp" } diff --git a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json index 0dc26af9fd..33e7553cb4 100644 --- a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json +++ b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_example.json @@ -27,5 +27,6 @@ "Optimus.input_id": "neurons2k_mouse", "Optimus.tenx_chemistry_version": "2", "Optimus.star_strand_mode": "Unstranded", - "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf" + "Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf", + "Optimus.cloud_provider": "gcp" } diff --git a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json index 787a1a8347..fef0bd0f76 100644 --- a/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json +++ b/pipelines/skylab/optimus/test_inputs/Plumbing/mouse_v2_snRNA_example.json @@ -25,5 +25,6 @@ "Optimus.star_strand_mode": "Unstranded", "Optimus.annotations_gtf": 
"gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf", "Optimus.counting_mode": "sn_rna", - "Optimus.count_exons": true + "Optimus.count_exons": true, + "Optimus.cloud_provider": "gcp" } diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index 17255ab77f..ca066704a4 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,3 +1,7 @@ +# 0.2.1 +2024-03-01 (Date of Last Commit) +* Updated the Optimus.wdl to run on Azure. This change does not affect the PairedTag pipeline. + # 0.2.0 2024-02-29 (Date of Last Commit) * Added mem and disk to inputs of Join Barcodes task of Multiome workflow; does not impact the Paired-tag workflow diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index eb11e9acc4..29d2594152 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -5,7 +5,7 @@ import "../../../pipelines/skylab/optimus/Optimus.wdl" as optimus import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "../../../tasks/skylab/PairedTagUtils.wdl" as Demultiplexing workflow PairedTag { - String pipeline_version = "0.2.0" + String pipeline_version = "0.2.1" input { String input_id diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index e041750353..1817b2665b 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,3 +1,11 @@ +# 3.1.3 +2024-03-01 (Date of Last Commit) +* Updated the Optimus.wdl to run on Azure. This change does not affect the SlideSeq pipeline. 
+ +# 3.1.2 +2024-02-28 (Date of Last Commit) +* Updated the Optimus workflow to produce a library-level metrics CSV; this does not impact the slide-seq pipeline + # 3.1.1 2024-02-29 (Date of Last Commit) * Added mem and disk to inputs of Join Barcodes task of Multiome workflow; does not impact the Slideseq workflow diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 66f6001da8..bc8df16dde 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -6,6 +6,8 @@ import "../../../tasks/skylab/Metrics.wdl" as Metrics import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "../../../tasks/skylab/CheckInputs.wdl" as OptimusInputChecks import "../../../tasks/skylab/MergeSortBam.wdl" as Merge +import "../../../tasks/broad/Utilities.wdl" as utils + ## Copyright Broad Institute, 2022 ## @@ -23,7 +25,7 @@ import "../../../tasks/skylab/MergeSortBam.wdl" as Merge workflow SlideSeq { - String pipeline_version = "3.1.1" + String pipeline_version = "3.1.3" input { Array[File] r1_fastq @@ -39,6 +41,33 @@ workflow SlideSeq { Boolean count_exons = true File bead_locations + String cloud_provider + + } + + # docker images + String pytools_docker = "pytools:1.0.0-1661263730" + String picard_cloud_docker = "picard-cloud:2.26.10" + String warp_tools_docker_2_0_1 = "warp-tools:2.0.1" + String warp_tools_docker_2_0_2 = "warp-tools:2.0.2-1709308985" + + String ubuntu_docker = "ubuntu_16_0_4:latest" + String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/" + String acr_ubuntu_docker_prefix = "dsppipelinedev.azurecr.io/" + String ubuntu_docker_prefix = if cloud_provider == "gcp" then gcp_ubuntu_docker_prefix else acr_ubuntu_docker_prefix + + String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" + String acr_docker_prefix = "dsppipelinedev.azurecr.io/" + + # choose docker prefix based on cloud provider + String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else 
acr_docker_prefix + + # make sure either gcp or azr is supplied as cloud_provider input + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'." + } } parameter_meta { @@ -51,7 +80,8 @@ workflow SlideSeq { call StarAlign.STARGenomeRefVersion as ReferenceCheck { input: - tar_star_reference = tar_star_reference + tar_star_reference = tar_star_reference, + ubuntu_docker_path = ubuntu_docker_prefix + ubuntu_docker } call Metrics.FastqMetricsSlideSeq as FastqMetrics { @@ -86,13 +116,15 @@ workflow SlideSeq { input: bam_inputs = STARsoloFastqSlideSeq.bam_output, output_bam_filename = output_bam_basename + ".bam", - sort_order = "coordinate" + sort_order = "coordinate", + picard_cloud_docker_path = docker_prefix + picard_cloud_docker } call Metrics.CalculateGeneMetrics as GeneMetrics { input: bam_input = MergeBam.output_bam, original_gtf = annotations_gtf, - input_id = input_id + input_id = input_id, + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_1 } call Metrics.CalculateUMIsMetrics as UMIsMetrics { input: @@ -105,7 +137,9 @@ workflow SlideSeq { input: bam_input = MergeBam.output_bam, original_gtf = annotations_gtf, - input_id = input_id + input_id = input_id, + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_1 + } call StarAlign.MergeStarOutput as MergeStarOutputs { @@ -113,7 +147,8 @@ workflow SlideSeq { barcodes = STARsoloFastqSlideSeq.barcodes, features = STARsoloFastqSlideSeq.features, matrix = STARsoloFastqSlideSeq.matrix, - input_id = input_id + input_id = input_id, + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_2 } if ( !count_exons ) { call H5adUtils.OptimusH5adGeneration as SlideseqH5adGeneration{ @@ -126,7 +161,9 @@ workflow SlideSeq { cell_id = MergeStarOutputs.row_index, gene_id = MergeStarOutputs.col_index, add_emptydrops_data = "no", - pipeline_version 
= "SlideSeq_v~{pipeline_version}" + pipeline_version = "SlideSeq_v~{pipeline_version}", + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_1 + } } if (count_exons) { @@ -135,7 +172,8 @@ workflow SlideSeq { barcodes = STARsoloFastqSlideSeq.barcodes_sn_rna, features = STARsoloFastqSlideSeq.features_sn_rna, matrix = STARsoloFastqSlideSeq.matrix_sn_rna, - input_id = input_id + input_id = input_id, + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_2 } call H5adUtils.SingleNucleusOptimusH5adOutput as OptimusH5adGenerationWithExons{ input: @@ -149,7 +187,8 @@ workflow SlideSeq { sparse_count_matrix_exon = MergeStarOutputsExons.sparse_counts, cell_id_exon = MergeStarOutputsExons.row_index, gene_id_exon = MergeStarOutputsExons.col_index, - pipeline_version = "SlideSeq_v~{pipeline_version}" + pipeline_version = "SlideSeq_v~{pipeline_version}", + warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_1 } } diff --git a/pipelines/skylab/slideseq/test_inputs/Plumbing/Puck_210817_11.mm10.json b/pipelines/skylab/slideseq/test_inputs/Plumbing/Puck_210817_11.mm10.json index d8998d1d9b..035b22c58e 100644 --- a/pipelines/skylab/slideseq/test_inputs/Plumbing/Puck_210817_11.mm10.json +++ b/pipelines/skylab/slideseq/test_inputs/Plumbing/Puck_210817_11.mm10.json @@ -13,5 +13,6 @@ "SlideSeq.tar_star_reference": "gs://gcp-public-data--broad-references/mm10/v0/single_nucleus/star/modified_star_2.7.9a_primary_gencode_mouse_vM23.tar", "SlideSeq.annotations_gtf": "gs://gcp-public-data--broad-references/mm10/v0/single_nucleus/modified_gencode.vM23.primary_assembly.annotation.gtf", "SlideSeq.count_exons": true, - "SlideSeq.bead_locations": " gs://broad-gotc-test-storage/SlideSeq/inputs/plumbing/Puck_210817_11/Puck_210817_11.tsv" + "SlideSeq.bead_locations": " gs://broad-gotc-test-storage/SlideSeq/inputs/plumbing/Puck_210817_11/Puck_210817_11.tsv", + "SlideSeq.cloud_provider": "gcp" } \ No newline at end of file diff --git 
a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index 64b516e8b9..d3c50e9282 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -1,3 +1,13 @@ +# 1.3.2 +2024-03-01 (Date of Last Commit) + +* Updated the Optimus.wdl to run on Azure. This change does not affect the MultiSampleSmartSeq2SingleNucleus pipeline. + +# 1.3.1 +2024-02-28 (Date of Last Commit) + +* Updated the Optimus workflow to produce a library-level metrics CSV; this does not impact the Single-nucleus Multi Sample Smart-seq2 pipeline + # 1.3.0 2024-01-22 (Date of Last Commit) diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index 7a4c1066f8..312e447204 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -6,6 +6,7 @@ import "../../../tasks/skylab/StarAlign.wdl" as StarAlign import "../../../tasks/skylab/Picard.wdl" as Picard import "../../../tasks/skylab/FeatureCounts.wdl" as CountAlignments import "../../../tasks/skylab/LoomUtils.wdl" as LoomUtils +import "../../../tasks/broad/Utilities.wdl" as utils workflow MultiSampleSmartSeq2SingleNucleus { meta { @@ -38,9 +39,25 @@ workflow MultiSampleSmartSeq2SingleNucleus { Array[String]? organ String? input_name_metadata_field String? 
input_id_metadata_field + + String cloud_provider + } + + String ubuntu_docker = "ubuntu_16_0_4:latest" + String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/" + String acr_ubuntu_docker_prefix = "dsppipelinedev.azurecr.io/" + String ubuntu_docker_prefix = if cloud_provider == "gcp" then gcp_ubuntu_docker_prefix else acr_ubuntu_docker_prefix + + # make sure either gcp or azr is supplied as cloud_provider input + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'." + } } + # Version of this pipeline - String pipeline_version = "1.3.0" + String pipeline_version = "1.3.2" if (false) { String? none = "None" @@ -72,7 +89,8 @@ workflow MultiSampleSmartSeq2SingleNucleus { call StarAlign.STARGenomeRefVersion as ReferenceCheck { input: - tar_star_reference = tar_star_reference + tar_star_reference = tar_star_reference, + ubuntu_docker_path = ubuntu_docker_prefix + ubuntu_docker } call TrimAdapters.TrimAdapters as TrimAdapters { diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/test_inputs/Plumbing/mouse_example.json b/pipelines/skylab/smartseq2_single_nucleus_multisample/test_inputs/Plumbing/mouse_example.json index 8fafd92173..db8f68b114 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/test_inputs/Plumbing/mouse_example.json +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/test_inputs/Plumbing/mouse_example.json @@ -18,5 +18,6 @@ "SM-GE644_S117_E1-50_GCGTAGTA-AAGGAGTA", "SM-GE644_S118_E1-50_GCGTAGTA-CTAAGCCT" ], - "MultiSampleSmartSeq2SingleNucleus.batch_id": "SM-GE644" + "MultiSampleSmartSeq2SingleNucleus.batch_id": "SM-GE644", + "MultiSampleSmartSeq2SingleNucleus.cloud_provider": "gcp" } diff --git a/pipelines/skylab/snM3C/snM3C.changelog.md b/pipelines/skylab/snM3C/snM3C.changelog.md index dc90a21239..f3fb853b6c 100644 --- a/pipelines/skylab/snM3C/snM3C.changelog.md 
+++ b/pipelines/skylab/snM3C/snM3C.changelog.md @@ -1,7 +1,7 @@ # 2.0.1 2024-2-15 (Date of Last Commit) -* Updated the snM3C task memory, disk, and CPUs +* Updated the snM3C task memory, disk, and CPUs # 2.0.0 2024-2-13 (Date of Last Commit) diff --git a/pipelines/skylab/snM3C/snM3C.wdl b/pipelines/skylab/snM3C/snM3C.wdl index bac72eb68c..bcdc71a861 100644 --- a/pipelines/skylab/snM3C/snM3C.wdl +++ b/pipelines/skylab/snM3C/snM3C.wdl @@ -23,6 +23,7 @@ workflow snM3C { Int num_downstr_bases = 2 Int compress_level = 5 Int batch_number + } # version of the pipeline diff --git a/tasks/skylab/CheckInputs.wdl b/tasks/skylab/CheckInputs.wdl index b24c77c133..89b99c7798 100644 --- a/tasks/skylab/CheckInputs.wdl +++ b/tasks/skylab/CheckInputs.wdl @@ -55,6 +55,8 @@ task checkInputArrays { task checkOptimusInput { input { + String cloud_provider + #String SAS_TOKEN File r1_fastq String counting_mode Boolean force_no_check @@ -63,9 +65,12 @@ task checkOptimusInput { Int machine_mem_mb = 1000 Int cpu = 1 Int tenx_chemistry_version - String whitelist_v2 - String whitelist_v3 + String gcp_whitelist_v2 + String gcp_whitelist_v3 + String azure_whitelist_v2 + String azure_whitelist_v3 Boolean ignore_r1_read_length + String alpine_docker_path } meta { @@ -108,15 +113,36 @@ task checkOptimusInput { echo "ERROR: Invalid value count_exons should not be used with \"${counting_mode}\" input." 
fi fi + # Check for chemistry version to produce read structure and whitelist if [[ ~{tenx_chemistry_version} == 2 ]] then - WHITELIST=~{whitelist_v2} + if [[ "~{cloud_provider}" == "gcp" ]] + then + WHITELIST=~{gcp_whitelist_v2} + elif [[ "~{cloud_provider}" == "azure" ]] + then + WHITELIST=~{azure_whitelist_v2} + else + pass="false" + echo "ERROR: Cloud provider must be either gcp or azure" + fi + echo "WHITELIST:" $WHITELIST echo $WHITELIST > whitelist.txt echo 16C10M > read_struct.txt elif [[ ~{tenx_chemistry_version} == 3 ]] then - WHITELIST=~{whitelist_v3} + if [[ "~{cloud_provider}" == "gcp" ]] + then + WHITELIST=~{gcp_whitelist_v3} + elif [[ "~{cloud_provider}" == "azure" ]] + then + WHITELIST=~{azure_whitelist_v3} + else + pass="false" + echo "ERROR: Cloud provider must be either gcp or azure" + fi + echo "WHITELIST:" $WHITELIST echo $WHITELIST > whitelist.txt echo 16C12M > read_struct.txt else @@ -153,7 +179,7 @@ task checkOptimusInput { String read_struct_out = read_string("read_struct.txt") } runtime { - docker: "bashell/alpine-bash:latest" + docker: alpine_docker_path cpu: cpu memory: "~{machine_mem_mb} MiB" disks: "local-disk ~{disk} HDD" diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index a4d7a8e615..939d1e1e12 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -11,7 +11,8 @@ task FastqProcessing { String read_struct #using the latest build of warp-tools in GCR - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" + String warp_tools_docker_path + #runtime values Int machine_mem_mb = 40000 Int cpu = 16 @@ -34,7 +35,7 @@ task FastqProcessing { whitelist: "10x genomics cell barcode whitelist" chemistry: "chemistry employed, currently can be tenX_v2 or tenX_v3, the latter implies NO feature barcodes" sample_id: "name of sample matching this file, inserted into read group header" - docker: "(optional) the docker image containing the runtime environment for this task" + 
warp_tools_docker_path: "(optional) the docker image containing the runtime environment for this task" machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" disk: "(optional) the amount of disk space (GiB) to provision for this task" @@ -111,7 +112,7 @@ task FastqProcessing { } runtime { - docker: docker + docker: warp_tools_docker_path memory: "${machine_mem_mb} MiB" disks: "local-disk ${disk} HDD" disk: disk + " GB" # TES diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index 18fed45fc1..99ef957e4b 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -6,7 +6,7 @@ task OptimusH5adGeneration { input { #runtime values - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" + String warp_tools_docker_path # name of the sample String input_id # user provided id @@ -88,7 +88,7 @@ task OptimusH5adGeneration { >>> runtime { - docker: docker + docker: warp_tools_docker_path cpu: cpu # note that only 1 thread is supported by pseudobam memory: "~{machine_mem_mb} MiB" disks: "local-disk ~{disk} HDD" @@ -105,7 +105,7 @@ task SingleNucleusOptimusH5adOutput { input { #runtime values - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" + String warp_tools_docker_path # name of the sample String input_id # user provided id @@ -170,7 +170,7 @@ task SingleNucleusOptimusH5adOutput { } runtime { - docker: docker + docker: warp_tools_docker_path cpu: cpu # note that only 1 thread is supported by pseudobam memory: "~{machine_mem_mb} MiB" disks: "local-disk ~{disk} HDD" diff --git a/tasks/skylab/MergeSortBam.wdl b/tasks/skylab/MergeSortBam.wdl index 229ed18f8a..23ea466708 100644 --- a/tasks/skylab/MergeSortBam.wdl +++ b/tasks/skylab/MergeSortBam.wdl @@ -9,7 +9,7 @@ task MergeSortBamFiles { Int compression_level = 5 # runtime values - String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + String picard_cloud_docker_path Int 
machine_mem_mb = 18150 Int cpu = 1 # default to 500GiB of space @@ -28,7 +28,7 @@ task MergeSortBamFiles { parameter_meta { bam_inputs: "Merges Sam/Bam files" sort_order: "sort order of output bam" - docker: "(optional) the docker image containing the runtime environment for this task" + picard_cloud_docker_path: "(optional) the docker image containing the runtime environment for this task" machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" disk: "(optional) the amount of disk space (GiB) to provision for this task" @@ -47,7 +47,7 @@ task MergeSortBamFiles { } runtime { - docker: docker + docker: picard_cloud_docker_path memory: "${machine_mem_mb} MiB" disks: "local-disk ${disk} HDD" disk: disk + " GB" # TES diff --git a/tasks/skylab/Metrics.wdl b/tasks/skylab/Metrics.wdl index fb91283d71..76b85d1012 100644 --- a/tasks/skylab/Metrics.wdl +++ b/tasks/skylab/Metrics.wdl @@ -8,7 +8,8 @@ task CalculateCellMetrics { String input_id # runtime values - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" + + String warp_tools_docker_path Int machine_mem_mb = 8000 Int cpu = 4 Int disk = ceil(size(bam_input, "Gi") * 4) + ceil((size(original_gtf, "Gi") * 3)) @@ -21,7 +22,7 @@ task CalculateCellMetrics { parameter_meta { bam_input: "Input bam file containing reads marked with tags for cell barcodes (CB), molecule barcodes (UB) and gene ids (GX)" - docker: "(optional) the docker image containing the runtime environment for this task" + warp_tools_docker_path: "(optional) the docker image containing the runtime environment for this task" machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" disk: "(optional) the amount of disk space (GiB) to provision for this task" @@ -64,7 +65,7 @@ task CalculateCellMetrics { } runtime { - docker: docker + docker: warp_tools_docker_path memory: 
"${machine_mem_mb} MiB" disks: "local-disk ${disk} HDD" disk: disk + " GB" # TES @@ -85,7 +86,7 @@ task CalculateGeneMetrics { String input_id # runtime values - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" + String warp_tools_docker_path Int machine_mem_mb = 32000 Int cpu = 4 Int disk = ceil(size(bam_input, "Gi") * 4) + ceil((size(original_gtf, "Gi") * 3)) @@ -99,7 +100,7 @@ task CalculateGeneMetrics { parameter_meta { bam_input: "Input bam file containing reads marked with tags for cell barcodes (CB), molecule barcodes (UB) and gene ids (GE)" - docker: "(optional) the docker image containing the runtime environment for this task" + warp_tools_docker_path: "(optional) the docker image containing the runtime environment for this task" machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" disk: "(optional) the amount of disk space (GiB) to provision for this task" @@ -144,7 +145,7 @@ task CalculateGeneMetrics { } runtime { - docker: docker + docker: warp_tools_docker_path memory: "${machine_mem_mb} MiB" disks: "local-disk ${disk} HDD" disk: disk + " GB" # TES diff --git a/tasks/skylab/RunEmptyDrops.wdl b/tasks/skylab/RunEmptyDrops.wdl index a0f60b1c99..0921393862 100644 --- a/tasks/skylab/RunEmptyDrops.wdl +++ b/tasks/skylab/RunEmptyDrops.wdl @@ -16,7 +16,7 @@ task RunEmptyDrops { Int emptydrops_lower = 100 # runtime values - String docker = "us.gcr.io/broad-gotc-prod/empty-drops:1.0.1-4.2" + String empty_drops_docker_path Int machine_mem_mb = 32000 Int cpu = 1 Int disk = 20 @@ -48,7 +48,7 @@ task RunEmptyDrops { } runtime { - docker: docker + docker: empty_drops_docker_path memory: "${machine_mem_mb} MiB" disks: "local-disk ${disk} HDD" disk: disk_size + " GB" # TES diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index 81f6668c42..e6ddc818f5 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -226,7 +226,7 @@ task 
STARsoloFastq { String? soloMultiMappers # runtime values - String docker = "us.gcr.io/broad-gotc-prod/star:1.0.1-2.7.11a-1692706072" + String star_docker_path Int machine_mem_mb = 64000 Int cpu = 8 # multiply input size by 2.2 to account for output bam file + 20% overhead, add size of reference. @@ -244,7 +244,7 @@ task STARsoloFastq { r2_fastq: "array of forward read FASTQ files" tar_star_reference: "star reference tarball built against the species that the bam_input is derived from" star_strand_mode: "STAR mode for handling stranded reads. Options are 'Forward', 'Reverse, or 'Unstranded'" - docker: "(optional) the docker image containing the runtime environment for this task" + star_docker_path: "(optional) the docker image containing the runtime environment for this task" machine_mem_mb: "(optional) the amount of memory (MiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" disk: "(optional) the amount of disk space (GiB) to provision for this task" @@ -432,7 +432,7 @@ task STARsoloFastq { >>> runtime { - docker: docker + docker: star_docker_path memory: "~{machine_mem_mb} MiB" disks: "local-disk ~{disk} HDD" disk: disk + " GB" # TES @@ -475,11 +475,12 @@ task MergeStarOutput { Array[File]? summary Array[File]? align_features Array[File]? umipercell - + String? 
counting_mode + String input_id #runtime values - String docker = "us.gcr.io/broad-gotc-prod/pytools:1.0.0-1661263730" + String warp_tools_docker_path Int machine_mem_gb = 20 Int cpu = 1 Int disk = ceil(size(matrix, "Gi") * 2) + 10 @@ -490,7 +491,7 @@ task MergeStarOutput { } parameter_meta { - docker: "(optional) the docker image containing the runtime environment for this task" + warp_tools_docker_path: "(optional) the docker image containing the runtime environment for this task" machine_mem_gb: "(optional) the amount of memory (GiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" disk: "(optional) the amount of disk space (GiB) to provision for this task" @@ -564,15 +565,18 @@ task MergeStarOutput { fi done - # If text files are present, create a tar archive with them + # If text files are present, create a tar archive with them and run python script to combine shard metrics if ls *.txt 1> /dev/null 2>&1; then + echo "listing files" + ls + python3 /warptools/scripts/combine_shard_metrics.py ~{input_id}_summary.txt ~{input_id}_align_features.txt ~{input_id}_cell_reads.txt ~{counting_mode} ~{input_id} tar -zcvf ~{input_id}.star_metrics.tar *.txt else echo "No text files found in the folder." fi # create the compressed raw count matrix with the counts, gene names and the barcodes - python3 /usr/gitc/create-merged-npz-output.py \ + python3 /warptools/scripts/create-merged-npz-output.py \ --barcodes ${barcodes_files[@]} \ --features ${features_files[@]} \ --matrix ${matrix_files[@]} \ @@ -580,7 +584,7 @@ task MergeStarOutput { >>> runtime { - docker: docker + docker: warp_tools_docker_path memory: "${machine_mem_gb} GiB" disks: "local-disk ${disk} HDD" disk: disk + " GB" # TES @@ -593,6 +597,7 @@ task MergeStarOutput { File col_index = "~{input_id}_sparse_counts_col_index.npy" File sparse_counts = "~{input_id}_sparse_counts.npz" File? cell_reads_out = "~{input_id}.star_metrics.tar" + File? 
library_metrics="~{input_id}_library_metrics.csv" } } @@ -717,6 +722,7 @@ task STARGenomeRefVersion { input { String tar_star_reference Int disk = 10 + String ubuntu_docker_path } meta { @@ -749,7 +755,7 @@ task STARGenomeRefVersion { } runtime { - docker: "gcr.io/gcp-runtimes/ubuntu_16_0_4:latest" + docker: ubuntu_docker_path memory: "2 GiB" disks: "local-disk ${disk} HDD" disk: disk + " GB" # TES diff --git a/verification/test-wdls/TestMultiSampleSmartSeq2SingleNucleus.wdl b/verification/test-wdls/TestMultiSampleSmartSeq2SingleNucleus.wdl index a09838c3a4..228b6b1f41 100644 --- a/verification/test-wdls/TestMultiSampleSmartSeq2SingleNucleus.wdl +++ b/verification/test-wdls/TestMultiSampleSmartSeq2SingleNucleus.wdl @@ -33,6 +33,8 @@ workflow TestMultiSampleSmartSeq2SingleNucleus { Boolean update_truth String vault_token_path String google_account_vault_path + + String cloud_provider } meta { @@ -57,7 +59,8 @@ workflow TestMultiSampleSmartSeq2SingleNucleus { species = species, organ = organ, input_name_metadata_field = input_name_metadata_field, - input_id_metadata_field = input_id_metadata_field + input_id_metadata_field = input_id_metadata_field, + cloud_provider = cloud_provider } diff --git a/verification/test-wdls/TestMultiome.wdl b/verification/test-wdls/TestMultiome.wdl index 9a4a0ec83a..6da047efcc 100644 --- a/verification/test-wdls/TestMultiome.wdl +++ b/verification/test-wdls/TestMultiome.wdl @@ -10,6 +10,7 @@ workflow TestMultiome { input { String input_id + String cloud_provider # Optimus Inputs String counting_mode = "sn_rna" @@ -85,7 +86,8 @@ workflow TestMultiome { chrom_sizes = chrom_sizes, atac_whitelist = atac_whitelist, run_cellbender = run_cellbender, - soloMultiMappers = soloMultiMappers + soloMultiMappers = soloMultiMappers, + cloud_provider = cloud_provider } diff --git a/verification/test-wdls/TestOptimus.wdl b/verification/test-wdls/TestOptimus.wdl index 82bdf03adc..51e34e04e9 100644 --- a/verification/test-wdls/TestOptimus.wdl +++ 
b/verification/test-wdls/TestOptimus.wdl @@ -59,6 +59,8 @@ workflow TestOptimus { String vault_token_path String google_account_vault_path + String cloud_provider + } meta { @@ -84,7 +86,8 @@ workflow TestOptimus { star_strand_mode = star_strand_mode, count_exons = count_exons, ignore_r1_read_length = ignore_r1_read_length, - soloMultiMappers = soloMultiMappers + soloMultiMappers = soloMultiMappers, + cloud_provider = cloud_provider } # Collect all of the pipeling output into single Array diff --git a/verification/test-wdls/TestSlideSeq.wdl b/verification/test-wdls/TestSlideSeq.wdl index b63cd87099..b0523fee21 100644 --- a/verification/test-wdls/TestSlideSeq.wdl +++ b/verification/test-wdls/TestSlideSeq.wdl @@ -26,6 +26,7 @@ workflow TestSlideSeq { Boolean update_truth String vault_token_path String google_account_vault_path + String cloud_provider } meta { @@ -43,7 +44,8 @@ workflow TestSlideSeq { annotations_gtf = annotations_gtf, output_bam_basename = output_bam_basename, count_exons = count_exons, - bead_locations = bead_locations + bead_locations = bead_locations, + cloud_provider = cloud_provider } diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 4f0750f35d..547bbeb5ac 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -8,7 +8,7 @@ slug: /Pipelines/ATAC/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [1.1.8](https://github.com/broadinstitute/warp/releases) | January, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [1.1.9](https://github.com/broadinstitute/warp/releases) | March, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ## Introduction to the ATAC workflow diff --git 
a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index 3409347d3f..511f27c285 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -8,7 +8,7 @@ slug: /Pipelines/Multiome_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [Multiome v3.2.0](https://github.com/broadinstitute/warp/releases) | February, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact the [WARP Pipeline Development team](mailto:warp-pipelines-help@broadinstitute.org) | +| [Multiome v3.2.2](https://github.com/broadinstitute/warp/releases) | March, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact the [WARP Pipeline Development team](mailto:warp-pipelines-help@broadinstitute.org) | ![Multiome_diagram](./multiome_diagram.png) @@ -56,6 +56,7 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | Input name | Description | Type | | --- | --- | --- | | input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String | +| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | | annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File | | gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. 
| Array[File] | | gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] | @@ -69,7 +70,7 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | ignore_r1_read_length | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should ignore barcode chemistry check; if "true", the workflow will not ensure the `10x_chemistry_version` input matches the chemistry in the read 1 FASTQ; default is "false". | Boolean | | star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". | String | | count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". | Boolean | -| gex_whitelist | Optional file containing the list of valid barcodes for 10x multiome GEX data; default is "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt". | File | +| gex_whitelist | Optional file containing the list of valid barcodes for 10x multiome GEX data; default is "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" when run on GCP. | File | | soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag. | String | | atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] | | atac_r2_fastq | Array of barcodes FASTQ files representing a single 10x multiome ATAC library. 
| Array[File] | diff --git a/website/docs/Pipelines/Optimus_Pipeline/README.md b/website/docs/Pipelines/Optimus_Pipeline/README.md index 382804e447..67a8ea0f7b 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/README.md +++ b/website/docs/Pipelines/Optimus_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Optimus_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [optimus_v6.4.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | February, 2024 | Elizabeth Kiernan | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [optimus_v6.4.2](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | March, 2024 | Elizabeth Kiernan | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ![Optimus_diagram](Optimus_diagram.png) @@ -85,6 +85,7 @@ The example configuration files also contain metadata for the reference files, d | Parameter name | Description | Optional attributes (when applicable) | | --- | --- | --- | +| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | | whitelist | List of known CBs; the workflow automatically selects the [10x Genomics](https://www.10xgenomics.com/) whitelist that corresponds to the v2 or v3 chemistry based on the input `tenx_chemistry_version`. A custom whitelist can also be provided if the input data was generated with a chemistry different from 10x Genomics v2 or v3. To use a custom whitelist, set the input `ignore_r1_read_length` to "true". | N/A | | read_struct | String describing the structure of reads; the workflow automatically selects the [10x Genomics](https://www.10xgenomics.com/) read structure that corresponds to the v2 or v3 chemistry based on the input `tenx_chemistry_version`. 
A custom read structure can also be provided if the input data was generated with a chemistry different from 10x Genomics v2 or v3. To use a custom read structure, set the input `force_no_check` to "true". | N/A | | tar_star_reference | TAR file containing a species-specific reference genome and GTF; it is generated using the [BuildIndices workflow](https://github.com/broadinstitute/warp/tree/master/pipelines/skylab/build_indices/BuildIndices.wdl). | N/A | diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index cc0114a766..40d588fb58 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/PairedTag_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [PairedTag_v0.1.0](https://github.com/broadinstitute/warp/releases) | February, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | +| [PairedTag_v0.2.1](https://github.com/broadinstitute/warp/releases) | March, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | ## Introduction to the Paired-Tag workflow diff --git a/website/docs/Pipelines/SlideSeq_Pipeline/README.md b/website/docs/Pipelines/SlideSeq_Pipeline/README.md index 0b59323acf..7cf8c08935 100644 --- a/website/docs/Pipelines/SlideSeq_Pipeline/README.md +++ b/website/docs/Pipelines/SlideSeq_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/SlideSeq_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [SlideSeq v3.1.0](https://github.com/broadinstitute/warp/releases) | February, 2024 | Elizabeth Kiernan & Kaylee Mathews | Please 
file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | +| [SlideSeq v3.1.2](https://github.com/broadinstitute/warp/releases) | March, 2024 | Elizabeth Kiernan & Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | ![SlideSeq_diagram](./slide-seq_diagram.png) @@ -69,6 +69,7 @@ The Slide-seq workflow inputs are specified in JSON configuration files. Example | output_bam_basename | Optional string used for the output BAM file basename. | String | | count_exons | Optional boolean indicating if the workflow should calculate exon counts; default is set to “true” and produces an h5ad file containing both whole-gene counts and exon counts in an additional layer; when set to “false”, an h5ad file containing only whole-gene counts is produced. | Boolean | | bead_locations | Whitelist TSV file containing bead barcodes and XY coordinates on a single line for each bead; determined by sequencing prior to mRNA transfer and library preparation. | File | +| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". 
| String | #### Pseudogene handling diff --git a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md index 09acab0beb..1613d69876 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md +++ b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [MultiSampleSmartSeq2SingleNuclei_v1.3.0](https://github.com/broadinstitute/warp/releases) | February, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [MultiSampleSmartSeq2SingleNuclei_v1.3.1](https://github.com/broadinstitute/warp/releases) | March, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ![](./snSS2.png) @@ -82,6 +82,7 @@ The table below details the Multi-snSS2 inputs. The pipeline is designed to take | species | Optional description of the species from which the cells were derived. | Array of strings | | input_name_metadata_field | Optional input describing, when applicable, the metadata field containing the `input_names`. | String | | input_id_metadata_field | Optional string describing, when applicable, the metadata field containing the `input_ids`. | String | +| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". 
| String | ## Multi-snSS2 tasks and tools diff --git a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md index 8ab56b15bd..a758e085cb 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md +++ b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md @@ -2,13 +2,13 @@ sidebar_position: 2 --- -# Smart-seq2 Single Nucleus Multi-Sample v1.3.0 Publication Methods +# Smart-seq2 Single Nucleus Multi-Sample v1.3.1 Publication Methods Below we provide an example methods section for a publication. For the complete pipeline documentation, see the [Smart-seq2 Single Nucleus Multi-Sample Overview](./README.md). ## Methods -Data preprocessing and count matrix construction for a batch (or plate) were performed using the Smart-seq2 Single Nucleus Multi-Sample v1.3.0 Pipeline (RRID:SCR_021312) as well as Picard v.2.26.10 with default tool parameters unless otherwise specified. Genomic references are publicly available in the [Broad References](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/mm10/v0/single_nucleus?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false) Google Bucket and are also listed in the [example workflow configuration](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_single_nucleus_multisample/mouse_example.json) in GitHub. +Data preprocessing and count matrix construction for a batch (or plate) were performed using the Smart-seq2 Single Nucleus Multi-Sample v1.3.1 Pipeline (RRID:SCR_021312) as well as Picard v.2.26.10 with default tool parameters unless otherwise specified. 
Genomic references are publicly available in the [Broad References](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/mm10/v0/single_nucleus?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false) Google Bucket and are also listed in the [example workflow configuration](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_single_nucleus_multisample/mouse_example.json) in GitHub. For each nucleus in the batch, paired-end FASTQ files were first trimmed to remove adapters using the fastq-mcf tool with a subsampling parameter of 200,000 reads. The trimmed FASTQ files were then aligned to the GENCODE GRCm38 mouse genome using STAR v.2.7.10a. To count the number of reads per gene, but not isoforms, the quantMode parameter was set to GeneCounts. Multi-mapped reads, and optical and PCR duplicates, were removed from the resulting aligned BAM using the Picard MarkDuplicates tool with REMOVE_DUPLICATES = true. Metrics were collected on the deduplicated BAM using Picard CollectMultipleMetrics with VALIDATION_STRINGENCY =SILENT. 
From 76762df2f12e9077add4e9c42c2f9903e034e122 Mon Sep 17 00:00:00 2001 From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Date: Thu, 14 Mar 2024 11:05:34 -0400 Subject: [PATCH 003/186] Ph pd 2514 multiome on terra (#1237) * ph logic to pass in docker images based on cloud provider * determine which whitelist files to use * update tests * add parameter metadata * add error handling in atac * fix comment * PR comments * update image and add utils * add import --------- Co-authored-by: phendriksen100 <103142505+phendriksen100@users.noreply.github.com> --- pipelines/skylab/multiome/Multiome.wdl | 36 ++++++++++-- pipelines/skylab/multiome/atac.json | 1 + pipelines/skylab/multiome/atac.wdl | 57 ++++++++++++++----- .../Plumbing/10k_pbmc_downsampled.json | 1 + .../test_inputs/Scientific/10k_pbmc.json | 1 + tasks/skylab/FastqProcessing.wdl | 9 +-- tasks/skylab/H5adUtils.wdl | 10 ++-- tasks/skylab/PairedTagUtils.wdl | 8 +-- 8 files changed, 88 insertions(+), 35 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 64aa671836..73ae9d9670 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -4,11 +4,13 @@ import "../../../pipelines/skylab/multiome/atac.wdl" as atac import "../../../pipelines/skylab/optimus/Optimus.wdl" as optimus import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "https://raw.githubusercontent.com/broadinstitute/CellBender/v0.3.0/wdl/cellbender_remove_background.wdl" as CellBender +import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { String pipeline_version = "3.2.2" input { + String cloud_provider String input_id String cloud_provider @@ -26,7 +28,6 @@ workflow Multiome { Boolean ignore_r1_read_length = false String star_strand_mode = "Forward" Boolean count_exons = false - File gex_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" String? 
soloMultiMappers # ATAC inputs @@ -34,7 +35,6 @@ workflow Multiome { Array[File] atac_r1_fastq Array[File] atac_r2_fastq Array[File] atac_r3_fastq - # BWA tar reference File tar_bwa_reference # Chromosone sizes @@ -42,17 +42,42 @@ workflow Multiome { # Trimadapters input String adapter_seq_read1 = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG" String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" - # Whitelist - File atac_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt" # CellBender Boolean run_cellbender = false } + # Determine docker prefix based on cloud provider + String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" + String acr_docker_prefix = "dsppipelinedev.azurecr.io/" + String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix + + # Define docker images + String snap_atac_docker_image = "snapatac2:1.0.5-2.3.2-1709230223" + + # Define all whitelist files + File gcp_gex_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" + File gcp_atac_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt" + File azure_gex_whitelist = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_gex.txt" + File azure_atac_whitelist = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_atac.txt" + + # Determine which whitelist files to use based on cloud provider + File gex_whitelist = if cloud_provider == "gcp" then gcp_gex_whitelist else azure_gex_whitelist + File atac_whitelist = if cloud_provider == "gcp" then gcp_atac_whitelist else azure_atac_whitelist + + # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. 
If not, raise an error + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'." + } + } + # Call the Optimus workflow call optimus.Optimus as Optimus { input: + cloud_provider = cloud_provider, counting_mode = counting_mode, r1_fastq = gex_r1_fastq, r2_fastq = gex_r2_fastq, @@ -76,6 +101,7 @@ workflow Multiome { # Call the ATAC workflow call atac.ATAC as Atac { input: + cloud_provider = cloud_provider, read1_fastq_gzipped = atac_r1_fastq, read2_fastq_gzipped = atac_r2_fastq, read3_fastq_gzipped = atac_r3_fastq, @@ -89,6 +115,7 @@ workflow Multiome { } call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes { input: + docker_path = docker_prefix + snap_atac_docker_image, atac_h5ad = Atac.snap_metrics, gex_h5ad = Optimus.h5ad_output_file, gex_whitelist = gex_whitelist, @@ -110,7 +137,6 @@ workflow Multiome { hardware_preemptible_tries = 2, hardware_zones = "us-central1-a us-central1-c", nvidia_driver_version = "470.82.01" - } } diff --git a/pipelines/skylab/multiome/atac.json b/pipelines/skylab/multiome/atac.json index a8b9465fdc..1e898edd48 100644 --- a/pipelines/skylab/multiome/atac.json +++ b/pipelines/skylab/multiome/atac.json @@ -4,6 +4,7 @@ "ATAC.TrimAdapters.adapter_seq_read1": "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG", "ATAC.TrimAdapters.adapter_seq_read2": "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG", "ATAC.input_id": "scATAC", + "ATAC.cloud_provider":"gcp", "ATAC.tar_bwa_reference": "gs://fc-dd55e131-ef49-4d02-aa2a-20640daaae1e/submissions/8f0dd71a-b42f-4503-b839-3f146941758a/IndexRef/53a91851-1f6c-4ab9-af66-b338ffb28b5a/call-BwaMem2Index/GRCh38.primary_assembly.genome.bwamem2.fa.tar", "ATAC.preindex": "false" } diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 0431ba3997..be597c1f62 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -3,6 
+3,7 @@ version 1.0 import "../../../tasks/skylab/MergeSortBam.wdl" as Merge import "../../../tasks/skylab/FastqProcessing.wdl" as FastqProcessing import "../../../tasks/skylab/PairedTagUtils.wdl" as AddBB +import "../../../tasks/broad/Utilities.wdl" as utils workflow ATAC { meta { @@ -18,6 +19,7 @@ workflow ATAC { # Output prefix/base name for all intermediate files and pipeline outputs String input_id + String cloud_provider # Option for running files with preindex Boolean preindex = false @@ -43,6 +45,26 @@ workflow ATAC { String pipeline_version = "1.1.9" + # Determine docker prefix based on cloud provider + String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" + String acr_docker_prefix = "dsppipelinedev.azurecr.io/" + String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix + + # Docker image names + String warp_tools_2_0_0 = "warp-tools:2.0.0" + String cutadapt_docker = "cutadapt:1.0.0-4.4-1709146458" + String samtools_docker = "samtools-dist-bwa:3.0.0" + String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311" + String snap_atac_docker = "snapatac2:1.0.4-2.3.1" + + # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'." 
+ } + } + parameter_meta { read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads" read2_fastq_gzipped: "read 2 FASTQ file as input for the pipeline, contains the cellular barcodes corresponding to the reads in the read1 FASTQ and read 3 FASTQ" @@ -52,7 +74,6 @@ workflow ATAC { num_threads_bwa: "Number of threads for bwa-mem2 task (default: 128)" mem_size_bwa: "Memory size in GB for bwa-mem2 task (default: 512)" cpu_platform_bwa: "CPU platform for bwa-mem2 task (default: Intel Ice Lake)" - } call GetNumSplits { @@ -69,7 +90,8 @@ workflow ATAC { barcodes_fastq = read2_fastq_gzipped, output_base_name = input_id, num_output_files = GetNumSplits.ranks_per_node_out, - whitelist = whitelist + whitelist = whitelist, + docker_path = docker_prefix + warp_tools_2_0_0 } scatter(idx in range(length(SplitFastq.fastq_R1_output_array))) { @@ -79,7 +101,8 @@ workflow ATAC { read3_fastq = SplitFastq.fastq_R3_output_array[idx], output_base_name = input_id + "_" + idx, adapter_seq_read1 = adapter_seq_read1, - adapter_seq_read3 = adapter_seq_read3 + adapter_seq_read3 = adapter_seq_read3, + docker_path = docker_prefix + cutadapt_docker } } @@ -91,21 +114,24 @@ workflow ATAC { output_base_name = input_id, nthreads = num_threads_bwa, mem_size = mem_size_bwa, - cpu_platform = cpu_platform_bwa + cpu_platform = cpu_platform_bwa, + docker_path = docker_prefix + samtools_docker } if (preindex) { call AddBB.AddBBTag as BBTag { input: bam = BWAPairedEndAlignment.bam_aligned_output, - input_id = input_id + input_id = input_id, + docker_path = docker_prefix + upstools_docker } call CreateFragmentFile as BB_fragment { input: bam = BBTag.bb_bam, chrom_sizes = chrom_sizes, annotations_gtf = annotations_gtf, - preindex = preindex + preindex = preindex, + docker_path = docker_prefix + snap_atac_docker } } if (!preindex) { @@ -114,7 +140,8 @@ workflow ATAC { bam = BWAPairedEndAlignment.bam_aligned_output, chrom_sizes = chrom_sizes, annotations_gtf = 
annotations_gtf, - preindex = preindex + preindex = preindex, + docker_path = docker_prefix + snap_atac_docker } } @@ -231,7 +258,7 @@ task TrimAdapters { # Runtime attributes/docker Int disk_size = ceil(2 * ( size(read1_fastq, "GiB") + size(read3_fastq, "GiB") )) + 200 Int mem_size = 4 - String docker_image = "us.gcr.io/broad-gotc-prod/cutadapt:1.0.0-4.4-1686752919" + String docker_path } parameter_meta { @@ -242,7 +269,7 @@ task TrimAdapters { adapter_seq_read1: "cutadapt option for the sequence adapter for read 1 fastq" adapter_seq_read3: "cutadapt option for the sequence adapter for read 3 fastq" output_base_name: "base name to be used for the output of the task" - docker_image: "the docker image using cutadapt to be used (default:us.gcr.io/broad-gotc-prod/cutadapt:1.0.0-4.4-1686752919)" + docker_path: "The docker image path containing the runtime environment for this task" mem_size: "the size of memory used during trimming adapters" disk_size : "disk size used in trimming adapters step" } @@ -269,7 +296,7 @@ task TrimAdapters { # use docker image for given tool cutadapat runtime { - docker: docker_image + docker: docker_path disks: "local-disk ${disk_size} HDD" memory: "${mem_size} GiB" } @@ -290,7 +317,7 @@ task BWAPairedEndAlignment { String read_group_sample_name = "RGSN1" String suffix = "trimmed_adapters.fastq.gz" String output_base_name - String docker_image = "us.gcr.io/broad-gotc-prod/samtools-dist-bwa:2.0.0" + String docker_path # Runtime attributes Int disk_size = 2000 @@ -309,7 +336,7 @@ task BWAPairedEndAlignment { mem_size: "the size of memory used during alignment" disk_size : "disk size used in bwa alignment step" output_base_name: "basename to be used for the output of the task" - docker_image: "the docker image using BWA to be used (default: us.gcr.io/broad-gotc-prod/samtools-bwa-mem-2:1.0.0-2.2.1_x64-linux-1685469504)" + docker_path: "The docker image path containing the runtime environment for this task" } String bam_aligned_output_name = 
output_base_name + ".bam" @@ -418,7 +445,7 @@ task BWAPairedEndAlignment { >>> runtime { - docker: docker_image + docker: docker_path disks: "local-disk ${disk_size} SSD" cpu: nthreads cpuPlatform: cpu_platform @@ -442,6 +469,7 @@ task CreateFragmentFile { Int mem_size = 16 Int nthreads = 1 String cpuPlatform = "Intel Cascade Lake" + String docker_path } String bam_base_name = basename(bam, ".bam") @@ -452,6 +480,7 @@ task CreateFragmentFile { chrom_sizes: "Text file containing chrom_sizes for genome build (i.e. hg38)." disk_size: "Disk size used in create fragment file step." mem_size: "The size of memory used in create fragment file." + docker_path: "The docker image path containing the runtime environment for this task" } command <<< @@ -492,7 +521,7 @@ task CreateFragmentFile { >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/snapatac2:1.0.4-2.3.1" + docker: docker_path disks: "local-disk ${disk_size} SSD" memory: "${mem_size} GiB" cpu: nthreads diff --git a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json index bd9b7a1172..c4a7d6d5d7 100644 --- a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json +++ b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json @@ -1,6 +1,7 @@ { "Multiome.annotations_gtf":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf", "Multiome.input_id":"10k_PBMC_downsampled", + "Multiome.cloud_provider":"gcp", "Multiome.gex_r1_fastq":[ "gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R1_gex.fastq.gz" ], diff --git a/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json b/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json index a5ddf2c947..3ca7b1d546 100644 --- a/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json +++ b/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json @@ -5,6 +5,7 @@ 
"gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_gex_S1_L002_I1_001.fastq.gz" ], "Multiome.input_id":"10k_PBMC", + "Multiome.cloud_provider":"gcp", "Multiome.gex_r1_fastq":[ "gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_gex_S1_L001_R1_001.fastq.gz", "gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_gex_S1_L002_R1_001.fastq.gz" diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index 939d1e1e12..bd6f9b06b7 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -244,10 +244,7 @@ task FastqProcessATAC { String output_base_name File whitelist String barcode_index1 = basename(barcodes_fastq[0]) - - # [?] copied from corresponding optimus wdl for fastqprocessing - # using the latest build of warp-tools in GCR - String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1" + String docker_path # Runtime attributes [?] Int mem_size = 5 @@ -273,7 +270,7 @@ task FastqProcessATAC { read_structure: "A string that specifies the barcode (C) positions in the Read 2 fastq" barcode_orientation: "A string that specifies the orientation of barcode needed for scATAC data. The default is FIRST_BP. Other options include LAST_BP, FIRST_BP_RC or LAST_BP_RC." 
whitelist: "10x genomics cell barcode whitelist for scATAC" - docker: "(optional) the docker image containing the runtime environment for this task" + docker_path: "The docker image path containing the runtime environment for this task" mem_size: "(optional) the amount of memory (MiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" disk_size: "(optional) the amount of disk space (GiB) to provision for this task" @@ -362,7 +359,7 @@ task FastqProcessATAC { >>> runtime { - docker: docker + docker: docker_path cpu: cpu memory: "${mem_size} MiB" disks: "local-disk ${disk_size} HDD" diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index 99ef957e4b..54a27de18f 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -184,7 +184,7 @@ task SingleNucleusOptimusH5adOutput { } task JoinMultiomeBarcodes { - input { + input { File atac_h5ad File atac_fragment File gex_h5ad @@ -196,9 +196,9 @@ task JoinMultiomeBarcodes { Int machine_mem_mb = ceil((size(atac_h5ad, "MiB") + size(gex_h5ad, "MiB") + size(atac_fragment, "MiB")) * 3) + 10000 Int disk = ceil((size(atac_h5ad, "GiB") + size(gex_h5ad, "GiB") + size(atac_fragment, "GiB")) * 5) + 10 } - String gex_base_name = basename(gex_h5ad, ".h5ad") - String atac_base_name = basename(atac_h5ad, ".h5ad") - String atac_fragment_base = basename(atac_fragment, ".tsv") + String gex_base_name = basename(gex_h5ad, ".h5ad") + String atac_base_name = basename(atac_h5ad, ".h5ad") + String atac_fragment_base = basename(atac_fragment, ".tsv") parameter_meta { atac_h5ad: "The resulting h5ad from the ATAC workflow." 
@@ -277,7 +277,7 @@ task JoinMultiomeBarcodes { >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/snapatac2:1.0.4-2.3.1-1700590229" + docker: docker_path disks: "local-disk ~{disk} HDD" memory: "${machine_mem_mb} MiB" cpu: nthreads diff --git a/tasks/skylab/PairedTagUtils.wdl b/tasks/skylab/PairedTagUtils.wdl index 779ac4fe57..3abc7df45a 100644 --- a/tasks/skylab/PairedTagUtils.wdl +++ b/tasks/skylab/PairedTagUtils.wdl @@ -130,9 +130,7 @@ task AddBBTag { input { File bam String input_id - - # using the latest build of upstools docker in GCR - String docker = "us.gcr.io/broad-gotc-prod/upstools:1.0.0-2023.03.03-1704300311" + String docker_path # Runtime attributes Int mem_size = 8 @@ -150,7 +148,7 @@ task AddBBTag { parameter_meta { bam: "BAM with aligned reads and barcode in the CB tag" input_id: "input ID" - docker: "(optional) the docker image containing the runtime environment for this task" + docker_path: "The docker image path containing the runtime environment for this task" mem_size: "(optional) the amount of memory (MiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" disk_size: "(optional) the amount of disk space (GiB) to provision for this task" @@ -169,7 +167,7 @@ task AddBBTag { >>> runtime { - docker: docker + docker: docker_path cpu: cpu memory: "${mem_size} GiB" disks: "local-disk ${disk_size} HDD" From a859091b8d0d99c8ce3afda48ced2dc5b0bd5861 Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 14 Mar 2024 11:25:42 -0400 Subject: [PATCH 004/186] lost the docker_path in joinmultiomebarcode task --- tasks/skylab/H5adUtils.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index 54a27de18f..f5e61243a5 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -195,6 +195,7 @@ task JoinMultiomeBarcodes { String cpuPlatform = "Intel Cascade Lake" Int machine_mem_mb = ceil((size(atac_h5ad, "MiB") + size(gex_h5ad, "MiB") + 
size(atac_fragment, "MiB")) * 3) + 10000 Int disk = ceil((size(atac_h5ad, "GiB") + size(gex_h5ad, "GiB") + size(atac_fragment, "GiB")) * 5) + 10 + String docker_path } String gex_base_name = basename(gex_h5ad, ".h5ad") String atac_base_name = basename(atac_h5ad, ".h5ad") From 831e2444d6e3d26b7156a565709f4ed4a10a4af0 Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 14 Mar 2024 11:42:41 -0400 Subject: [PATCH 005/186] update TestMultiome.wdl --- verification/test-wdls/TestMultiome.wdl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/verification/test-wdls/TestMultiome.wdl b/verification/test-wdls/TestMultiome.wdl index 6da047efcc..1bc9953637 100644 --- a/verification/test-wdls/TestMultiome.wdl +++ b/verification/test-wdls/TestMultiome.wdl @@ -26,7 +26,6 @@ workflow TestMultiome { Boolean ignore_r1_read_length = false String star_strand_mode = "Forward" Boolean count_exons = false - File gex_whitelist = "gs://broad-gotc-test-storage/Multiome/input/737K-arc-v1_gex.txt" String? soloMultiMappers # ATAC inputs @@ -43,8 +42,6 @@ workflow TestMultiome { # Trimadapters input String adapter_seq_read1 = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG" String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" - # Whitelist - File atac_whitelist = "gs://broad-gotc-test-storage/Multiome/input/737K-arc-v1_atac.txt" # These values will be determined and injected into the inputs by the scala test framework String truth_path @@ -76,7 +73,6 @@ workflow TestMultiome { ignore_r1_read_length = ignore_r1_read_length, star_strand_mode = star_strand_mode, count_exons = count_exons, - gex_whitelist = gex_whitelist, atac_r1_fastq = atac_r1_fastq, atac_r2_fastq = atac_r2_fastq, atac_r3_fastq = atac_r3_fastq, @@ -84,7 +80,6 @@ workflow TestMultiome { adapter_seq_read1 = adapter_seq_read1, adapter_seq_read3 = adapter_seq_read3, chrom_sizes = chrom_sizes, - atac_whitelist = atac_whitelist, run_cellbender = run_cellbender, soloMultiMappers = soloMultiMappers, cloud_provider = cloud_provider From 
53c7c2725d7b69b4b70bfd7a1498ffe389727e08 Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 14 Mar 2024 13:09:53 -0400 Subject: [PATCH 006/186] update cutadapt wdl --- pipelines/skylab/multiome/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index be597c1f62..4a36f1b95a 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -52,7 +52,7 @@ workflow ATAC { # Docker image names String warp_tools_2_0_0 = "warp-tools:2.0.0" - String cutadapt_docker = "cutadapt:1.0.0-4.4-1709146458" + String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919" String samtools_docker = "samtools-dist-bwa:3.0.0" String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311" String snap_atac_docker = "snapatac2:1.0.4-2.3.1" From cbc01c38cbdeac372ac5df392a848042a56f3e19 Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 14 Mar 2024 18:42:06 -0400 Subject: [PATCH 007/186] update cutadapt wdl --- pipelines/skylab/multiome/Multiome.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 236f5fd684..ebca6c2083 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -55,7 +55,7 @@ workflow Multiome { String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix # Define docker images - String snap_atac_docker_image = "snapatac2:1.0.5-2.3.2-1709230223" + String snap_atac_docker_image = "snapatac2:1.0.4-2.3.1-1700590229" # Define all whitelist files File gcp_gex_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" From f3b0294b470ecf3a01e7e8f07c83fffe66b843a0 Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 15 Mar 2024 09:50:43 -0400 Subject: [PATCH 008/186] try to fix changelog --- pipelines/skylab/paired_tag/PairedTag.changelog.md | 3 +-- 1 file 
changed, 1 insertion(+), 2 deletions(-) diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index 5eb6f52ac4..b97f823d6f 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,9 +1,8 @@ # 0.3.1 -2024-03-01 (Date of Last Commit) +2024-03-02 (Date of Last Commit) * Updated the Optimus.wdl to run on Azure. This change does not affect the PairedTag pipeline. # 0.3.0 - 2024-03-01 (Date of Last Commit) * Added the gene expression library-level metrics CSV as output of the Paired-tag pipeline; this is produced by the Optimus subworkflow From 66c9082429bfc3fd0fb6edff71fd1999067a56f9 Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 15 Mar 2024 09:54:47 -0400 Subject: [PATCH 009/186] try to fix changelog --- pipelines/skylab/paired_tag/PairedTag.changelog.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index b97f823d6f..0e4b60be07 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,8 +1,10 @@ # 0.3.1 2024-03-02 (Date of Last Commit) + * Updated the Optimus.wdl to run on Azure. This change does not affect the PairedTag pipeline. 
# 0.3.0 + 2024-03-01 (Date of Last Commit) * Added the gene expression library-level metrics CSV as output of the Paired-tag pipeline; this is produced by the Optimus subworkflow From e4fcd9918a8627b2614398ca6886c91ac2e90567 Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 15 Mar 2024 13:35:39 -0400 Subject: [PATCH 010/186] remove cloud provider --- pipelines/skylab/multiome/Multiome.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index ebca6c2083..9d9f257d15 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -13,7 +13,6 @@ workflow Multiome { input { String cloud_provider String input_id - String cloud_provider # Optimus Inputs String counting_mode = "sn_rna" From f3b97c81c2e5c1a6c724a6bf832ea0692cd75424 Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 18 Mar 2024 09:51:26 -0400 Subject: [PATCH 011/186] try adding sas token to azure public bucket --- pipelines/skylab/optimus/Optimus.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 689d99636b..feaa8d204a 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -76,8 +76,8 @@ workflow Optimus { # 10x parameters File gcp_whitelist_v2 = "gs://gcp-public-data--broad-references/RNA/resources/737k-august-2016.txt" File gcp_whitelist_v3 = "gs://gcp-public-data--broad-references/RNA/resources/3M-febrary-2018.txt" - File azure_whitelist_v2 = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/737k-august-2016.txt" - File azure_whitelist_v3 = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/3M-febrary-2018.txt" + File azure_whitelist_v2 = 
"https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/737k-august-2016.txt?sv=2020-04-08&si=prod&sr=c&sig=DQxmjB4D1lAfOW9AxIWbXwZx6ksbwjlNkixw597JnvQ%3D" + File azure_whitelist_v3 = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/3M-febrary-2018.txt?sv=2020-04-08&si=prod&sr=c&sig=DQxmjB4D1lAfOW9AxIWbXwZx6ksbwjlNkixw597JnvQ%3D" # Takes the first read1 FASTQ from the inputs to check for chemistry match File r1_single_fastq = r1_fastq[0] From c5f2af5feaeaed845dd8a81644c75b2e7770b10e Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 18 Mar 2024 09:52:24 -0400 Subject: [PATCH 012/186] try adding sas token to azure public bucket --- pipelines/skylab/multiome/Multiome.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 9d9f257d15..b933c43be4 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -59,8 +59,8 @@ workflow Multiome { # Define all whitelist files File gcp_gex_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" File gcp_atac_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt" - File azure_gex_whitelist = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_gex.txt" - File azure_atac_whitelist = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_atac.txt" + File azure_gex_whitelist = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_gex.txt?sv=2020-04-08&si=prod&sr=c&sig=DQxmjB4D1lAfOW9AxIWbXwZx6ksbwjlNkixw597JnvQ%3D" + File azure_atac_whitelist = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_atac.txt?sv=2020-04-08&si=prod&sr=c&sig=DQxmjB4D1lAfOW9AxIWbXwZx6ksbwjlNkixw597JnvQ%3D" # Determine which whitelist 
files to use based on cloud provider File gex_whitelist = if cloud_provider == "gcp" then gcp_gex_whitelist else azure_gex_whitelist From c5f462a77d8059ce8b7436e068149604dbe14af2 Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 18 Mar 2024 14:54:02 -0400 Subject: [PATCH 013/186] try files not strings --- tasks/skylab/FastqProcessing.wdl | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index bd6f9b06b7..de605692a2 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -236,9 +236,9 @@ task FastqProcessingSlidSeq { task FastqProcessATAC { input { - Array[String] read1_fastq - Array[String] read3_fastq - Array[String] barcodes_fastq + Array[File] read1_fastq + Array[File] read3_fastq + Array[File] barcodes_fastq String read_structure = "16C" String barcode_orientation = "FIRST_BP_RC" String output_base_name @@ -295,9 +295,18 @@ task FastqProcessATAC { echo $read1_fastq_files # Make downsample fq for barcode orientation check of R2 barcodes mkdir /cromwell_root/input_fastq - gcloud storage cp $read1_fastq_files /cromwell_root/input_fastq - gcloud storage cp $read2_fastq_files /cromwell_root/input_fastq - gcloud storage cp $read3_fastq_files /cromwell_root/input_fastq + mv $read1_fastq_files /cromwell_root/input_fastq + mv $read2_fastq_files /cromwell_root/input_fastq + mv $read3_fastq_files /cromwell_root/input_fastq + + #gcloud storage cp $read1_fastq_files /cromwell_root/input_fastq + #gcloud storage cp $read2_fastq_files /cromwell_root/input_fastq + #gcloud storage cp $read3_fastq_files /cromwell_root/input_fastq + + # Use azcopy to copy files from Azure Blob Storage + #azcopy copy $read1_fastq_files /cromwell_root/input_fastq #--recursive --from-to=BlobLocal --blob-type=BlockBlob --sas-token="~{azure_sas_token}" + #azcopy copy $read2_fastq_files /cromwell_root/input_fastq #--recursive --from-to=BlobLocal 
--blob-type=BlockBlob --sas-token="~{azure_sas_token}" + #azcopy copy $read3_fastq_files /cromwell_root/input_fastq #--recursive --from-to=BlobLocal --blob-type=BlockBlob --sas-token="~{azure_sas_token}" path="/cromwell_root/input_fastq/" barcode_index="~{barcode_index1}" From e70547920f9d132066fa8deb8498e26c1bcc53f3 Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 18 Mar 2024 15:38:17 -0400 Subject: [PATCH 014/186] try files not strings --- tasks/skylab/FastqProcessing.wdl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index de605692a2..c899901c3d 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -303,11 +303,6 @@ task FastqProcessATAC { #gcloud storage cp $read2_fastq_files /cromwell_root/input_fastq #gcloud storage cp $read3_fastq_files /cromwell_root/input_fastq - # Use azcopy to copy files from Azure Blob Storage - #azcopy copy $read1_fastq_files /cromwell_root/input_fastq #--recursive --from-to=BlobLocal --blob-type=BlockBlob --sas-token="~{azure_sas_token}" - #azcopy copy $read2_fastq_files /cromwell_root/input_fastq #--recursive --from-to=BlobLocal --blob-type=BlockBlob --sas-token="~{azure_sas_token}" - #azcopy copy $read3_fastq_files /cromwell_root/input_fastq #--recursive --from-to=BlobLocal --blob-type=BlockBlob --sas-token="~{azure_sas_token}" - path="/cromwell_root/input_fastq/" barcode_index="~{barcode_index1}" file="${path}${barcode_index}" From 11835ddcb4b72964f17deb5173815057cc1e1e9d Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 19 Mar 2024 08:56:16 -0400 Subject: [PATCH 015/186] remove cromwell root --- tasks/skylab/FastqProcessing.wdl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index c899901c3d..4a4aad4e1a 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -294,16 +294,16 @@ task 
FastqProcessATAC { echo $read1_fastq_files # Make downsample fq for barcode orientation check of R2 barcodes - mkdir /cromwell_root/input_fastq - mv $read1_fastq_files /cromwell_root/input_fastq - mv $read2_fastq_files /cromwell_root/input_fastq - mv $read3_fastq_files /cromwell_root/input_fastq + mkdir input_fastq + mv $read1_fastq_files input_fastq/ + mv $read2_fastq_files input_fastq/ + mv $read3_fastq_files input_fastq/ #gcloud storage cp $read1_fastq_files /cromwell_root/input_fastq #gcloud storage cp $read2_fastq_files /cromwell_root/input_fastq #gcloud storage cp $read3_fastq_files /cromwell_root/input_fastq - path="/cromwell_root/input_fastq/" + path="input_fastq/" barcode_index="~{barcode_index1}" file="${path}${barcode_index}" zcat "$file" | sed -n '2~4p' | shuf -n 1000 > downsample.fq @@ -313,7 +313,7 @@ task FastqProcessATAC { for fastq in "${FASTQ2_ARRAY[@]}" do BASE=`basename $fastq` - BASE=`echo --R1 /cromwell_root/input_fastq/$BASE` + BASE=`echo --R1 input_fastq/$BASE` R1_FILES_CONCAT+="$BASE " done echo $R1_FILES_CONCAT @@ -323,7 +323,7 @@ task FastqProcessATAC { for fastq in "${FASTQ1_ARRAY[@]}" do BASE=`basename $fastq` - BASE=`echo --R2 /cromwell_root/input_fastq/$BASE` + BASE=`echo --R2 /input_fastq/$BASE` R2_FILES_CONCAT+="$BASE " done echo $R2_FILES_CONCAT @@ -333,7 +333,7 @@ task FastqProcessATAC { for fastq in "${FASTQ3_ARRAY[@]}" do BASE=`basename $fastq` - BASE=`echo --R3 /cromwell_root/input_fastq/$BASE` + BASE=`echo --R3 /input_fastq/$BASE` R3_FILES_CONCAT+="$BASE " done echo $R3_FILES_CONCAT @@ -346,8 +346,8 @@ task FastqProcessATAC { # Call fastq process # outputs fastq files where the corrected barcode is in the read name - mkdir /cromwell_root/output_fastq - cd /cromwell_root/output_fastq + mkdir output_fastq/ + cd /output_fastq fastqprocess \ --num-output-files ~{num_output_files} \ From a82926083378d0a431dbfe46e9e4a33fa4caa8c2 Mon Sep 17 00:00:00 2001 From: kayleemathews Date: Tue, 19 Mar 2024 10:48:28 -0400 Subject: [PATCH 
016/186] update docs --- website/docs/Pipelines/ATAC/README.md | 1 + website/docs/Pipelines/Multiome_Pipeline/README.md | 4 +--- website/docs/Pipelines/Optimus_Pipeline/README.md | 2 +- website/docs/Pipelines/PairedTag_Pipeline/README.md | 2 +- website/docs/Pipelines/SlideSeq_Pipeline/README.md | 2 +- .../Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md | 2 +- .../multi_snss2.methods.md | 4 ++-- 7 files changed, 8 insertions(+), 9 deletions(-) diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 547bbeb5ac..abe0d8c91c 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -50,6 +50,7 @@ The following describes the inputs of the ATAC workflow. For more details on how | read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | | read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | | input_id | Output prefix/base name for all intermediate files and pipeline outputs. | +| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | | preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | | tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). | | num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). 
| diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index 085369d5a0..3f7e5a04f3 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -8,7 +8,7 @@ slug: /Pipelines/Multiome_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [Multiome v3.3.0](https://github.com/broadinstitute/warp/releases) | February, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact the [WARP Pipeline Development team](mailto:warp-pipelines-help@broadinstitute.org) | +| [Multiome v3.3.1](https://github.com/broadinstitute/warp/releases) | March, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact the [WARP Pipeline Development team](mailto:warp-pipelines-help@broadinstitute.org) | ![Multiome_diagram](./multiome_diagram.png) @@ -70,7 +70,6 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | ignore_r1_read_length | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should ignore barcode chemistry check; if "true", the workflow will not ensure the `10x_chemistry_version` input matches the chemistry in the read 1 FASTQ; default is "false". | Boolean | | star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". | String | | count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". 
| Boolean | -| gex_whitelist | Optional file containing the list of valid barcodes for 10x multiome GEX data; default is "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" when run on GCP. | File | | soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag. | String | | atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] | | atac_r2_fastq | Array of barcodes FASTQ files representing a single 10x multiome ATAC library. | Array[File] | @@ -79,7 +78,6 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | chrom_sizes | File containing the genome chromosome sizes; used to calculate ATAC fragment file metrics. | File | | adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG". | String | | adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG". | String | -| atac_whitelist | Optional file containing the list of valid barcodes for 10x multiome ATAC adata; default is "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt". | File | | run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". 
| Boolean | #### Sample inputs for analyses in a Terra Workspace diff --git a/website/docs/Pipelines/Optimus_Pipeline/README.md b/website/docs/Pipelines/Optimus_Pipeline/README.md index 843fc23220..8a79e553af 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/README.md +++ b/website/docs/Pipelines/Optimus_Pipeline/README.md @@ -8,7 +8,7 @@ slug: /Pipelines/Optimus_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [optimus_v6.5.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | February, 2024 | Elizabeth Kiernan | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [optimus_v6.5.1](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | March, 2024 | Elizabeth Kiernan | Please file GitHub issues in warp or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ![Optimus_diagram](Optimus_diagram.png) diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index d90eb1f309..d0f5b42e47 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -8,7 +8,7 @@ slug: /Pipelines/PairedTag_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [PairedTag_v0.3.0](https://github.com/broadinstitute/warp/releases) | February, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | +| [PairedTag_v0.3.1](https://github.com/broadinstitute/warp/releases) | March, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | ## Introduction to the Paired-Tag workflow diff --git 
a/website/docs/Pipelines/SlideSeq_Pipeline/README.md b/website/docs/Pipelines/SlideSeq_Pipeline/README.md index 153a8656ae..c56f4064d3 100644 --- a/website/docs/Pipelines/SlideSeq_Pipeline/README.md +++ b/website/docs/Pipelines/SlideSeq_Pipeline/README.md @@ -8,7 +8,7 @@ slug: /Pipelines/SlideSeq_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [SlideSeq v3.1.2](https://github.com/broadinstitute/warp/releases) | February, 2024 | Elizabeth Kiernan & Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | +| [SlideSeq v3.1.3](https://github.com/broadinstitute/warp/releases) | March, 2024 | Elizabeth Kiernan & Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | ![SlideSeq_diagram](./slide-seq_diagram.png) diff --git a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md index 25a5426fe7..11983e5187 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md +++ b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [MultiSampleSmartSeq2SingleNuclei_v1.3.1](https://github.com/broadinstitute/warp/releases) | March, 2024 | [Elizabeth Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | +| [MultiSampleSmartSeq2SingleNuclei_v1.3.2](https://github.com/broadinstitute/warp/releases) | March, 2024 | [Elizabeth 
Kiernan](mailto:ekiernan@broadinstitute.org) | Please file GitHub issues in WARP or contact [the WARP team](mailto:warp-pipelines-help@broadinstitute.org) | ![](./snSS2.png) diff --git a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md index a758e085cb..03133f3ce4 100644 --- a/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md +++ b/website/docs/Pipelines/Smart-seq2_Single_Nucleus_Multi_Sample_Pipeline/multi_snss2.methods.md @@ -2,13 +2,13 @@ sidebar_position: 2 --- -# Smart-seq2 Single Nucleus Multi-Sample v1.3.1 Publication Methods +# Smart-seq2 Single Nucleus Multi-Sample v1.3.2 Publication Methods Below we provide an example methods section for a publication. For the complete pipeline documentation, see the [Smart-seq2 Single Nucleus Multi-Sample Overview](./README.md). ## Methods -Data preprocessing and count matrix construction for a batch (or plate) were performed using the Smart-seq2 Single Nucleus Multi-Sample v1.3.1 Pipeline (RRID:SCR_021312) as well as Picard v.2.26.10 with default tool parameters unless otherwise specified. Genomic references are publicly available in the [Broad References](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/mm10/v0/single_nucleus?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false) Google Bucket and are also listed in the [example workflow configuration](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_single_nucleus_multisample/mouse_example.json) in GitHub. +Data preprocessing and count matrix construction for a batch (or plate) were performed using the Smart-seq2 Single Nucleus Multi-Sample v1.3.2 Pipeline (RRID:SCR_021312) as well as Picard v.2.26.10 with default tool parameters unless otherwise specified. 
Genomic references are publicly available in the [Broad References](https://console.cloud.google.com/storage/browser/gcp-public-data--broad-references/mm10/v0/single_nucleus?pageState=(%22StorageObjectListTable%22:(%22f%22:%22%255B%255D%22))&prefix=&forceOnObjectsSortingFiltering=false) Google Bucket and are also listed in the [example workflow configuration](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/smartseq2_single_nucleus_multisample/mouse_example.json) in GitHub. For each nucleus in the batch, paired-end FASTQ files were first trimmed to remove adapters using the fastq-mcf tool with a subsampling parameter of 200,000 reads. The trimmed FASTQ files were then aligned to the GENCODE GRCm38 mouse genome using STAR v.2.7.10a. To count the number of reads per gene, but not isoforms, the quantMode parameter was set to GeneCounts. Multi-mapped reads, and optical and PCR duplicates, were removed from the resulting aligned BAM using the Picard MarkDuplicates tool with REMOVE_DUPLICATES = true. Metrics were collected on the deduplicated BAM using Picard CollectMultipleMetrics with VALIDATION_STRINGENCY =SILENT. 
From 68321e06f78f9e038e27baa07d3d3642f16b6cb7 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 19 Mar 2024 10:57:04 -0400 Subject: [PATCH 017/186] quote whitelist --- tasks/skylab/FastqProcessing.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index 4a4aad4e1a..017ff02d8a 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -338,7 +338,7 @@ task FastqProcessATAC { done echo $R3_FILES_CONCAT - python3 /warptools/scripts/dynamic-barcode-orientation.py downsample.fq ~{whitelist} best_match.txt + python3 /warptools/scripts/dynamic-barcode-orientation.py downsample.fq "~{whitelist}" best_match.txt cat best_match.txt barcode_choice=$( Date: Tue, 19 Mar 2024 11:29:57 -0400 Subject: [PATCH 018/186] quote whitelist --- tasks/skylab/FastqProcessing.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index 017ff02d8a..7bb6dc42fd 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -347,7 +347,7 @@ task FastqProcessATAC { # Call fastq process # outputs fastq files where the corrected barcode is in the read name mkdir output_fastq/ - cd /output_fastq + cd output_fastq/ fastqprocess \ --num-output-files ~{num_output_files} \ From 96ca1609943c0f828cea9710d3ecc3a9ac222f39 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 19 Mar 2024 11:52:38 -0400 Subject: [PATCH 019/186] dirs --- tasks/skylab/FastqProcessing.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index 7bb6dc42fd..90cd0732fb 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -323,7 +323,7 @@ task FastqProcessATAC { for fastq in "${FASTQ1_ARRAY[@]}" do BASE=`basename $fastq` - BASE=`echo --R2 /input_fastq/$BASE` + BASE=`echo --R2 
input_fastq/$BASE` R2_FILES_CONCAT+="$BASE " done echo $R2_FILES_CONCAT @@ -333,7 +333,7 @@ task FastqProcessATAC { for fastq in "${FASTQ3_ARRAY[@]}" do BASE=`basename $fastq` - BASE=`echo --R3 /input_fastq/$BASE` + BASE=`echo --R3 input_fastq/$BASE` R3_FILES_CONCAT+="$BASE " done echo $R3_FILES_CONCAT @@ -371,8 +371,8 @@ task FastqProcessATAC { } output { - Array[File] fastq_R1_output_array = glob("/cromwell_root/output_fastq/fastq_R1_*") - Array[File] fastq_R3_output_array = glob("/cromwell_root/output_fastq/fastq_R3_*") + Array[File] fastq_R1_output_array = glob("output_fastq/fastq_R1_*") + Array[File] fastq_R3_output_array = glob("output_fastq/fastq_R3_*") } } From bd24093da53f2d4249dc3df4e39b7d860f4b7b3e Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 19 Mar 2024 13:01:06 -0400 Subject: [PATCH 020/186] dirs --- .../multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json | 3 +-- tasks/skylab/FastqProcessing.wdl | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json index c4a7d6d5d7..6e44d0c7c1 100644 --- a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json +++ b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json @@ -24,6 +24,5 @@ "Multiome.Atac.cpu_platform_bwa":"Intel Cascade Lake", "Multiome.Atac.num_threads_bwa":"16", "Multiome.Atac.mem_size_bwa":"64", - "Multiome.soloMultiMappers":"Uniform", - "Multiome.cloud_provider":"gcp" + "Multiome.soloMultiMappers":"Uniform" } diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index 90cd0732fb..90011dfa8a 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -352,9 +352,9 @@ task FastqProcessATAC { fastqprocess \ --num-output-files ~{num_output_files} \ --sample-id "~{output_base_name}" \ - $R1_FILES_CONCAT \ - $R2_FILES_CONCAT \ - 
$R3_FILES_CONCAT \ + ../$R1_FILES_CONCAT \ + ../$R2_FILES_CONCAT \ + ../$R3_FILES_CONCAT \ --white-list "~{whitelist}" \ --output-format "FASTQ" \ --barcode-orientation $barcode_choice \ From 68c0e8924388d68d96e03d555140d75058b69882 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 19 Mar 2024 18:46:59 -0400 Subject: [PATCH 021/186] dirs --- tasks/skylab/FastqProcessing.wdl | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index 90011dfa8a..c7ae558cf2 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -346,15 +346,13 @@ task FastqProcessATAC { # Call fastq process # outputs fastq files where the corrected barcode is in the read name - mkdir output_fastq/ - cd output_fastq/ fastqprocess \ --num-output-files ~{num_output_files} \ --sample-id "~{output_base_name}" \ - ../$R1_FILES_CONCAT \ - ../$R2_FILES_CONCAT \ - ../$R3_FILES_CONCAT \ + $R1_FILES_CONCAT \ + $R2_FILES_CONCAT \ + $R3_FILES_CONCAT \ --white-list "~{whitelist}" \ --output-format "FASTQ" \ --barcode-orientation $barcode_choice \ @@ -371,8 +369,8 @@ task FastqProcessATAC { } output { - Array[File] fastq_R1_output_array = glob("output_fastq/fastq_R1_*") - Array[File] fastq_R3_output_array = glob("output_fastq/fastq_R3_*") + Array[File] fastq_R1_output_array = glob("fastq_R1_*") + Array[File] fastq_R3_output_array = glob("fastq_R3_*") } } From 22510d0cba5dd914bbbf05c592bd36c70e119e5d Mon Sep 17 00:00:00 2001 From: npetrill Date: Wed, 20 Mar 2024 12:36:25 -0400 Subject: [PATCH 022/186] add quotes to whitelist --- tasks/skylab/StarAlign.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index 69a6851ec7..253040b820 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -299,7 +299,7 @@ task STARsoloFastq { --genomeDir genome_reference \ --readFilesIn "~{sep=',' r2_fastq}" 
"~{sep=',' r1_fastq}" \ --readFilesCommand "gunzip -c" \ - --soloCBwhitelist ~{white_list} \ + --soloCBwhitelist "~{white_list}" \ --soloUMIlen $UMILen --soloCBlen $CBLen \ --soloFeatures $COUNTING_MODE \ --clipAdapterType CellRanger4 \ @@ -325,7 +325,7 @@ task STARsoloFastq { --genomeDir genome_reference \ --readFilesIn "~{sep=',' r2_fastq}" "~{sep=',' r1_fastq}" \ --readFilesCommand "gunzip -c" \ - --soloCBwhitelist ~{white_list} \ + --soloCBwhitelist "~{white_list}" \ --soloUMIlen $UMILen --soloCBlen $CBLen \ --soloFeatures $COUNTING_MODE \ --clipAdapterType CellRanger4 \ @@ -347,7 +347,7 @@ task STARsoloFastq { --genomeDir genome_reference \ --readFilesIn "~{sep=',' r2_fastq}" "~{sep=',' r1_fastq}" \ --readFilesCommand "gunzip -c" \ - --soloCBwhitelist ~{white_list} \ + --soloCBwhitelist "~{white_list}" \ --soloUMIlen $UMILen --soloCBlen $CBLen \ --soloFeatures $COUNTING_MODE \ --clipAdapterType CellRanger4 \ From 208d14b5777145c13382401ee48a2da91be1e717 Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 21 Mar 2024 08:56:46 -0400 Subject: [PATCH 023/186] mkdir cromwell_root --- tasks/skylab/StarAlign.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index 253040b820..5874ff7035 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -375,6 +375,8 @@ task STARsoloFastq { touch Summary_sn_rna.csv touch UMIperCellSorted_sn_rna.txt + mkdir /cromwell_root + if [[ "~{counting_mode}" == "sc_rna" ]] then From cb9bbdbfb07c047eab016a13f785d720458e052a Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 21 Mar 2024 11:39:15 -0400 Subject: [PATCH 024/186] try vm family --- pipelines/skylab/multiome/atac.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 4a36f1b95a..5899243683 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -450,6 +450,7 @@ task BWAPairedEndAlignment { 
cpu: nthreads cpuPlatform: cpu_platform memory: "${mem_size} GiB" + vm_size: "Standard_E64ds_v4" } output { From b32c3fbe03875cf4776d3619013b6ba4f06ba5ac Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 21 Mar 2024 11:51:03 -0400 Subject: [PATCH 025/186] try vm family --- pipelines/skylab/multiome/atac.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 5899243683..21a4d6c9b6 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -234,6 +234,7 @@ task GetNumSplits { cpu: nthreads cpuPlatform: cpu_platform memory: "${mem_size} GiB" + vm_size: "Standard_E64ds_v4" } output { From 252bec6bc5778fe947487ecb90c11944bc33b61b Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 21 Mar 2024 11:55:10 -0400 Subject: [PATCH 026/186] Standard_M128s --- pipelines/skylab/multiome/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 21a4d6c9b6..8a9535cbb1 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -234,7 +234,7 @@ task GetNumSplits { cpu: nthreads cpuPlatform: cpu_platform memory: "${mem_size} GiB" - vm_size: "Standard_E64ds_v4" + vm_size: "Standard_M128s" } output { @@ -451,7 +451,7 @@ task BWAPairedEndAlignment { cpu: nthreads cpuPlatform: cpu_platform memory: "${mem_size} GiB" - vm_size: "Standard_E64ds_v4" + vm_size: "Standard_M128s" } output { From 5b6d3d31e1745cde2c4756ba02562f42e03520e7 Mon Sep 17 00:00:00 2001 From: npetrill Date: Wed, 27 Mar 2024 09:31:04 -0400 Subject: [PATCH 027/186] try not to use cromwell root --- tasks/skylab/StarAlign.wdl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index 5874ff7035..b71ae1eb5d 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -396,8 +396,11 @@ task 
STARsoloFastq { then SoloDirectory="Solo.out/GeneFull_Ex50pAS/raw" echo "SoloDirectory is $SoloDirectory" - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + echo "list matrix files in $SoloDirectory" + ls "$SoloDirectory"/*.mtx + mv "Solo.out/GeneFull_Ex50pAS/raw/*.mtx" matrix.mtx mv "Solo.out/GeneFull_Ex50pAS/raw/barcodes.tsv" barcodes.tsv mv "Solo.out/GeneFull_Ex50pAS/raw/features.tsv" features.tsv mv "Solo.out/GeneFull_Ex50pAS/CellReads.stats" CellReads.stats From 4024e61beaffeff543b37699d9d1ef99c8ca45e2 Mon Sep 17 00:00:00 2001 From: npetrill Date: Wed, 27 Mar 2024 15:17:34 -0400 Subject: [PATCH 028/186] try not to use cromwell root --- tasks/skylab/StarAlign.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index b71ae1eb5d..01122bc77d 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -400,7 +400,7 @@ task STARsoloFastq { #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ echo "list matrix files in $SoloDirectory" ls "$SoloDirectory"/*.mtx - mv "Solo.out/GeneFull_Ex50pAS/raw/*.mtx" matrix.mtx + mv "Solo.out/GeneFull_Ex50pAS/raw/matrix.mtx" matrix.mtx mv "Solo.out/GeneFull_Ex50pAS/raw/barcodes.tsv" barcodes.tsv mv "Solo.out/GeneFull_Ex50pAS/raw/features.tsv" features.tsv mv "Solo.out/GeneFull_Ex50pAS/CellReads.stats" CellReads.stats From f6827c303c79e98be4cca1d78ba234aabd87dffa Mon Sep 17 00:00:00 2001 From: npetrill Date: Wed, 27 Mar 2024 15:53:11 -0400 Subject: [PATCH 029/186] try using logic --- pipelines/skylab/multiome/atac.wdl | 27 
++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 8a9535cbb1..bf09d3564d 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -115,7 +115,8 @@ workflow ATAC { nthreads = num_threads_bwa, mem_size = mem_size_bwa, cpu_platform = cpu_platform_bwa, - docker_path = docker_prefix + samtools_docker + docker_path = docker_prefix + samtools_docker, + cloud_provider = cloud_provider } if (preindex) { @@ -319,6 +320,7 @@ task BWAPairedEndAlignment { String suffix = "trimmed_adapters.fastq.gz" String output_base_name String docker_path + String cloud_provider # Runtime attributes Int disk_size = 2000 @@ -338,6 +340,7 @@ task BWAPairedEndAlignment { disk_size : "disk size used in bwa alignment step" output_base_name: "basename to be used for the output of the task" docker_path: "The docker image path containing the runtime environment for this task" + cloud_provider: "The cloud provider for the pipeline." 
} String bam_aligned_output_name = output_base_name + ".bam" @@ -436,13 +439,27 @@ task BWAPairedEndAlignment { # rename file to this mv final.sorted.bam ~{bam_aligned_output_name} + echo "the present working dir" + pwd + # save output logs for bwa-mem2 mkdir output_logs mv *txt output_logs - tar -zcvf /cromwell_root/output_distbwa_log.tar.gz output_logs - - # move bam file to /cromwell_root - mv ~{bam_aligned_output_name} /cromwell_root + + if [ "~{cloud_provider}" == "gcp" ]; then + tar -zcvf /cromwell_root/output_distbwa_log.tar.gz output_logs + else + tar -zcvf /cromwell-executions/output_distbwa_log.tar.gz output_logs + fi + + # move bam file to the root of cromwell + # if the cloud provider is azure, move the file to /cromwell-executions + # if the cloud provider is gcp, move the file to /cromwell_root + if [ "~{cloud_provider}" == "gcp" ]; then + mv ~{bam_aligned_output_name} /cromwell_root + else + mv ~{bam_aligned_output_name} /cromwell-executions + fi >>> runtime { From 32a83fc358f8aa779428141a70de51a0b4c5e0c6 Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 28 Mar 2024 11:45:34 -0400 Subject: [PATCH 030/186] try using logic --- pipelines/skylab/multiome/atac.wdl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index bf09d3564d..ecd2ef51b2 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -444,21 +444,23 @@ task BWAPairedEndAlignment { # save output logs for bwa-mem2 mkdir output_logs - mv *txt output_logs + mv *.txt output_logs if [ "~{cloud_provider}" == "gcp" ]; then - tar -zcvf /cromwell_root/output_distbwa_log.tar.gz output_logs + tar -zcvf output_distbwa_log.tar.gz output_logs + mv output_distbwa_log.tar.gz ../ else - tar -zcvf /cromwell-executions/output_distbwa_log.tar.gz output_logs + tar -zcvf output_distbwa_log.tar.gz output_logs + mv output_distbwa_log.tar.gz ../ fi # move bam file to the root of 
cromwell # if the cloud provider is azure, move the file to /cromwell-executions # if the cloud provider is gcp, move the file to /cromwell_root if [ "~{cloud_provider}" == "gcp" ]; then - mv ~{bam_aligned_output_name} /cromwell_root + mv ~{bam_aligned_output_name} ../ else - mv ~{bam_aligned_output_name} /cromwell-executions + mv ~{bam_aligned_output_name} ../ fi >>> From 4b7a903062f1db4728b08998ae3fd12918c524ad Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 28 Mar 2024 15:09:21 -0400 Subject: [PATCH 031/186] update snapatac2 docker --- pipelines/skylab/multiome/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index ecd2ef51b2..a6bed6d787 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -55,7 +55,7 @@ workflow ATAC { String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919" String samtools_docker = "samtools-dist-bwa:3.0.0" String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311" - String snap_atac_docker = "snapatac2:1.0.4-2.3.1" + String snap_atac_docker = "snapatac2:1.0.5-2.3.2-1709230223" # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. 
If not, raise an error if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { From aa1d23be57cd5b764d81849be41cb9682c57756c Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 28 Mar 2024 16:11:22 -0400 Subject: [PATCH 032/186] remove mkdir cromwell root --- tasks/skylab/StarAlign.wdl | 3 --- 1 file changed, 3 deletions(-) diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index 01122bc77d..2bff800626 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -375,9 +375,6 @@ task STARsoloFastq { touch Summary_sn_rna.csv touch UMIperCellSorted_sn_rna.txt - mkdir /cromwell_root - - if [[ "~{counting_mode}" == "sc_rna" ]] then SoloDirectory="Solo.out/Gene/raw" From d826b0a0c6bef6ae5ac30da72669ebe8551f575a Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 29 Mar 2024 08:04:07 -0400 Subject: [PATCH 033/186] snap dpcker --- pipelines/skylab/multiome/Multiome.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index b933c43be4..2cd2af2988 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -54,7 +54,7 @@ workflow Multiome { String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix # Define docker images - String snap_atac_docker_image = "snapatac2:1.0.4-2.3.1-1700590229" + String snap_atac_docker_image = "snapatac2:1.0.5-2.3.2-1709230223" # Define all whitelist files File gcp_gex_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" From a57225d9c36ce08bd9cf2678ea01e41bb78c5e0c Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 29 Mar 2024 09:18:41 -0400 Subject: [PATCH 034/186] fix snap dpcker --- pipelines/skylab/multiome/Multiome.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 2cd2af2988..b933c43be4 
100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -54,7 +54,7 @@ workflow Multiome { String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix # Define docker images - String snap_atac_docker_image = "snapatac2:1.0.5-2.3.2-1709230223" + String snap_atac_docker_image = "snapatac2:1.0.4-2.3.1-1700590229" # Define all whitelist files File gcp_gex_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" From 9012509b3b859cd356535fe4a64b6112e295f8f3 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 2 Apr 2024 13:32:53 -0400 Subject: [PATCH 035/186] fix starsolo fastq for other flavors of optimus --- tasks/skylab/StarAlign.wdl | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index 2bff800626..d2148ad128 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -379,8 +379,11 @@ task STARsoloFastq { then SoloDirectory="Solo.out/Gene/raw" echo "SoloDirectory is $SoloDirectory" - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + echo "list matrix files in $SoloDirectory" + ls "$SoloDirectory"/*.mtx + mv $SoloDirectory/matrix.mtx matrix.mtx mv "Solo.out/Gene/raw/barcodes.tsv" barcodes.tsv mv "Solo.out/Gene/raw/features.tsv" features.tsv mv "Solo.out/Gene/CellReads.stats" CellReads.stats @@ -397,7 +400,7 @@ task STARsoloFastq { #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ echo "list matrix files in 
$SoloDirectory" ls "$SoloDirectory"/*.mtx - mv "Solo.out/GeneFull_Ex50pAS/raw/matrix.mtx" matrix.mtx + mv $SoloDirectory/matrix.mtx matrix.mtx mv "Solo.out/GeneFull_Ex50pAS/raw/barcodes.tsv" barcodes.tsv mv "Solo.out/GeneFull_Ex50pAS/raw/features.tsv" features.tsv mv "Solo.out/GeneFull_Ex50pAS/CellReads.stats" CellReads.stats @@ -407,12 +410,18 @@ task STARsoloFastq { else SoloDirectory="Solo.out/GeneFull_Ex50pAS/raw" echo "SoloDirectory is $SoloDirectory" - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + echo "list matrix files in $SoloDirectory" + ls "$SoloDirectory"/*.mtx + mv $SoloDirectory/matrix.mtx matrix.mtx SoloDirectory="Solo.out/Gene/raw" echo "SoloDirectory is $SoloDirectory" - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} sh -c 'new_name="$(basename {} .mtx)_sn_rna.mtx"; echo mv {} "/cromwell_root/$new_name"' - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} sh -c 'new_name="$(basename {} .mtx)_sn_rna.mtx"; mv {} "/cromwell_root/$new_name"' + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} sh -c 'new_name="$(basename {} .mtx)_sn_rna.mtx"; echo mv {} "/cromwell_root/$new_name"' + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} sh -c 'new_name="$(basename {} .mtx)_sn_rna.mtx"; mv {} "/cromwell_root/$new_name"' + echo "list matrix files in $SoloDirectory" + ls "$SoloDirectory"/*.mtx + mv $SoloDirectory/matrix.mtx matrix_sn_rna.mtx mv "Solo.out/GeneFull_Ex50pAS/raw/barcodes.tsv" barcodes.tsv mv "Solo.out/GeneFull_Ex50pAS/raw/features.tsv" features.tsv mv 
"Solo.out/GeneFull_Ex50pAS/CellReads.stats" CellReads.stats From b2e42b3af97321def046c23b6aa105498eafbf67 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 2 Apr 2024 14:26:25 -0400 Subject: [PATCH 036/186] merge conflicts --- pipelines/skylab/multiome/Multiome.changelog.md | 2 +- pipelines/skylab/multiome/Multiome.wdl | 2 +- pipelines/skylab/optimus/Optimus.changelog.md | 2 +- pipelines/skylab/optimus/Optimus.wdl | 2 +- pipelines/skylab/paired_tag/PairedTag.wdl | 2 +- pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index 3ee0584b4a..e2e5fcec2e 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,4 +1,4 @@ -# 3.4.1 +# 3.4.2 2024-04-01 (Date of Last Commit) * Updated the Optimus.wdl to run on Azure. This change does not affect the Multiome pipeline. diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index e07ff239e8..9fd708cc66 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "3.4.1" + String pipeline_version = "3.4.2" input { String cloud_provider diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index bce0f726f7..76dbb6637c 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,4 +1,4 @@ -# 6.6.1 +# 6.6.2 2024-04-01 (Date of Last Commit) * Updated the Optimus.wdl to run on Azure. 
diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 64997c513d..0f4b96a2b3 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -68,7 +68,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = "6.6.1" + String pipeline_version = "6.6.2" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index 242a942e76..8c7450abc2 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -5,7 +5,7 @@ import "../../../pipelines/skylab/optimus/Optimus.wdl" as optimus import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "../../../tasks/skylab/PairedTagUtils.wdl" as Demultiplexing workflow PairedTag { - String pipeline_version = "0.4.1" + String pipeline_version = "0.4.2" input { String input_id diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index a9fd0e25ec..04aed4f979 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.1.4" + String pipeline_version = "3.1.5" input { Array[File] r1_fastq From ab644328d9b7baf62b9eaf4e1af315ded570bd0d Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 2 Apr 2024 14:38:16 -0400 Subject: [PATCH 037/186] merge conflicts --- pipelines/skylab/slideseq/SlideSeq.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 04aed4f979..8005922895 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -50,6 +50,7 @@ workflow SlideSeq { String picard_cloud_docker = "picard-cloud:2.26.10" String warp_tools_docker_2_0_1 = 
"warp-tools:2.0.1" String warp_tools_docker_2_0_2 = "warp-tools:2.0.2-1709308985" + String star_merge_docker = "star-merge-npz:1.1" String ubuntu_docker = "ubuntu_16_0_4:latest" String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/" @@ -148,7 +149,7 @@ workflow SlideSeq { features = STARsoloFastqSlideSeq.features, matrix = STARsoloFastqSlideSeq.matrix, input_id = input_id, - warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_2 + star_merge_docker_path = docker_prefix + star_merge_docker } if ( !count_exons ) { call H5adUtils.OptimusH5adGeneration as SlideseqH5adGeneration{ @@ -173,7 +174,7 @@ workflow SlideSeq { features = STARsoloFastqSlideSeq.features_sn_rna, matrix = STARsoloFastqSlideSeq.matrix_sn_rna, input_id = input_id, - warp_tools_docker_path = docker_prefix + warp_tools_docker_2_0_2 + star_merge_docker_path = docker_prefix + star_merge_docker } call H5adUtils.SingleNucleusOptimusH5adOutput as OptimusH5adGenerationWithExons{ input: From 537f3301c56c78f0b978ae3821e432c4e630877b Mon Sep 17 00:00:00 2001 From: npetrill Date: Wed, 3 Apr 2024 09:04:35 -0400 Subject: [PATCH 038/186] fix StarAlign.wdl --- tasks/skylab/StarAlign.wdl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index b4e75e5565..9002223f7e 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -530,9 +530,12 @@ task MergeStarOutput { mkdir matrix #Using cp because mv isn't moving pwd - cp /cromwell_root/~{input_id}.uniform.mtx ./matrix/matrix.mtx + ls -lR + cp ~{input_id}.uniform.mtx ./matrix/matrix.mtx cp ~{barcodes_single} ./matrix/barcodes.tsv cp ~{features_single} ./matrix/features.tsv + echo "doing another ls" + ls -lR tar -zcvf ~{input_id}.mtx_files.tar ./matrix/* From 1a0a3406aa0e3345c958600cdcc469eeda6b2a0a Mon Sep 17 00:00:00 2001 From: npetrill Date: Wed, 3 Apr 2024 14:13:42 -0400 Subject: [PATCH 039/186] put whitelists in quotes --- tasks/skylab/CheckInputs.wdl | 8 ++++---- 
1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tasks/skylab/CheckInputs.wdl b/tasks/skylab/CheckInputs.wdl index 89b99c7798..57fbcaad1a 100644 --- a/tasks/skylab/CheckInputs.wdl +++ b/tasks/skylab/CheckInputs.wdl @@ -119,10 +119,10 @@ task checkOptimusInput { then if [[ "~{cloud_provider}" == "gcp" ]] then - WHITELIST=~{gcp_whitelist_v2} + WHITELIST="~{gcp_whitelist_v2}" elif [[ "~{cloud_provider}" == "azure" ]] then - WHITELIST=~{azure_whitelist_v2} + WHITELIST="~{azure_whitelist_v2}" else pass="false" echo "ERROR: Cloud provider must be either gcp or azure" @@ -134,10 +134,10 @@ task checkOptimusInput { then if [[ "~{cloud_provider}" == "gcp" ]] then - WHITELIST=~{gcp_whitelist_v3} + WHITELIST="~{gcp_whitelist_v3}" elif [[ "~{cloud_provider}" == "azure" ]] then - WHITELIST=~{azure_whitelist_v3} + WHITELIST="~{azure_whitelist_v3}" else pass="false" echo "ERROR: Cloud provider must be either gcp or azure" From bfcd70084ad63d912e43275d47a70ee0172a5548 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian <45041478+sahakiann@users.noreply.github.com> Date: Mon, 8 Apr 2024 11:52:41 -0400 Subject: [PATCH 040/186] PD-2516: Update Paired-Tag to run in Azure and GCP (#1212) * PD-2516: Update PairedTag to run in Azure and GCP * json formatting * update file location for GCP vs. 
Azure and documentation accordingly * merge conflicts after rebase, update pipeline cahngelog and readme version * more fixes after rebase * more fixes after rebase * more fixes after rebase * fix readme * adding sas tokens * fixing womtools error * update pipeline change logs and versions --------- Co-authored-by: npetrill Co-authored-by: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> --- .../skylab/multiome/Multiome.changelog.md | 5 ++ pipelines/skylab/multiome/Multiome.wdl | 2 +- pipelines/skylab/multiome/atac.changelog.md | 5 ++ pipelines/skylab/multiome/atac.wdl | 3 +- .../skylab/paired_tag/PairedTag.changelog.md | 5 ++ pipelines/skylab/paired_tag/PairedTag.wdl | 49 +++++++++++++++---- .../Plumbing/10k_pbmc_downsampled.json | 11 +++-- .../test_inputs/Scientific/10k_pbmc.json | 11 +++-- ...iSampleSmartSeq2SingleNucleus.changelog.md | 6 +++ .../MultiSampleSmartSeq2SingleNucleus.wdl | 3 +- tasks/skylab/PairedTagUtils.wdl | 9 ++-- .../Pipelines/PairedTag_Pipeline/README.md | 4 +- 12 files changed, 84 insertions(+), 29 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index 8e2cc66d4c..7704a65ae5 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,3 +1,8 @@ +# 3.4.3 +2024-04-08 (Date of Last Commit) + +* Updated the PairedTag.wdl to run on Azure. This change does not affect the Multiome pipeline. 
+ # 3.4.2 2024-04-01 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 9fd708cc66..8bfd9c7222 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "3.4.2" + String pipeline_version = "3.4.3" input { String cloud_provider diff --git a/pipelines/skylab/multiome/atac.changelog.md b/pipelines/skylab/multiome/atac.changelog.md index ef74303072..67db095802 100644 --- a/pipelines/skylab/multiome/atac.changelog.md +++ b/pipelines/skylab/multiome/atac.changelog.md @@ -1,3 +1,8 @@ +# 1.2.3 +2024-04-08 (Date of Last Commit) + +* Updated the PairedTag.wdl to run on Azure. This change does not affect the ATAC pipeline. + # 1.2.2 2024-04-02 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index a7846f0e4f..b286144756 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -43,7 +43,8 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "1.2.2" + String pipeline_version = "1.2.3" + # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index 09950e498c..b763a378a7 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,3 +1,8 @@ +# 0.5.2 +2024-04-08 (Date of Last Commit) + +* Updated the PairedTag.wdl to run in Azure + # 0.5.1 2024-04-04 (Date of Last Commit) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index a4de0f85d7..83b9f351be 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ 
b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -2,10 +2,12 @@ version 1.0 import "../../../pipelines/skylab/multiome/atac.wdl" as atac import "../../../pipelines/skylab/optimus/Optimus.wdl" as optimus -import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils import "../../../tasks/skylab/PairedTagUtils.wdl" as Demultiplexing +import "../../../tasks/broad/Utilities.wdl" as utils + workflow PairedTag { - String pipeline_version = "0.5.1" + + String pipeline_version = "0.5.2" input { String input_id @@ -24,7 +26,7 @@ workflow PairedTag { Boolean ignore_r1_read_length = false String star_strand_mode = "Forward" Boolean count_exons = false - File gex_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" + File gex_whitelist = if cloud_provider == "gcp" then "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt" else "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_gex.txt?sv=2020-04-08&si=prod&sr=c&sig=DQxmjB4D1lAfOW9AxIWbXwZx6ksbwjlNkixw597JnvQ%3D" # ATAC inputs # Array of input fastq files @@ -38,11 +40,34 @@ workflow PairedTag { String adapter_seq_read1 = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG" String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" # Whitelist - File atac_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt" + File atac_whitelist = if cloud_provider == "gcp" then "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt" else "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_atac.txt?sv=2020-04-08&si=prod&sr=c&sig=DQxmjB4D1lAfOW9AxIWbXwZx6ksbwjlNkixw597JnvQ%3D" # PairedTag Boolean preindex + + # Expected to be either 'gcp' or 'azure' + String cloud_provider + } + + # All docker images that are needed for tasks in this workflow + String upstools_docker = "upstools:1.2.0-2023.03.03-1704723060" + String snapatac_docker = 
"snapatac2:1.0.4-2.3.1-1700590229" + + # Prefixes based on cloud env + String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" + String acr_docker_prefix = "dsppipelinedev.azurecr.io/" + + # choose docker prefix based on cloud_provider input + String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix + + # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'." + } } + # Call the Optimus workflow call optimus.Optimus as Optimus { input: @@ -62,10 +87,9 @@ workflow PairedTag { ignore_r1_read_length = ignore_r1_read_length, star_strand_mode = star_strand_mode, count_exons = count_exons, + cloud_provider = cloud_provider, } - # Call the ATAC workflow - # Call the ATAC workflow scatter (idx in range(length(atac_r1_fastq))) { call Demultiplexing.PairedTagDemultiplex as demultiplex { input: @@ -74,9 +98,12 @@ workflow PairedTag { barcodes_fastq = atac_r2_fastq[idx], input_id = input_id, whitelist = atac_whitelist, - preindex = preindex + preindex = preindex, + docker_path = docker_prefix + upstools_docker } - } + } + + # Call the ATAC workflow call atac.ATAC as Atac_preindex { input: read1_fastq_gzipped = demultiplex.fastq1, @@ -89,14 +116,16 @@ workflow PairedTag { whitelist = atac_whitelist, adapter_seq_read1 = adapter_seq_read1, adapter_seq_read3 = adapter_seq_read3, - preindex = preindex + preindex = preindex, + cloud_provider = cloud_provider, } if (preindex) { call Demultiplexing.ParseBarcodes as ParseBarcodes { input: atac_h5ad = Atac_preindex.snap_metrics, - atac_fragment = Atac_preindex.fragment_file + atac_fragment = Atac_preindex.fragment_file, + docker_path = docker_prefix + snapatac_docker, } } diff --git 
a/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json index e46f86c366..869012fcb6 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json @@ -16,9 +16,10 @@ "PairedTag.atac_r3_fastq":[ "gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R3_atac.fastq.gz" ], - "PairedTag.ref_genome_fasta":"gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa", - "PairedTag.tar_bwa_reference":"gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/bwa-mem2-2.2.1-Human-GENCODE-build-GRCh38.tar", - "PairedTag.tar_star_reference":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_star2.7.10a-Human-GENCODE-build-GRCh38-43.tar", - "PairedTag.chrom_sizes":"gs://broad-gotc-test-storage/Multiome/input/hg38.chrom.sizes", - "PairedTag.preindex":"false" + "PairedTag.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa", + "PairedTag.tar_bwa_reference": "gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/bwa-mem2-2.2.1-Human-GENCODE-build-GRCh38.tar", + "PairedTag.tar_star_reference": "gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_star2.7.10a-Human-GENCODE-build-GRCh38-43.tar", + "PairedTag.chrom_sizes": "gs://broad-gotc-test-storage/Multiome/input/hg38.chrom.sizes", + "PairedTag.preindex": "false", + "PairedTag.cloud_provider": "gcp" } diff --git a/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json b/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json index 888439d2a6..0cbf338449 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json +++ b/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json @@ -25,9 +25,10 @@ 
"gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_atac_S1_L001_R3_001.fastq.gz", "gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_atac_S1_L002_R3_001.fastq.gz" ], - "PairedTag.ref_genome_fasta":"gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa", - "PairedTag.tar_bwa_reference":"gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/bwa-mem2-2.2.1-Human-GENCODE-build-GRCh38.tar", - "PairedTag.tar_star_reference":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_star2.7.10a-Human-GENCODE-build-GRCh38-43.tar", - "PairedTag.chrom_sizes":"gs://broad-gotc-test-storage/Multiome/input/hg38.chrom.sizes", - "PairedTag.preindex":"false" + "PairedTag.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa", + "PairedTag.tar_bwa_reference": "gs://gcp-public-data--broad-references/hg38/v0/bwa/v2_2_1/bwa-mem2-2.2.1-Human-GENCODE-build-GRCh38.tar", + "PairedTag.tar_star_reference": "gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_star2.7.10a-Human-GENCODE-build-GRCh38-43.tar", + "PairedTag.chrom_sizes": "gs://broad-gotc-test-storage/Multiome/input/hg38.chrom.sizes", + "PairedTag.preindex": "false", + "PairedTag.cloud_provider": "gcp" } \ No newline at end of file diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index d6ae9dd60f..d3bd1ba5a1 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -1,3 +1,8 @@ +# 1.3.5 +2024-04-08 (Date of Last Commit) + +* Updated the PairedTag.wdl to run on 
Azure. This change does not affect the MultiSampleSmartSeq2SingleNucleus pipeline. + # 1.3.4 2024-04-02 (Date of Last Commit) @@ -14,6 +19,7 @@ * Added cell metrics to the library-level metrics CSV; this does not impact the Single-nucleus Multi Sample Smartseq pipeline * Updated the docker for the MergeStarOutput task to include STARsolo v2.7.11a and custom scripts to create a uniform matrix file and scripts to collect library-level metrics from STARsolo output * Modified the MergeStarOutput to call a custom script for creating a uniform matrix file (mtx) from individual shard mtx files and to create a filtered matrix from the uniform matrix with STARsolo + # 1.3.1 2024-02-28 (Date of Last Commit) diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index 77da68f74a..3ed1b6a220 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -57,7 +57,8 @@ workflow MultiSampleSmartSeq2SingleNucleus { } # Version of this pipeline - String pipeline_version = "1.3.4" + + String pipeline_version = "1.3.5" if (false) { String? 
none = "None" diff --git a/tasks/skylab/PairedTagUtils.wdl b/tasks/skylab/PairedTagUtils.wdl index 7fcd867474..d3754c55f3 100644 --- a/tasks/skylab/PairedTagUtils.wdl +++ b/tasks/skylab/PairedTagUtils.wdl @@ -7,7 +7,7 @@ task PairedTagDemultiplex { String input_id Boolean preindex File whitelist - String docker = "us.gcr.io/broad-gotc-prod/upstools:1.2.0-2023.03.03-1704723060" + String docker_path Int cpu = 1 Int disk_size = ceil(2 * (size(read1_fastq, "GiB") + size(read3_fastq, "GiB") + size(barcodes_fastq, "GiB") )) + 400 Int preemptible = 3 @@ -23,7 +23,7 @@ task PairedTagDemultiplex { preindex: "Boolean for whether data has a sample barcode that needs to be demultiplexed" whitelist: "Atac whitelist for 10x multiome data" input_id: "Input ID to demarcate sample" - docker: "(optional) the docker image containing the runtime environment for this task" + docker_path: "(optional) the docker image containing the runtime environment for this task" mem_size: "(optional) the amount of memory (MiB) to provision for this task" cpu: "(optional) the number of cpus to provision for this task" disk_size: "(optional) the amount of disk space (GiB) to provision for this task" @@ -112,7 +112,7 @@ task PairedTagDemultiplex { >>> runtime { - docker: docker + docker: docker_path cpu: cpu memory: "${mem_size} GiB" disks: "local-disk ${disk_size} HDD" @@ -185,6 +185,7 @@ task ParseBarcodes { File atac_fragment Int nthreads = 1 String cpuPlatform = "Intel Cascade Lake" + String docker_path } String atac_base_name = basename(atac_h5ad, ".h5ad") @@ -254,7 +255,7 @@ task ParseBarcodes { >>> runtime { - docker: "us.gcr.io/broad-gotc-prod/snapatac2:1.0.4-2.3.1-1700590229" + docker: docker_path disks: "local-disk ~{disk} HDD" memory: "${machine_mem_mb} MiB" cpu: nthreads diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index f7c2d16469..2d45e01ec4 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ 
b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/PairedTag_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | |:-------------------------------------------------------------------:| :---: | :----: | :--------------: | -| [PairedTag_v0.4.2](https://github.com/broadinstitute/warp/releases) | February, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | +| [PairedTag_v0.5.2](https://github.com/broadinstitute/warp/releases) | February, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | ## Introduction to the Paired-Tag workflow @@ -118,7 +118,7 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which | cell_calls_gex | `_gex.emptyDrops` | TSV file containing the EmptyDrops results when the Optimus workflow is run in sc_rna mode. | | h5ad_output_file_gex | `_gex.h5ad` | h5ad (Anndata) file containing the raw cell-by-gene count matrix, gene metrics, cell metrics, and global attributes. See the [Optimus Count Matrix Overview](../Optimus_Pipeline/Loom_schema.md) for more details. | | library_metrics | `_library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | - +| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". 
| String | ## Versioning and testing From d657038f059f88b2c84c5eaf3362777cf3855b9d Mon Sep 17 00:00:00 2001 From: Farzaneh Khajouei Date: Thu, 11 Apr 2024 14:08:07 -0500 Subject: [PATCH 041/186] Fk pd 2513 reblock gvcf (#1261) * added azure docker and updated ReblockGVCF to support gcp and azure * updated verification test wdl * updated UltimaGenomicsWholeGenomeGermline wdl and changelog * Updated Changelog on BroadInternalUltimaGenomics * updated tasks to use the new docker * updated additional changelogs * changed path for import utils.wdl * update other wdls because of qc wdl change * update other wdls because of qc wdl change * update test wdls * still need to update many input jsons * Update ExomeGermlineSingleSample.changelog.md * Update UltimaGenomicsWholeGenomeGermline.changelog.md --------- Co-authored-by: npetrill Co-authored-by: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> --- .../arrays/single_sample/Arrays.changelog.md | 5 +++++ .../broad/arrays/single_sample/Arrays.wdl | 2 +- .../reblocking/ReblockGVCF.changelog.md | 5 +++++ .../reblocking/ReblockGVCF.exome.inputs.json | 3 ++- .../reblocking/ReblockGVCF.wdl | 21 ++++++++++++++++--- .../Plumbing/G96830.NA12878.index.json | 3 ++- .../test_inputs/Plumbing/G96830.NA12878.json | 3 ++- .../test_inputs/Plumbing/NA12878.ultima.json | 3 ++- .../test_inputs/Plumbing/RP-929.NA12878.json | 3 ++- .../Scientific/C1963.CHMI_CHMI3_Nex1.json | 3 ++- .../test_inputs/Scientific/C862.NA19238.json | 3 ++- .../test_inputs/Scientific/D5327.NA12878.json | 3 ++- .../test_inputs/Scientific/D5327.NA12891.json | 3 ++- .../Scientific/G94794.CHMI_CHMI3_WGS2.json | 3 ++- .../Scientific/G94982.NA12878.json | 3 ++- .../Scientific/G94982.NA12891.json | 3 ++- .../Scientific/G94982.NA12892.json | 3 ++- .../Scientific/G96830.NA12878.json | 3 ++- .../test_inputs/Scientific/NA12878.bge.json | 3 ++- .../Scientific/NA12878.ultima.json | 3 ++- .../Scientific/RP-1535.NA17-308.json | 3 ++- 
.../Scientific/RP-518.NA12878.json | 3 ++- .../exome/Plumbing/RP-929.NA12878.json | 3 ++- .../Scientific/C1963.CHMI_CHMI3_Nex1.json | 3 ++- .../exome/Scientific/C862.NA19238.json | 3 ++- .../exome/Scientific/D5327.NA12878.json | 3 ++- .../exome/Scientific/D5327.NA12891.json | 3 ++- .../exome/Scientific/D5327.NA12892.json | 3 ++- .../exome/Scientific/RP-1535.NA17-308.json | 3 ++- .../wgs/Plumbing/G96830.NA12878.json | 3 ++- .../wgs/Plumbing/NA12878.ultima.json | 3 ++- .../Scientific/G94794.CHMI_CHMI3_WGS2.json | 3 ++- .../wgs/Scientific/G94982.NA12878.json | 3 ++- .../wgs/Scientific/G94982.NA12891.json | 3 ++- .../wgs/Scientific/G94982.NA12892.json | 3 ++- .../wgs/Scientific/G96830.NA12878.json | 3 ++- .../wgs/Scientific/NA12878.ultima.json | 3 ++- .../wgs/Scientific/RP-518.NA12878.json | 3 ++- .../ExomeGermlineSingleSample.changelog.md | 5 +++++ .../exome/ExomeGermlineSingleSample.wdl | 21 +++++++++++++++++-- .../test_inputs/Plumbing/RP-929.NA12878.json | 3 ++- .../Scientific/C1963.CHMI_CHMI3_Nex1.json | 3 ++- .../test_inputs/Scientific/C862.NA19238.json | 3 ++- .../test_inputs/Scientific/D5327.NA12878.json | 3 ++- .../test_inputs/Scientific/D5327.NA12891.json | 3 ++- .../test_inputs/Scientific/D5327.NA12892.json | 3 ++- .../Scientific/RP-1535.NA17-308.json | 3 ++- ...maGenomicsWholeGenomeGermline.changelog.md | 7 ++++++- .../UltimaGenomicsWholeGenomeGermline.wdl | 5 +++-- ...oleGenomeGermlineSingleSample.changelog.md | 5 +++++ .../wgs/WholeGenomeGermlineSingleSample.wdl | 7 +++++-- ...mple.inputs.plumbing.masked_reference.json | 1 + .../test_inputs/Plumbing/G96830.NA12878.json | 3 ++- .../Plumbing/dragen_mode_best_results.json | 3 ++- .../dragen_mode_functional_equivalence.json | 3 ++- .../Scientific/G94794.CHMI_CHMI3_WGS2.json | 3 ++- ...4982.NA12878.dragen_mode_best_results.json | 3 ++- ...78.dragen_mode_functional_equivalence.json | 3 ++- .../Scientific/G94982.NA12878.json | 3 ++- .../Scientific/G94982.NA12891.json | 3 ++- .../Scientific/G94982.NA12892.json | 3 
++- .../Scientific/G96830.NA12878.json | 3 ++- .../Scientific/RP-518.NA12878.json | 3 ++- .../VariantCalling.changelog.md | 5 +++++ .../variant_calling/VariantCalling.wdl | 21 ++++++++++++++++--- .../test_inputs/Plumbing/G96830.NA12878.json | 3 ++- .../test_inputs/Plumbing/RP-929.NA12878.json | 3 ++- .../exome/Plumbing/RP-929.NA12878.json | 3 ++- .../wgs/Plumbing/G96830.NA12878.json | 3 ++- ...maGenomicsWholeGenomeCramOnly.changelog.md | 5 +++++ .../UltimaGenomicsWholeGenomeCramOnly.wdl | 2 +- .../IlluminaGenotypingArray.changelog.md | 5 +++++ .../illumina/IlluminaGenotypingArray.wdl | 2 +- .../BroadInternalArrays.changelog.md | 5 +++++ .../single_sample/BroadInternalArrays.wdl | 2 +- .../BroadInternalUltimaGenomics.changelog.md | 5 +++++ .../BroadInternalUltimaGenomics.wdl | 2 +- .../BroadInternalRNAWithUMIs.changelog.md | 5 +++++ .../rna_seq/BroadInternalRNAWithUMIs.wdl | 2 +- .../broad/qc/CheckFingerprint.changelog.md | 5 +++++ pipelines/broad/qc/CheckFingerprint.wdl | 2 +- .../exome/ExomeReprocessing.changelog.md | 5 +++++ .../reprocessing/exome/ExomeReprocessing.wdl | 5 ++++- .../ExternalExomeReprocessing.changelog.md | 5 +++++ .../exome/ExternalExomeReprocessing.wdl | 7 +++++-- ...ternalWholeGenomeReprocessing.changelog.md | 5 +++++ .../wgs/ExternalWholeGenomeReprocessing.wdl | 7 +++++-- .../wgs/WholeGenomeReprocessing.changelog.md | 5 +++++ .../wgs/WholeGenomeReprocessing.wdl | 7 +++++-- tasks/broad/GermlineVariantDiscovery.wdl | 4 ++-- tasks/broad/Qc.wdl | 4 ++-- .../TestExomeGermlineSingleSample.wdl | 4 +++- verification/test-wdls/TestReblockGVCF.wdl | 4 +++- verification/test-wdls/TestVariantCalling.wdl | 4 +++- .../TestWholeGenomeGermlineSingleSample.wdl | 4 +++- 95 files changed, 299 insertions(+), 91 deletions(-) diff --git a/pipelines/broad/arrays/single_sample/Arrays.changelog.md b/pipelines/broad/arrays/single_sample/Arrays.changelog.md index aede89d29b..cf42113ba8 100644 --- a/pipelines/broad/arrays/single_sample/Arrays.changelog.md +++ 
b/pipelines/broad/arrays/single_sample/Arrays.changelog.md @@ -1,3 +1,8 @@ +# 2.6.24 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. + # 2.6.23 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/arrays/single_sample/Arrays.wdl b/pipelines/broad/arrays/single_sample/Arrays.wdl index 461e07aa8d..75e52e5c90 100644 --- a/pipelines/broad/arrays/single_sample/Arrays.wdl +++ b/pipelines/broad/arrays/single_sample/Arrays.wdl @@ -23,7 +23,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow Arrays { - String pipeline_version = "2.6.23" + String pipeline_version = "2.6.24" input { String chip_well_barcode diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md index b06cdec265..7bb0abdfbb 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md @@ -1,3 +1,8 @@ +# 2.2.0 +2024-04-08 (Date of Last Commit) + +* Updated ReblockGVCF.wdl to run in Azure. 
+ # 2.1.12 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.exome.inputs.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.exome.inputs.json index 58f7ac8dcd..b4e84a89a2 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.exome.inputs.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.exome.inputs.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/ExomeGermlineSingleSample/truth/plumbing/master/RP-929.NA12878/NA12878_PLUMBING.rb.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider":"gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl index 3ef03fba17..f9a14011dc 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl @@ -2,10 +2,11 @@ version 1.0 import "../../../../../../tasks/broad/GermlineVariantDiscovery.wdl" as Calling import "../../../../../../tasks/broad/Qc.wdl" as QC +import "../../../../../../tasks/broad/Utilities.wdl" as utils workflow ReblockGVCF { - String pipeline_version = "2.1.12" + String pipeline_version = "2.2.0" input { @@ -20,9 +21,22 @@ workflow ReblockGVCF { String? annotations_to_remove_command Boolean? 
move_filters_to_genotypes String gvcf_file_extension = ".g.vcf.gz" + String cloud_provider } String gvcf_basename = basename(gvcf, gvcf_file_extension) + # docker images + String gatk_docker_gcp = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" + String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure + + # make sure either gcp or azr is supplied as cloud_provider input + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'." + } + } call Calling.Reblock as Reblock { input: @@ -35,7 +49,8 @@ workflow ReblockGVCF { annotations_to_keep_command = annotations_to_keep_command, annotations_to_remove_command = annotations_to_remove_command, move_filters_to_genotypes = move_filters_to_genotypes, - output_vcf_filename = gvcf_basename + ".rb.g.vcf.gz" + output_vcf_filename = gvcf_basename + ".rb.g.vcf.gz", + docker_path = gatk_docker } # Validate the (g)VCF output of HaplotypeCaller @@ -51,7 +66,7 @@ workflow ReblockGVCF { calling_intervals_defined = defined(calling_interval_list), is_gvcf = true, extra_args = "--no-overlaps", - gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + docker_path = gatk_docker } output { diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/G96830.NA12878.index.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/G96830.NA12878.index.json index aa862a064f..2ea7652b7e 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/G96830.NA12878.index.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/G96830.NA12878.index.json @@ -3,6 +3,7 @@ "ReblockGVCF.gvcf_index": 
"gs://broad-gotc-test-storage/reblock_gvcf/wgs/plumbing/input/G96830.NA12878/index_in_different_location/NA12878_PLUMBING.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } \ No newline at end of file diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/G96830.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/G96830.NA12878.json index 76086ae169..81d7cc66ee 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/G96830.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/G96830.NA12878.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/NA12878.ultima.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/NA12878.ultima.json index 33b71d9875..1e903059bf 100644 --- 
a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/NA12878.ultima.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/NA12878.ultima.json @@ -6,5 +6,6 @@ "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", "ReblockGVCF.tree_score_cutoff": 0.2, - "ReblockGVCF.annotations_to_keep_command": "--annotations-to-keep TREE_SCORE --annotations-to-keep ASSEMBLED_HAPS --annotations-to-keep FILTERED_HAPS" + "ReblockGVCF.annotations_to_keep_command": "--annotations-to-keep TREE_SCORE --annotations-to-keep ASSEMBLED_HAPS --annotations-to-keep FILTERED_HAPS", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/RP-929.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/RP-929.NA12878.json index 5bd0ce00af..b1717905be 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/RP-929.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Plumbing/RP-929.NA12878.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/plumbing/input/RP-929.NA12878/NA12878_PLUMBING.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git 
a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/C1963.CHMI_CHMI3_Nex1.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/C1963.CHMI_CHMI3_Nex1.json index b7dea8da45..757f468933 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/C1963.CHMI_CHMI3_Nex1.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/C1963.CHMI_CHMI3_Nex1.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/C1963.CHMI_CHMI3_Nex1/CHMI_CHMI3_Nex1.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/C862.NA19238.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/C862.NA19238.json index c2a496da55..3198fdf70d 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/C862.NA19238.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/C862.NA19238.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/C862.NA19238/NA19238.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": 
"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/D5327.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/D5327.NA12878.json index e5791f69bb..626f8fb268 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/D5327.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/D5327.NA12878.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/D5327.NA12878/NA12878.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/D5327.NA12891.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/D5327.NA12891.json index 28fe2ca47f..35b71a1271 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/D5327.NA12891.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/D5327.NA12891.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/D5327.NA12891/NA12891.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": 
"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94794.CHMI_CHMI3_WGS2.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94794.CHMI_CHMI3_WGS2.json index 33eabdc0c5..53554e2d84 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94794.CHMI_CHMI3_WGS2.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94794.CHMI_CHMI3_WGS2.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12878.json index 5518401aee..8e1d594362 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12878.json +++ 
b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12878.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12891.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12891.json index 67cd0891c3..561e7dfea4 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12891.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12891.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12892.json 
b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12892.json index 84acd3b6eb..c8ae0e0e8f 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12892.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G94982.NA12892.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G96830.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G96830.NA12878.json index 2ff9d8a64a..881ce23794 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G96830.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/G96830.NA12878.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": 
"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/NA12878.bge.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/NA12878.bge.json index 5e99cbce58..459a89bc2f 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/NA12878.bge.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/NA12878.bge.json @@ -7,5 +7,6 @@ "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", "ReblockGVCF.annotations_to_remove_command": "--format-annotations-to-remove PRI", "ReblockGVCF.move_filters_to_genotypes": true, - "ReblockGVCF.gvcf_file_extension": ".gvcf.gz" + "ReblockGVCF.gvcf_file_extension": ".gvcf.gz", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/NA12878.ultima.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/NA12878.ultima.json index 4dd0f918da..ac12ce5429 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/NA12878.ultima.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/NA12878.ultima.json @@ -6,5 +6,6 @@ "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", "ReblockGVCF.tree_score_cutoff": 0.2, - "ReblockGVCF.annotations_to_keep_command": "--annotations-to-keep TREE_SCORE --annotations-to-keep ASSEMBLED_HAPS --annotations-to-keep FILTERED_HAPS" + "ReblockGVCF.annotations_to_keep_command": "--annotations-to-keep TREE_SCORE --annotations-to-keep ASSEMBLED_HAPS 
--annotations-to-keep FILTERED_HAPS", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/RP-1535.NA17-308.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/RP-1535.NA17-308.json index 9a2ad60cf6..5bed19c39a 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/RP-1535.NA17-308.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/RP-1535.NA17-308.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/RP-1535.NA17-308/NA17-308.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/RP-518.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/RP-518.NA12878.json index b3fbe04a0d..8136913847 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/RP-518.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/Scientific/RP-518.NA12878.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": 
"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Plumbing/RP-929.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Plumbing/RP-929.NA12878.json index 5bd0ce00af..b1717905be 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Plumbing/RP-929.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Plumbing/RP-929.NA12878.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/plumbing/input/RP-929.NA12878/NA12878_PLUMBING.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/C1963.CHMI_CHMI3_Nex1.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/C1963.CHMI_CHMI3_Nex1.json index b7dea8da45..757f468933 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/C1963.CHMI_CHMI3_Nex1.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/C1963.CHMI_CHMI3_Nex1.json @@ -3,5 +3,6 @@ 
"ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/C1963.CHMI_CHMI3_Nex1/CHMI_CHMI3_Nex1.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/C862.NA19238.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/C862.NA19238.json index c2a496da55..3198fdf70d 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/C862.NA19238.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/C862.NA19238.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/C862.NA19238/NA19238.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12878.json index e5791f69bb..626f8fb268 100644 --- 
a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12878.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/D5327.NA12878/NA12878.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12891.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12891.json index 28fe2ca47f..35b71a1271 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12891.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12891.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/D5327.NA12891/NA12891.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git 
a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12892.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12892.json index 9235c26a47..f5e1898ba6 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12892.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/D5327.NA12892.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/D5327.NA12892/NA12892.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/RP-1535.NA17-308.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/RP-1535.NA17-308.json index 9a2ad60cf6..5bed19c39a 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/RP-1535.NA17-308.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/exome/Scientific/RP-1535.NA17-308.json @@ -3,5 +3,6 @@ "ReblockGVCF.gvcf_index": "gs://broad-gotc-test-storage/reblock_gvcf/exome/scientific/input/RP-1535.NA17-308/NA17-308.g.vcf.gz.tbi", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - 
"ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Plumbing/G96830.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Plumbing/G96830.NA12878.json index 76086ae169..81d7cc66ee 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Plumbing/G96830.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Plumbing/G96830.NA12878.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Plumbing/NA12878.ultima.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Plumbing/NA12878.ultima.json index 33b71d9875..1e903059bf 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Plumbing/NA12878.ultima.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Plumbing/NA12878.ultima.json @@ -6,5 +6,6 @@ "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", "ReblockGVCF.ref_dict": 
"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", "ReblockGVCF.tree_score_cutoff": 0.2, - "ReblockGVCF.annotations_to_keep_command": "--annotations-to-keep TREE_SCORE --annotations-to-keep ASSEMBLED_HAPS --annotations-to-keep FILTERED_HAPS" + "ReblockGVCF.annotations_to_keep_command": "--annotations-to-keep TREE_SCORE --annotations-to-keep ASSEMBLED_HAPS --annotations-to-keep FILTERED_HAPS", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94794.CHMI_CHMI3_WGS2.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94794.CHMI_CHMI3_WGS2.json index 33eabdc0c5..53554e2d84 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94794.CHMI_CHMI3_WGS2.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94794.CHMI_CHMI3_WGS2.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12878.json index 5518401aee..8e1d594362 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12878.json +++ 
b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12878.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12891.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12891.json index 67cd0891c3..561e7dfea4 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12891.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12891.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12892.json 
b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12892.json index 84acd3b6eb..c8ae0e0e8f 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12892.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G94982.NA12892.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G96830.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G96830.NA12878.json index 2ff9d8a64a..881ce23794 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G96830.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/G96830.NA12878.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": 
"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/NA12878.ultima.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/NA12878.ultima.json index 4dd0f918da..ac12ce5429 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/NA12878.ultima.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/NA12878.ultima.json @@ -6,5 +6,6 @@ "ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", "ReblockGVCF.tree_score_cutoff": 0.2, - "ReblockGVCF.annotations_to_keep_command": "--annotations-to-keep TREE_SCORE --annotations-to-keep ASSEMBLED_HAPS --annotations-to-keep FILTERED_HAPS" + "ReblockGVCF.annotations_to_keep_command": "--annotations-to-keep TREE_SCORE --annotations-to-keep ASSEMBLED_HAPS --annotations-to-keep FILTERED_HAPS", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/RP-518.NA12878.json b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/RP-518.NA12878.json index b3fbe04a0d..8136913847 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/RP-518.NA12878.json +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/test_inputs/wgs/Scientific/RP-518.NA12878.json @@ -4,5 +4,6 @@ "ReblockGVCF.calling_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", "ReblockGVCF.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", 
"ReblockGVCF.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict" + "ReblockGVCF.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "ReblockGVCF.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md index e0c9f8af81..acfffcef76 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md @@ -1,3 +1,8 @@ +# 3.1.20 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline + # 3.1.19 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl index 7bbc434227..f5efc80b60 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl @@ -40,11 +40,12 @@ import "../../../../../../tasks/broad/BamProcessing.wdl" as Processing import "../../../../../../tasks/broad/BamToCram.wdl" as ToCram import "../../../../../../pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl" as ToGvcf import "../../../../../../structs/dna_seq/DNASeqStructs.wdl" +import "../../../../../../tasks/broad/Utilities.wdl" as utils # WORKFLOW DEFINITION workflow ExomeGermlineSingleSample { - String pipeline_version = "3.1.19" + String pipeline_version = "3.1.20" input { @@ -62,6 +63,21 @@ workflow ExomeGermlineSingleSample { 
Boolean skip_reblocking = false Boolean provide_bam_output = false + + String cloud_provider + } + + # docker images + String gatk_docker_gcp = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" + String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure + + # make sure either gcp or azure is supplied as cloud_provider input + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'." + } } # Not overridable: @@ -141,7 +157,8 @@ workflow ExomeGermlineSingleSample { base_file_name = sample_and_unmapped_bams.base_file_name, final_vcf_base_name = final_gvcf_base_name, agg_preemptible_tries = papi_settings.agg_preemptible_tries, - skip_reblocking = skip_reblocking + skip_reblocking = skip_reblocking, + cloud_provider = cloud_provider } call QC.CollectHsMetrics as CollectHsMetrics { diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Plumbing/RP-929.NA12878.json b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Plumbing/RP-929.NA12878.json index a2f7bbfb29..17c06f79b6 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Plumbing/RP-929.NA12878.json +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Plumbing/RP-929.NA12878.json @@ -57,5 +57,6 @@ "ExomeGermlineSingleSample.AggregatedBamQC.CollectReadgroupBamQualityMetrics.collect_gc_bias_metrics": false, "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false, - "ExomeGermlineSingleSample.UnmappedBamToAlignedBam.CheckContamination.disable_sanity_check": true + "ExomeGermlineSingleSample.UnmappedBamToAlignedBam.CheckContamination.disable_sanity_check": true, + "ExomeGermlineSingleSample.cloud_provider": "gcp" } diff --git
a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/C1963.CHMI_CHMI3_Nex1.json b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/C1963.CHMI_CHMI3_Nex1.json index 1c4ba00d72..163e2f8265 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/C1963.CHMI_CHMI3_Nex1.json +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/C1963.CHMI_CHMI3_Nex1.json @@ -63,5 +63,6 @@ }, "ExomeGermlineSingleSample.AggregatedBamQC.CollectReadgroupBamQualityMetrics.collect_gc_bias_metrics": false, - "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false + "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false, + "ExomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/C862.NA19238.json b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/C862.NA19238.json index f884c22730..c90ddcf59f 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/C862.NA19238.json +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/C862.NA19238.json @@ -71,5 +71,6 @@ }, "ExomeGermlineSingleSample.AggregatedBamQC.CollectReadgroupBamQualityMetrics.collect_gc_bias_metrics": false, - "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false + "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false, + "ExomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12878.json b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12878.json index 79b98889b0..a302f38a4f 100644 --- 
a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12878.json +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12878.json @@ -56,5 +56,6 @@ }, "ExomeGermlineSingleSample.AggregatedBamQC.CollectReadgroupBamQualityMetrics.collect_gc_bias_metrics": false, - "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false + "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false, + "ExomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12891.json b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12891.json index 72722de383..945d7bb79c 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12891.json +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12891.json @@ -56,5 +56,6 @@ }, "ExomeGermlineSingleSample.AggregatedBamQC.CollectReadgroupBamQualityMetrics.collect_gc_bias_metrics": false, - "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false + "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false, + "ExomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12892.json b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12892.json index 028be345a3..67ee0a8bd0 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12892.json +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/D5327.NA12892.json @@ -57,5 +57,6 @@ }, "ExomeGermlineSingleSample.AggregatedBamQC.CollectReadgroupBamQualityMetrics.collect_gc_bias_metrics": false, 
- "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false + "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false, + "ExomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/RP-1535.NA17-308.json b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/RP-1535.NA17-308.json index ab6c472216..1d8834a98e 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/RP-1535.NA17-308.json +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/test_inputs/Scientific/RP-1535.NA17-308.json @@ -73,5 +73,6 @@ }, "ExomeGermlineSingleSample.AggregatedBamQC.CollectReadgroupBamQualityMetrics.collect_gc_bias_metrics": false, - "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false + "ExomeGermlineSingleSample.AggregatedBamQC.CollectAggregationMetrics.collect_gc_bias_metrics": false, + "ExomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md index 9370fb1fa6..388d75b7fb 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md @@ -1,3 +1,8 @@ +# 1.0.17 +2024-04-08 (Date of Last Commit) + +* Changed ReblockGVCFs.wdl to be multicloud + # 1.0.16 2024-03-26 (Date of Last Commit) @@ -91,4 +96,4 @@ 2022-05-05 (Date of Last Commit) * Initial Release of UltimaGenomicsWholeGenomeGermline pipeline. 
-* The UltimaGenomicsWholeGenomeGermline pipeline is an open-source, cloud-optimized workflow created for processing Ultima Genomics Whole Genome Sequenced Germline samples. Overall, the workflow aligns reads to the genome, marks duplicates, calls variants, and calculates quality metrics to produce a CRAM, CRAI, GVCF, filtered VCF, and quality metrics. \ No newline at end of file +* The UltimaGenomicsWholeGenomeGermline pipeline is an open-source, cloud-optimized workflow created for processing Ultima Genomics Whole Genome Sequenced Germline samples. Overall, the workflow aligns reads to the genome, marks duplicates, calls variants, and calculates quality metrics to produce a CRAM, CRAI, GVCF, filtered VCF, and quality metrics. diff --git a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl index 997686a999..a404f5d561 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl @@ -50,7 +50,7 @@ workflow UltimaGenomicsWholeGenomeGermline { filtering_model_no_gt_name: "String describing the optional filtering model; default set to rf_model_ignore_gt_incl_hpol_runs" } - String pipeline_version = "1.0.16" + String pipeline_version = "1.0.17" References references = alignment_references.references @@ -196,7 +196,8 @@ workflow UltimaGenomicsWholeGenomeGermline { ref_fasta = alignment_references.references.ref_fasta, ref_fasta_index = alignment_references.references.ref_fasta_index, tree_score_cutoff = vcf_post_processing.remove_low_tree_score_sites_cutoff, - annotations_to_keep_command = vcf_post_processing.annotations_to_keep_command_for_reblocking + annotations_to_keep_command = vcf_post_processing.annotations_to_keep_command_for_reblocking, + cloud_provider = "gcp" } # Outputs that will be retained 
when execution is complete diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md index f1f4d4b0e7..747a7030a1 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md @@ -1,3 +1,8 @@ +# 3.1.21 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. + # 3.1.20 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl index 2883780473..48af86c619 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl @@ -40,7 +40,7 @@ import "../../../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeGermlineSingleSample { - String pipeline_version = "3.1.20" + String pipeline_version = "3.1.21" input { @@ -68,6 +68,8 @@ workflow WholeGenomeGermlineSingleSample { Boolean use_bwa_mem = true Boolean allow_empty_ref_alt = false Boolean use_dragen_hard_filtering = false + + String cloud_provider } if (dragen_functional_equivalence_mode && dragen_maximum_quality_mode) { @@ -192,7 +194,8 @@ workflow WholeGenomeGermlineSingleSample { final_vcf_base_name = final_gvcf_base_name, agg_preemptible_tries = papi_settings.agg_preemptible_tries, use_gatk3_haplotype_caller = use_gatk3_haplotype_caller_, - use_dragen_hard_filtering = use_dragen_hard_filtering_ + use_dragen_hard_filtering = use_dragen_hard_filtering_, + cloud_provider = cloud_provider } if (provide_bam_output) { diff --git 
a/pipelines/broad/dna_seq/germline/single_sample/wgs/input_files/WholeGenomeGermlineSingleSample.inputs.plumbing.masked_reference.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/input_files/WholeGenomeGermlineSingleSample.inputs.plumbing.masked_reference.json index a2f8532cf7..309e93f9bd 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/input_files/WholeGenomeGermlineSingleSample.inputs.plumbing.masked_reference.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/input_files/WholeGenomeGermlineSingleSample.inputs.plumbing.masked_reference.json @@ -50,6 +50,7 @@ "WholeGenomeGermlineSingleSample.fingerprint_genotypes_file": "gs://broad-gotc-test-storage/single_sample/plumbing/bams/G96830.NA12878/G96830.NA12878.hg38.reference.fingerprint.vcf.gz", "WholeGenomeGermlineSingleSample.fingerprint_genotypes_index": "gs://broad-gotc-test-storage/single_sample/plumbing/bams/G96830.NA12878/G96830.NA12878.hg38.reference.fingerprint.vcf.gz.tbi", "WholeGenomeGermlineSingleSample.wgs_coverage_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list", + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp", "WholeGenomeGermlineSingleSample.papi_settings": { "preemptible_tries": 3, diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/G96830.NA12878.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/G96830.NA12878.json index 772ee521b8..321ecbcc02 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/G96830.NA12878.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/G96830.NA12878.json @@ -58,5 +58,6 @@ "WholeGenomeGermlineSingleSample.UnmappedBamToAlignedBam.CheckContamination.disable_sanity_check": true, "WholeGenomeGermlineSingleSample.CollectWgsMetrics.read_length": 250, - "WholeGenomeGermlineSingleSample.CollectRawWgsMetrics.read_length": 250 + 
"WholeGenomeGermlineSingleSample.CollectRawWgsMetrics.read_length": 250, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/dragen_mode_best_results.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/dragen_mode_best_results.json index 96f903e80d..a06a620b6c 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/dragen_mode_best_results.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/dragen_mode_best_results.json @@ -62,5 +62,6 @@ }, "WholeGenomeGermlineSingleSample.dragen_maximum_quality_mode": true, - "WholeGenomeGermlineSingleSample.UnmappedBamToAlignedBam.CheckContamination.disable_sanity_check": true + "WholeGenomeGermlineSingleSample.UnmappedBamToAlignedBam.CheckContamination.disable_sanity_check": true, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } \ No newline at end of file diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/dragen_mode_functional_equivalence.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/dragen_mode_functional_equivalence.json index 50b81f310b..928deacdb5 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/dragen_mode_functional_equivalence.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Plumbing/dragen_mode_functional_equivalence.json @@ -63,5 +63,6 @@ }, "WholeGenomeGermlineSingleSample.dragen_functional_equivalence_mode": true, - "WholeGenomeGermlineSingleSample.UnmappedBamToAlignedBam.CheckContamination.disable_sanity_check": true + "WholeGenomeGermlineSingleSample.UnmappedBamToAlignedBam.CheckContamination.disable_sanity_check": true, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } \ No newline at end of file diff --git 
a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94794.CHMI_CHMI3_WGS2.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94794.CHMI_CHMI3_WGS2.json index 8371849045..33374a597f 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94794.CHMI_CHMI3_WGS2.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94794.CHMI_CHMI3_WGS2.json @@ -73,5 +73,6 @@ "WholeGenomeGermlineSingleSample.papi_settings": { "preemptible_tries": 3, "agg_preemptible_tries": 3 - } + }, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_best_results.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_best_results.json index 94f90073c8..c625c8b4c3 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_best_results.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_best_results.json @@ -83,5 +83,6 @@ }, "WholeGenomeGermlineSingleSample.dragen_maximum_quality_mode": true, - "WholeGenomeGermlineSingleSample.BamToGvcf.HaplotypeCallerGATK4.memory_multiplier":2 + "WholeGenomeGermlineSingleSample.BamToGvcf.HaplotypeCallerGATK4.memory_multiplier":2, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_functional_equivalence.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_functional_equivalence.json index c4b9608f29..271675b702 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_functional_equivalence.json +++ 
b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.dragen_mode_functional_equivalence.json @@ -82,5 +82,6 @@ }, "WholeGenomeGermlineSingleSample.dragen_functional_equivalence_mode": true, - "WholeGenomeGermlineSingleSample.BamToGvcf.HaplotypeCallerGATK4.memory_multiplier":2 + "WholeGenomeGermlineSingleSample.BamToGvcf.HaplotypeCallerGATK4.memory_multiplier":2, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.json index 344e66dd9a..96cac538de 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12878.json @@ -73,5 +73,6 @@ "WholeGenomeGermlineSingleSample.papi_settings": { "preemptible_tries": 3, "agg_preemptible_tries": 3 - } + }, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12891.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12891.json index 650c41990f..eeccd9275b 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12891.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12891.json @@ -76,5 +76,6 @@ "WholeGenomeGermlineSingleSample.papi_settings": { "preemptible_tries": 3, "agg_preemptible_tries": 3 - } + }, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12892.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12892.json index 9372e66905..5558036b60 100644 --- 
a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12892.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G94982.NA12892.json @@ -74,5 +74,6 @@ "WholeGenomeGermlineSingleSample.papi_settings": { "preemptible_tries": 3, "agg_preemptible_tries": 3 - } + }, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G96830.NA12878.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G96830.NA12878.json index 7f5e219d59..b4e3b1574a 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G96830.NA12878.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/G96830.NA12878.json @@ -73,5 +73,6 @@ "WholeGenomeGermlineSingleSample.papi_settings": { "preemptible_tries": 3, "agg_preemptible_tries": 3 - } + }, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/RP-518.NA12878.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/RP-518.NA12878.json index 2032139bad..035b62a322 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/RP-518.NA12878.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/test_inputs/Scientific/RP-518.NA12878.json @@ -50,5 +50,6 @@ "WholeGenomeGermlineSingleSample.papi_settings": { "preemptible_tries": 3, "agg_preemptible_tries": 3 - } + }, + "WholeGenomeGermlineSingleSample.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md index ee3a4be465..e0752ba664 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md +++ 
b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md @@ -1,3 +1,8 @@ +# 2.1.19 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. + # 2.1.18 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 1de2cb2361..e703fd99d6 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -9,7 +9,7 @@ import "../../../../../tasks/broad/DragenTasks.wdl" as DragenTasks workflow VariantCalling { - String pipeline_version = "2.1.18" + String pipeline_version = "2.1.19" input { @@ -36,6 +36,20 @@ workflow VariantCalling { Boolean use_gatk3_haplotype_caller = false Boolean skip_reblocking = false Boolean use_dragen_hard_filtering = false + String cloud_provider + } + + # docker images + String gatk_docker_gcp = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" + String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure + + # make sure either gcp or azure is supplied as cloud_provider input + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call Utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'."
+ } } parameter_meta { @@ -158,7 +172,8 @@ workflow VariantCalling { ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, ref_dict = ref_dict, - output_vcf_filename = basename(MergeVCFs.output_vcf, ".g.vcf.gz") + ".rb.g.vcf.gz" + output_vcf_filename = basename(MergeVCFs.output_vcf, ".g.vcf.gz") + ".rb.g.vcf.gz", + docker_path = gatk_docker } } @@ -183,7 +198,7 @@ workflow VariantCalling { calling_interval_list = calling_interval_list, is_gvcf = make_gvcf, extra_args = if (skip_reblocking == false) then "--no-overlaps" else "", - gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0", + docker_path = gatk_docker, preemptible_tries = agg_preemptible_tries } diff --git a/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/Plumbing/G96830.NA12878.json b/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/Plumbing/G96830.NA12878.json index 4e4be85272..c13ceb45f8 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/Plumbing/G96830.NA12878.json +++ b/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/Plumbing/G96830.NA12878.json @@ -17,5 +17,6 @@ "VariantCalling.haplotype_scatter_count": 10, "VariantCalling.break_bands_at_multiples_of": 100000, "VariantCalling.agg_preemptible_tries": 3, - "VariantCalling.use_gatk3_haplotype_caller": true + "VariantCalling.use_gatk3_haplotype_caller": true, + "VariantCalling.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/Plumbing/RP-929.NA12878.json b/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/Plumbing/RP-929.NA12878.json index 1e89ca58f5..78f6c994e7 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/Plumbing/RP-929.NA12878.json +++ b/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/Plumbing/RP-929.NA12878.json @@ -17,5 +17,6 @@ "VariantCalling.haplotype_scatter_count": 10, "VariantCalling.break_bands_at_multiples_of": 0, "VariantCalling.agg_preemptible_tries": 3, - 
"VariantCalling.use_gatk3_haplotype_caller": false + "VariantCalling.use_gatk3_haplotype_caller": false, + "VariantCalling.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/exome/Plumbing/RP-929.NA12878.json b/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/exome/Plumbing/RP-929.NA12878.json index 1e89ca58f5..78f6c994e7 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/exome/Plumbing/RP-929.NA12878.json +++ b/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/exome/Plumbing/RP-929.NA12878.json @@ -17,5 +17,6 @@ "VariantCalling.haplotype_scatter_count": 10, "VariantCalling.break_bands_at_multiples_of": 0, "VariantCalling.agg_preemptible_tries": 3, - "VariantCalling.use_gatk3_haplotype_caller": false + "VariantCalling.use_gatk3_haplotype_caller": false, + "VariantCalling.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/wgs/Plumbing/G96830.NA12878.json b/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/wgs/Plumbing/G96830.NA12878.json index 4e4be85272..c13ceb45f8 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/wgs/Plumbing/G96830.NA12878.json +++ b/pipelines/broad/dna_seq/germline/variant_calling/test_inputs/wgs/Plumbing/G96830.NA12878.json @@ -17,5 +17,6 @@ "VariantCalling.haplotype_scatter_count": 10, "VariantCalling.break_bands_at_multiples_of": 100000, "VariantCalling.agg_preemptible_tries": 3, - "VariantCalling.use_gatk3_haplotype_caller": true + "VariantCalling.use_gatk3_haplotype_caller": true, + "VariantCalling.cloud_provider": "gcp" } diff --git a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md index 53cdb52510..aabda5be46 100644 --- 
a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md +++ b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md @@ -1,3 +1,8 @@ +# 1.0.17 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. + # 1.0.16 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl index 9139aef12c..17d4fecfb8 100644 --- a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl +++ b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl @@ -43,7 +43,7 @@ workflow UltimaGenomicsWholeGenomeCramOnly { save_bam_file: "If true, then save intermeidate ouputs used by germline pipeline (such as the output BAM) otherwise they won't be kept as outputs." } - String pipeline_version = "1.0.16" + String pipeline_version = "1.0.17" References references = alignment_references.references diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md index e404639206..a698100417 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md @@ -1,3 +1,8 @@ +# 1.12.18 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. 
+ # 1.12.17 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl index 75a8dc1d7b..2443bc8bcb 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl @@ -21,7 +21,7 @@ import "../../../../tasks/broad/Qc.wdl" as Qc workflow IlluminaGenotypingArray { - String pipeline_version = "1.12.17" + String pipeline_version = "1.12.18" input { String sample_alias diff --git a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md index ffe7eece0c..e31bff0008 100644 --- a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md +++ b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md @@ -1,3 +1,8 @@ +# 1.1.8 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. + # 1.1.7 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl index 3dd62b09ae..b7bf1c183e 100644 --- a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl +++ b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl @@ -9,7 +9,7 @@ workflow BroadInternalArrays { description: "Push outputs of Arrays.wdl to TDR dataset table ArraysOutputsTable." 
} - String pipeline_version = "1.1.7" + String pipeline_version = "1.1.8" input { # inputs to wrapper task diff --git a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md index 645e25f8fa..ce366234a3 100644 --- a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md +++ b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md @@ -1,3 +1,8 @@ +# 1.0.18 +2024-04-08 (Date of Last Commit) + +* Updated ReblockGVCF.wdl to run in Azure. + # 1.0.17 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl index fbd0ef4b53..946c9196dd 100644 --- a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl +++ b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl @@ -6,7 +6,7 @@ import "../../../../../../../pipelines/broad/qc/CheckFingerprint.wdl" as FP workflow BroadInternalUltimaGenomics { - String pipeline_version = "1.0.17" + String pipeline_version = "1.0.18" input { diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md index b455d24e9f..cfb9d14ae1 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md @@ -1,3 +1,8 @@ +# 1.0.30 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. 
+ # 1.0.29 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl index 95edae4bb7..d4f5316e89 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl @@ -7,7 +7,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow BroadInternalRNAWithUMIs { - String pipeline_version = "1.0.29" + String pipeline_version = "1.0.30" input { # input needs to be either "hg19" or "hg38" diff --git a/pipelines/broad/qc/CheckFingerprint.changelog.md b/pipelines/broad/qc/CheckFingerprint.changelog.md index fd7517251c..a8ed8c3e4d 100644 --- a/pipelines/broad/qc/CheckFingerprint.changelog.md +++ b/pipelines/broad/qc/CheckFingerprint.changelog.md @@ -1,3 +1,8 @@ +# 1.0.17 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. + # 1.0.16 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/qc/CheckFingerprint.wdl b/pipelines/broad/qc/CheckFingerprint.wdl index 2dbe67b878..0338466c3b 100644 --- a/pipelines/broad/qc/CheckFingerprint.wdl +++ b/pipelines/broad/qc/CheckFingerprint.wdl @@ -24,7 +24,7 @@ import "../../../tasks/broad/Qc.wdl" as Qc workflow CheckFingerprint { - String pipeline_version = "1.0.16" + String pipeline_version = "1.0.17" input { File? input_vcf diff --git a/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md b/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md index a5c4f30605..0cee3decbe 100644 --- a/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md @@ -1,3 +1,8 @@ +# 3.1.20 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. 
+ # 3.1.19 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl index 11cc7ef033..0f4fadb666 100644 --- a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl @@ -7,7 +7,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow ExomeReprocessing { - String pipeline_version = "3.1.19" + String pipeline_version = "3.1.20" input { File? input_cram @@ -32,6 +32,8 @@ workflow ExomeReprocessing { File target_interval_list File bait_interval_list String bait_set_name + + String cloud_provider } call ToUbams.CramToUnmappedBams { @@ -64,6 +66,7 @@ workflow ExomeReprocessing { target_interval_list = target_interval_list, bait_interval_list = bait_interval_list, bait_set_name = bait_set_name, + cloud_provider = cloud_provider } output { diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md index 0312d1bea5..d7bbf05bdc 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md @@ -1,3 +1,8 @@ +# 3.1.22 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. 
+ # 3.1.21 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl index 7fc309a1e0..3ff6daaa8b 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl @@ -5,7 +5,7 @@ import "../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalExomeReprocessing { - String pipeline_version = "3.1.21" + String pipeline_version = "3.1.22" input { @@ -34,6 +34,8 @@ workflow ExternalExomeReprocessing { String destination_cloud_path String vault_token_path String google_account_vault_path + + String cloud_provider } call ExomeReprocessing.ExomeReprocessing { @@ -53,7 +55,8 @@ workflow ExternalExomeReprocessing { fingerprint_genotypes_index = fingerprint_genotypes_index, cram_ref_fasta = cram_ref_fasta, cram_ref_fasta_index = cram_ref_fasta_index, - papi_settings = papi_settings + papi_settings = papi_settings, + cloud_provider = cloud_provider } call Copy.CopyFilesFromCloudToCloud { diff --git a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md index 71b139eb3e..57fce7e75e 100644 --- a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md @@ -1,3 +1,8 @@ +# 2.1.22 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. 
+ # 2.1.21 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl index 609c70bc09..9776ce06d5 100644 --- a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalWholeGenomeReprocessing { - String pipeline_version = "2.1.21" + String pipeline_version = "2.1.22" input { File? input_cram @@ -33,6 +33,8 @@ workflow ExternalWholeGenomeReprocessing { String destination_cloud_path String vault_token_path String google_account_vault_path + + String cloud_provider } call WholeGenomeReprocessing.WholeGenomeReprocessing { @@ -51,7 +53,8 @@ workflow ExternalWholeGenomeReprocessing { fingerprint_genotypes_index = fingerprint_genotypes_index, papi_settings = papi_settings, wgs_coverage_interval_list = wgs_coverage_interval_list, - scatter_settings = scatter_settings + scatter_settings = scatter_settings, + cloud_provider = cloud_provider } call Copy.CopyFilesFromCloudToCloud { diff --git a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md index fa7dd2579d..f32bf69607 100644 --- a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md @@ -1,3 +1,8 @@ +# 3.1.21 +2024-04-08 (Date of Last Commit) + +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. 
+ # 3.1.20 2024-03-26 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl index ac48aab3ed..cd4afd70b5 100644 --- a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeReprocessing { - String pipeline_version = "3.1.20" + String pipeline_version = "3.1.21" input { File? input_cram @@ -29,6 +29,8 @@ workflow WholeGenomeReprocessing { File? fingerprint_genotypes_index File wgs_coverage_interval_list + + String cloud_provider } call ToUbams.CramToUnmappedBams { @@ -57,7 +59,8 @@ workflow WholeGenomeReprocessing { fingerprint_genotypes_file = fingerprint_genotypes_file, fingerprint_genotypes_index = fingerprint_genotypes_index, papi_settings = papi_settings, - wgs_coverage_interval_list = wgs_coverage_interval_list + wgs_coverage_interval_list = wgs_coverage_interval_list, + cloud_provider = cloud_provider } output { diff --git a/tasks/broad/GermlineVariantDiscovery.wdl b/tasks/broad/GermlineVariantDiscovery.wdl index 0e3c8f2e6e..7294f2d0b5 100644 --- a/tasks/broad/GermlineVariantDiscovery.wdl +++ b/tasks/broad/GermlineVariantDiscovery.wdl @@ -203,7 +203,7 @@ task Reblock { File ref_fasta File ref_fasta_index String output_vcf_filename - String docker_image = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + String docker_path Int additional_disk = 20 String? annotations_to_keep_command String? 
annotations_to_remove_command @@ -240,7 +240,7 @@ task Reblock { disks: "local-disk " + disk_size + " HDD" bootDiskSizeGb: 15 preemptible: 3 - docker: docker_image + docker: docker_path } output { diff --git a/tasks/broad/Qc.wdl b/tasks/broad/Qc.wdl index dfc6581f43..58c94f46e9 100644 --- a/tasks/broad/Qc.wdl +++ b/tasks/broad/Qc.wdl @@ -622,7 +622,7 @@ task ValidateVCF { Int preemptible_tries = 3 Boolean is_gvcf = true String? extra_args - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + String docker_path Int machine_mem_mb = 7000 } @@ -657,7 +657,7 @@ task ValidateVCF { ~{extra_args} } runtime { - docker: gatk_docker + docker: docker_path preemptible: preemptible_tries memory: machine_mem_mb + " MiB" bootDiskSizeGb: 15 diff --git a/verification/test-wdls/TestExomeGermlineSingleSample.wdl b/verification/test-wdls/TestExomeGermlineSingleSample.wdl index e6324a420c..59110d09be 100644 --- a/verification/test-wdls/TestExomeGermlineSingleSample.wdl +++ b/verification/test-wdls/TestExomeGermlineSingleSample.wdl @@ -28,6 +28,7 @@ workflow TestExomeGermlineSingleSample { Boolean update_truth String vault_token_path String google_account_vault_path + String cloud_provider } meta { @@ -46,7 +47,8 @@ workflow TestExomeGermlineSingleSample { target_interval_list = target_interval_list, bait_interval_list = bait_interval_list, bait_set_name = bait_set_name, - provide_bam_output = provide_bam_output + provide_bam_output = provide_bam_output, + cloud_provider = cloud_provider } # Collect all of the pipeline outputs into a single Array[String]] diff --git a/verification/test-wdls/TestReblockGVCF.wdl b/verification/test-wdls/TestReblockGVCF.wdl index f34e22f1b7..01607636c7 100644 --- a/verification/test-wdls/TestReblockGVCF.wdl +++ b/verification/test-wdls/TestReblockGVCF.wdl @@ -27,6 +27,7 @@ workflow TestReblockGVCF { Boolean update_truth String vault_token_path String google_account_vault_path + String cloud_provider } meta { @@ -45,7 +46,8 @@ workflow 
TestReblockGVCF { annotations_to_keep_command = annotations_to_keep_command, annotations_to_remove_command = annotations_to_remove_command, move_filters_to_genotypes = move_filters_to_genotypes, - gvcf_file_extension = gvcf_file_extension + gvcf_file_extension = gvcf_file_extension, + cloud_provider = cloud_provider } diff --git a/verification/test-wdls/TestVariantCalling.wdl b/verification/test-wdls/TestVariantCalling.wdl index b2c3b29273..3054e0a1b9 100644 --- a/verification/test-wdls/TestVariantCalling.wdl +++ b/verification/test-wdls/TestVariantCalling.wdl @@ -39,6 +39,7 @@ workflow TestVariantCalling { Boolean update_truth String vault_token_path String google_account_vault_path + String cloud_provider } meta { @@ -69,7 +70,8 @@ workflow TestVariantCalling { make_bamout = make_bamout, use_gatk3_haplotype_caller = use_gatk3_haplotype_caller, skip_reblocking = skip_reblocking, - use_dragen_hard_filtering = use_dragen_hard_filtering + use_dragen_hard_filtering = use_dragen_hard_filtering, + cloud_provider = cloud_provider } diff --git a/verification/test-wdls/TestWholeGenomeGermlineSingleSample.wdl b/verification/test-wdls/TestWholeGenomeGermlineSingleSample.wdl index d3f775dcc7..16b54c3876 100644 --- a/verification/test-wdls/TestWholeGenomeGermlineSingleSample.wdl +++ b/verification/test-wdls/TestWholeGenomeGermlineSingleSample.wdl @@ -32,6 +32,7 @@ workflow TestWholeGenomeGermlineSingleSample { Boolean use_bwa_mem = true Boolean allow_empty_ref_alt = false Boolean use_dragen_hard_filtering = false + String cloud_provider # These values will be determined and injected into the inputs by the scala test framework String truth_path @@ -66,7 +67,8 @@ workflow TestWholeGenomeGermlineSingleSample { perform_bqsr = perform_bqsr, use_bwa_mem = use_bwa_mem, allow_empty_ref_alt = allow_empty_ref_alt, - use_dragen_hard_filtering = use_dragen_hard_filtering + use_dragen_hard_filtering = use_dragen_hard_filtering, + cloud_provider = cloud_provider } # Collect all of the 
pipeline outputs into a single Array[String] From 0e38aa89df4a757da9f50457db67515a4610af5f Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 18 Apr 2024 13:19:27 -0400 Subject: [PATCH 042/186] edited wdl and changelog for ToA support --- pipelines/skylab/snm3C/snm3C.changelog.md | 4 ++++ pipelines/skylab/snm3C/snm3C.wdl | 21 ++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.changelog.md b/pipelines/skylab/snm3C/snm3C.changelog.md index afd28595c3..4ee5f7f128 100644 --- a/pipelines/skylab/snm3C/snm3C.changelog.md +++ b/pipelines/skylab/snm3C/snm3C.changelog.md @@ -1,3 +1,7 @@ +# 4.0.1 +2024-04-18 (Date of Last Commit) +* Updated the snM3C wdl to run on Azure. This change does not affect the snM3C pipeline. + # 4.0.0 2024-03-15 (Date of Last Commit) * Reconstructed code and merged tasks to optimize pipeline and reduce cost diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 0aa726d1b4..3b14bb8fd2 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -11,6 +11,7 @@ workflow snm3C { File tarred_index_files File genome_fa File chromosome_sizes + String cloud_provider String r1_adapter = "AGATCGGAAGAGCACACGTCTGAAC" String r2_adapter = "AGATCGGAAGAGCGTCGTGTAGGGA" @@ -23,11 +24,17 @@ workflow snm3C { Int num_downstr_bases = 2 Int compress_level = 5 Int batch_number - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:2.4" } + # Determine docker prefix based on cloud provider + String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" + String acr_docker_prefix = "dsppipelinedev.azurecr.io/" + String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix + + String snm3C_docker_image = "m3c-yap-hisat:2.4" + # version of the pipeline - String pipeline_version = "4.0.0" + String pipeline_version = "4.0.1" call Demultiplexing { input: @@ -35,7 +42,7 @@ workflow snm3C { fastq_input_read2 = 
fastq_input_read2, random_primer_indexes = random_primer_indexes, plate_id = plate_id, - docker = docker, + docker = docker_prefix + snm3C_docker_image, batch_number = batch_number } @@ -54,7 +61,7 @@ workflow snm3C { r2_left_cut = r2_left_cut, r2_right_cut = r2_right_cut, plate_id = plate_id, - docker = docker + docker = docker_prefix + snm3C_docker_image } call Hisat_single_end as Hisat_single_end { @@ -63,7 +70,7 @@ workflow snm3C { tarred_index_files = tarred_index_files, genome_fa = genome_fa, plate_id = plate_id, - docker = docker + docker = docker_prefix + snm3C_docker_image } call Merge_sort_analyze as Merge_sort_analyze { @@ -76,7 +83,7 @@ workflow snm3C { compress_level = compress_level, chromosome_sizes = chromosome_sizes, plate_id = plate_id, - docker = docker + docker = docker_prefix + snm3C_docker_image } } @@ -91,7 +98,7 @@ workflow snm3C { allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, plate_id = plate_id, - docker = docker + docker = docker_prefix + snm3C_docker_image } meta { From 2d7e5fc26f8b11a8700b8afce0f41ec45852151f Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 18 Apr 2024 13:20:35 -0400 Subject: [PATCH 043/186] added cloud provider input to test inputs json --- pipelines/skylab/snm3C/test_inputs/Plumbing/miseq_M16_G13.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/test_inputs/Plumbing/miseq_M16_G13.json b/pipelines/skylab/snm3C/test_inputs/Plumbing/miseq_M16_G13.json index af100dea41..fcacdc5069 100644 --- a/pipelines/skylab/snm3C/test_inputs/Plumbing/miseq_M16_G13.json +++ b/pipelines/skylab/snm3C/test_inputs/Plumbing/miseq_M16_G13.json @@ -19,5 +19,6 @@ "snm3C.batch_number": 2, "snm3C.Hisat_paired_end.cpu_platform" : "Intel Cascade Lake", "snm3C.Hisat_single_end.cpu_platform" : "Intel Cascade Lake", - "snm3C.Merge_sort_analyze.cpu_platform" : "Intel Cascade Lake" + 
"snm3C.Merge_sort_analyze.cpu_platform" : "Intel Cascade Lake", + "snm3C.cloud_provider" : "gcp" } From ed008c6c6273ca04b354abcf1f895021c9af8945 Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 18 Apr 2024 13:24:46 -0400 Subject: [PATCH 044/186] added util class to log error if unsupported cloud provider used for cloud_provider input --- pipelines/skylab/snm3C/snm3C.wdl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 3b14bb8fd2..e1c69183d2 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -1,4 +1,6 @@ version 1.0 +import "../../../tasks/broad/Utilities.wdl" as utils + workflow snm3C { @@ -31,7 +33,13 @@ workflow snm3C { String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix String snm3C_docker_image = "m3c-yap-hisat:2.4" - + # make sure either gcp or azr is supplied as cloud_provider input + if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { + call utils.ErrorWithMessage as ErrorMessageIncorrectInput { + input: + message = "cloud_provider must be supplied with either 'gcp' or 'azure'." 
+ } + } # version of the pipeline String pipeline_version = "4.0.1" From 1f6559b8ff7fc73ff8b3b6157b34958c19b2b283 Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 18 Apr 2024 14:31:51 -0400 Subject: [PATCH 045/186] added cloud provider parameter to test json --- .../skylab/snm3C/test_inputs/Scientific/novaseq_M16_G13.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/test_inputs/Scientific/novaseq_M16_G13.json b/pipelines/skylab/snm3C/test_inputs/Scientific/novaseq_M16_G13.json index 0709e99fb9..e53437328d 100644 --- a/pipelines/skylab/snm3C/test_inputs/Scientific/novaseq_M16_G13.json +++ b/pipelines/skylab/snm3C/test_inputs/Scientific/novaseq_M16_G13.json @@ -19,5 +19,6 @@ "snm3C.batch_number": 2, "snm3C.Hisat_paired_end.cpu_platform" : "Intel Cascade Lake", "snm3C.Hisat_single_end.cpu_platform" : "Intel Cascade Lake", - "snm3C.Merge_sort_analyze.cpu_platform" : "Intel Cascade Lake" + "snm3C.Merge_sort_analyze.cpu_platform" : "Intel Cascade Lake", + "snm3C.cloud_provider" : "gcp" } From 45e918089a55fff507811658aba418852379b76b Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 18 Apr 2024 14:45:44 -0400 Subject: [PATCH 046/186] refactored task inputs --- pipelines/skylab/snm3C/snm3C.wdl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index e1c69183d2..eb321f1d8c 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -9,11 +9,11 @@ workflow snm3C { Array[File] fastq_input_read2 File random_primer_indexes String plate_id + String cloud_provider # mapping inputs File tarred_index_files File genome_fa File chromosome_sizes - String cloud_provider String r1_adapter = "AGATCGGAAGAGCACACGTCTGAAC" String r2_adapter = "AGATCGGAAGAGCGTCGTGTAGGGA" @@ -50,7 +50,6 @@ workflow snm3C { fastq_input_read2 = fastq_input_read2, random_primer_indexes = random_primer_indexes, plate_id = plate_id, - docker = 
docker_prefix + snm3C_docker_image, batch_number = batch_number } @@ -135,7 +134,7 @@ task Demultiplexing { File random_primer_indexes String plate_id Int batch_number - String docker + String docker = docker_prefix + snm3C_docker_image Int disk_size = 1000 Int mem_size = 10 @@ -245,7 +244,7 @@ task Hisat_paired_end { File genome_fa File chromosome_sizes String plate_id - String docker + String docker = docker_prefix + snm3C_docker_image String r1_adapter String r2_adapter From 1b376dfee475b017fc5706f0c8434b587915b020 Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 18 Apr 2024 15:20:47 -0400 Subject: [PATCH 047/186] change to inputs --- pipelines/skylab/snm3C/snm3C.wdl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index eb321f1d8c..22dcead23f 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -50,7 +50,9 @@ workflow snm3C { fastq_input_read2 = fastq_input_read2, random_primer_indexes = random_primer_indexes, plate_id = plate_id, - batch_number = batch_number + batch_number = batch_number, + docker = docker_prefix + snm3C_docker_image + } scatter(tar in Demultiplexing.tarred_demultiplexed_fastqs) { @@ -134,7 +136,7 @@ task Demultiplexing { File random_primer_indexes String plate_id Int batch_number - String docker = docker_prefix + snm3C_docker_image + String docker Int disk_size = 1000 Int mem_size = 10 @@ -244,7 +246,7 @@ task Hisat_paired_end { File genome_fa File chromosome_sizes String plate_id - String docker = docker_prefix + snm3C_docker_image + String docker String r1_adapter String r2_adapter From 3c588e3084745090db25534eb8456d49002c635c Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 18 Apr 2024 15:29:54 -0400 Subject: [PATCH 048/186] change to docker path generation --- pipelines/skylab/snm3C/snm3C.wdl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git 
a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 22dcead23f..1e08eb723c 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -28,11 +28,10 @@ workflow snm3C { Int batch_number } # Determine docker prefix based on cloud provider - String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" - String acr_docker_prefix = "dsppipelinedev.azurecr.io/" - String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix + String gcr_docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:2.4" + String acr_docker = "dsppipelinedev.azurecr.io/m3c-yap-hisat:2.4" + String snm3c_docker = if cloud_provider == "gcp" then gcr_docker else acr_docker - String snm3C_docker_image = "m3c-yap-hisat:2.4" # make sure either gcp or azr is supplied as cloud_provider input if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { call utils.ErrorWithMessage as ErrorMessageIncorrectInput { @@ -51,7 +50,7 @@ workflow snm3C { random_primer_indexes = random_primer_indexes, plate_id = plate_id, batch_number = batch_number, - docker = docker_prefix + snm3C_docker_image + docker = snm3c_docker } @@ -70,7 +69,7 @@ workflow snm3C { r2_left_cut = r2_left_cut, r2_right_cut = r2_right_cut, plate_id = plate_id, - docker = docker_prefix + snm3C_docker_image + docker = snm3c_docker } call Hisat_single_end as Hisat_single_end { @@ -79,7 +78,7 @@ workflow snm3C { tarred_index_files = tarred_index_files, genome_fa = genome_fa, plate_id = plate_id, - docker = docker_prefix + snm3C_docker_image + docker = snm3c_docker } call Merge_sort_analyze as Merge_sort_analyze { @@ -92,7 +91,7 @@ workflow snm3C { compress_level = compress_level, chromosome_sizes = chromosome_sizes, plate_id = plate_id, - docker = docker_prefix + snm3C_docker_image + docker = snm3c_docker } } @@ -107,7 +106,7 @@ workflow snm3C { allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, unique_reads_cgn_extraction_tbi = 
Merge_sort_analyze.extract_allc_output_tbi_tar, plate_id = plate_id, - docker = docker_prefix + snm3C_docker_image + docker = snm3c_docker } meta { From dd7c16759b1c4c4aefb37bbad80400e548ffb80b Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 18 Apr 2024 15:47:36 -0400 Subject: [PATCH 049/186] changes made to test wdl --- verification/test-wdls/Testsnm3C.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/verification/test-wdls/Testsnm3C.wdl b/verification/test-wdls/Testsnm3C.wdl index 7409e08311..ec54ae128e 100644 --- a/verification/test-wdls/Testsnm3C.wdl +++ b/verification/test-wdls/Testsnm3C.wdl @@ -36,7 +36,7 @@ workflow Testsnm3C { String vault_token_path String google_account_vault_path - String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:2.4" + String cloud_provider } meta { @@ -63,7 +63,7 @@ workflow Testsnm3C { num_downstr_bases = num_downstr_bases, compress_level = compress_level, batch_number = batch_number, - docker = docker + cloud_provider = cloud_provider } From 0f6ce91520446eaa6daa6d2d9e462488e95d6cb0 Mon Sep 17 00:00:00 2001 From: John Scira Date: Fri, 19 Apr 2024 11:51:51 -0400 Subject: [PATCH 050/186] updated reference to docker images for consistency with other azurized wdls --- pipelines/skylab/snm3C/snm3C.wdl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 1e08eb723c..44a0293c33 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -27,9 +27,11 @@ workflow snm3C { Int compress_level = 5 Int batch_number } + #docker images + String m3c_yap_hisat_docker = "m3c-yap-hisat:2.4" # Determine docker prefix based on cloud provider - String gcr_docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:2.4" - String acr_docker = "dsppipelinedev.azurecr.io/m3c-yap-hisat:2.4" + String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:2.4" + String acr_docker_prefix = 
"dsppipelinedev.azurecr.io/" String snm3c_docker = if cloud_provider == "gcp" then gcr_docker else acr_docker # make sure either gcp or azr is supplied as cloud_provider input @@ -50,7 +52,7 @@ workflow snm3C { random_primer_indexes = random_primer_indexes, plate_id = plate_id, batch_number = batch_number, - docker = snm3c_docker + docker = docker_prefix + m3c_yap_hisat_docker } @@ -69,7 +71,7 @@ workflow snm3C { r2_left_cut = r2_left_cut, r2_right_cut = r2_right_cut, plate_id = plate_id, - docker = snm3c_docker + docker = docker_prefix + m3c_yap_hisat_docker } call Hisat_single_end as Hisat_single_end { @@ -78,7 +80,7 @@ workflow snm3C { tarred_index_files = tarred_index_files, genome_fa = genome_fa, plate_id = plate_id, - docker = snm3c_docker + docker = docker_prefix + m3c_yap_hisat_docker } call Merge_sort_analyze as Merge_sort_analyze { @@ -91,7 +93,7 @@ workflow snm3C { compress_level = compress_level, chromosome_sizes = chromosome_sizes, plate_id = plate_id, - docker = snm3c_docker + docker = docker_prefix + m3c_yap_hisat_docker } } @@ -106,7 +108,7 @@ workflow snm3C { allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, plate_id = plate_id, - docker = snm3c_docker + docker = docker_prefix + m3c_yap_hisat_docker } meta { From d0b8f05216ca4c8a39b21907d30e029347b3144b Mon Sep 17 00:00:00 2001 From: John Scira Date: Fri, 19 Apr 2024 11:52:31 -0400 Subject: [PATCH 051/186] fix to docker image reference --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 44a0293c33..22c15ba2c1 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -30,7 +30,7 @@ workflow snm3C { #docker images String m3c_yap_hisat_docker = "m3c-yap-hisat:2.4" # Determine docker prefix based on cloud provider - String gcr_docker_prefix = 
"us.gcr.io/broad-gotc-prod/m3c-yap-hisat:2.4" + String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" String acr_docker_prefix = "dsppipelinedev.azurecr.io/" String snm3c_docker = if cloud_provider == "gcp" then gcr_docker else acr_docker From 790f39e4466b8eaa7a43d5f37269d769f53a62a3 Mon Sep 17 00:00:00 2001 From: John Scira Date: Fri, 19 Apr 2024 16:18:04 -0400 Subject: [PATCH 052/186] fix to docker prefix call --- pipelines/skylab/snm3C/snm3C.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 22c15ba2c1..8d18cc2d35 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -32,7 +32,7 @@ workflow snm3C { # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" String acr_docker_prefix = "dsppipelinedev.azurecr.io/" - String snm3c_docker = if cloud_provider == "gcp" then gcr_docker else acr_docker + String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix # make sure either gcp or azr is supplied as cloud_provider input if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { @@ -80,7 +80,7 @@ workflow snm3C { tarred_index_files = tarred_index_files, genome_fa = genome_fa, plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker + docker = snm3c_docker + m3c_yap_hisat_docker } call Merge_sort_analyze as Merge_sort_analyze { From ea3530c16abe5c5f34c6848d3e0696395f1ad6c4 Mon Sep 17 00:00:00 2001 From: John Scira Date: Mon, 22 Apr 2024 09:23:07 -0400 Subject: [PATCH 053/186] updated var name --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 8d18cc2d35..5e209fb6a9 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -80,7 +80,7 @@ workflow snm3C { tarred_index_files = 
tarred_index_files, genome_fa = genome_fa, plate_id = plate_id, - docker = snm3c_docker + m3c_yap_hisat_docker + docker = docker_prefix + m3c_yap_hisat_docker } call Merge_sort_analyze as Merge_sort_analyze { From dc85702fa3f48c15fa3485e1b9bf1571711e8fd1 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 29 Apr 2024 11:35:20 -0400 Subject: [PATCH 054/186] replace hard-coded cromwell_root with variable based on cloud env --- pipelines/skylab/snm3C/snm3C.wdl | 92 ++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 41 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 5e209fb6a9..e628c39091 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -33,6 +33,7 @@ workflow snm3C { String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" String acr_docker_prefix = "dsppipelinedev.azurecr.io/" String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix + String cromwell_root_dir = if cloud_provider == "gcp" then "/cromwell_root" else "/cromwell-executions" # make sure either gcp or azr is supplied as cloud_provider input if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { @@ -52,8 +53,8 @@ workflow snm3C { random_primer_indexes = random_primer_indexes, plate_id = plate_id, batch_number = batch_number, - docker = docker_prefix + m3c_yap_hisat_docker - + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir } scatter(tar in Demultiplexing.tarred_demultiplexed_fastqs) { @@ -71,7 +72,8 @@ workflow snm3C { r2_left_cut = r2_left_cut, r2_right_cut = r2_right_cut, plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir } call Hisat_single_end as Hisat_single_end { @@ -80,7 +82,8 @@ workflow snm3C { tarred_index_files = tarred_index_files, genome_fa = genome_fa, plate_id = plate_id, - docker = 
docker_prefix + m3c_yap_hisat_docker + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir } call Merge_sort_analyze as Merge_sort_analyze { @@ -93,7 +96,8 @@ workflow snm3C { compress_level = compress_level, chromosome_sizes = chromosome_sizes, plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir } } @@ -108,7 +112,8 @@ workflow snm3C { allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir } meta { @@ -138,6 +143,7 @@ task Demultiplexing { String plate_id Int batch_number String docker + String cromwell_root_dir Int disk_size = 1000 Int mem_size = 10 @@ -169,7 +175,7 @@ task Demultiplexing { import os # Parsing stats.txt file - stats_file_path = '/cromwell_root/~{plate_id}.stats.txt' + stats_file_path = '~{cromwell_root_dir}/~{plate_id}.stats.txt' adapter_counts = {} with open(stats_file_path, 'r') as file: content = file.read() @@ -181,7 +187,7 @@ task Demultiplexing { adapter_counts[adapter_name] = trimmed_count # Removing fastq files with trimmed reads greater than 30 - directory_path = '/cromwell_root' + directory_path = ~{cromwell_root_dir} threshold = 10000000 for filename in os.listdir(directory_path): @@ -248,6 +254,7 @@ task Hisat_paired_end { File chromosome_sizes String plate_id String docker + String cromwell_root_dir String r1_adapter String r2_adapter @@ -315,7 +322,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r1" - zcat /cromwell_root/batch*/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" + zcat ~{cromwell_root_dir}/batch*/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > 
"${sample_id}-R1_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r1: $elapsed seconds" @@ -323,7 +330,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r2" - zcat /cromwell_root/batch*/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" + zcat ~{cromwell_root_dir}/batch*/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r2: $elapsed seconds" @@ -353,7 +360,7 @@ task Hisat_paired_end { # hisat run start=$(date +%s) echo "Run hisat" - hisat-3n /cromwell_root/$genome_fa_basename \ + hisat-3n ~{cromwell_root_dir}/$genome_fa_basename \ -q \ -1 ${sample_id}-R1_trimmed.fq.gz \ -2 ${sample_id}-R2_trimmed.fq.gz \ @@ -385,7 +392,7 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" - rm /cromwell_root/batch*/${sample_id}-R1.fq.gz /cromwell_root/batch*/${sample_id}-R2.fq.gz + rm ~{cromwell_root_dir}/batch*/${sample_id}-R1.fq.gz ~{cromwell_root_dir}/batch*/${sample_id}-R2.fq.gz rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq rm ${sample_id}-R1_trimmed.fq.gz ${sample_id}-R2_trimmed.fq.gz rm ${sample_id}.hisat3n_dna.unsort.bam ${sample_id}.hisat3n_dna.multi_aligned.bam @@ -498,6 +505,7 @@ task Hisat_single_end { File tarred_index_files String plate_id String docker + String cromwell_root_dir Int disk_size = 1000 Int mem_size = 64 @@ -541,8 +549,8 @@ task Hisat_single_end { echo "Elapsed time to untar split_fq_tar: $elapsed seconds" # make directories - mkdir -p /cromwell_root/merged_sort_bams - mkdir -p /cromwell_root/read_overlap + mkdir -p ~{cromwell_root_dir}/merged_sort_bams + mkdir -p ~{cromwell_root_dir}/read_overlap # define lists of r1 and r2 fq files R1_files=($(ls | grep "\.hisat3n_dna.split_reads.R1.fastq")) @@ -557,7 +565,7 @@ task Hisat_single_end { start=$(date +%s) # hisat on R1 
single end - hisat-3n /cromwell_root/$genome_fa_basename \ + hisat-3n ~{cromwell_root_dir}/$genome_fa_basename \ -q \ -U ${BASE}.hisat3n_dna.split_reads.R1.fastq \ -S ${BASE}.hisat3n_dna.split_reads.R1.sam --directional-mapping-reverse --base-change C,T \ @@ -579,7 +587,7 @@ task Hisat_single_end { echo "Running hisat on sample_id_R2" $BASE # hisat on R2 single end - hisat-3n /cromwell_root/$genome_fa_basename \ + hisat-3n ~{cromwell_root_dir}/$genome_fa_basename \ -q \ -U ${BASE}.hisat3n_dna.split_reads.R2.fastq \ -S ${BASE}.hisat3n_dna.split_reads.R2.sam --directional-mapping --base-change C,T \ @@ -622,7 +630,7 @@ task Hisat_single_end { # remove_overlap_read_parts echo "call remove_overlap_read_parts" start=$(date +%s) - python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path=os.path.join(os.path.sep,"cromwell_root","'"$BASE"'.name_sorted.filtered.bam"),out_bam_path=os.path.join(os.path.sep,"cromwell_root","'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam"))' + python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path=os.path.join(os.path.sep,~{cromwell_root_dir},"'"$BASE"'.name_sorted.filtered.bam"),out_bam_path=os.path.join(os.path.sep,~{cromwell_root_dir},"'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam"))' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run remove overlap $elapsed seconds" @@ -708,6 +716,7 @@ task Merge_sort_analyze { File paired_end_unique_tar File read_overlap_tar String docker + String cromwell_root_dir #input for allcools bam-to-allc File genome_fa @@ -769,9 +778,9 @@ task Merge_sort_analyze { fi # make directories - mkdir /cromwell_root/output_bams - mkdir /cromwell_root/temp - mkdir /cromwell_root/allc-${mcg_context} + mkdir ~{cromwell_root_dir}/output_bams + mkdir ~{cromwell_root_dir}temp + mkdir ~{cromwell_root_dir}allc-${mcg_context} task() { local file=$1 @@ -802,16 +811,16 @@ task Merge_sort_analyze { start=$(date +%s) echo "Call Picard 
remove duplicates" name=${sample_id}.hisat3n_dna.all_reads.deduped - picard MarkDuplicates I=${sample_id}.hisat3n_dna.all_reads.pos_sort.bam O=/cromwell_root/output_bams/${name}.bam \ - M=/cromwell_root/output_bams/${name}.matrix.txt \ - REMOVE_DUPLICATES=true TMP_DIR=/cromwell_root/temp + picard MarkDuplicates I=${sample_id}.hisat3n_dna.all_reads.pos_sort.bam O=~{cromwell_root_dir}/output_bams/${name}.bam \ + M=~{cromwell_root_dir}/output_bams/${name}.matrix.txt \ + REMOVE_DUPLICATES=true TMP_DIR=~{cromwell_root_dir}/temp end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run picard $elapsed seconds" start=$(date +%s) echo "Call samtools index" - samtools index /cromwell_root/output_bams/${name}.bam + samtools index ~{cromwell_root_dir}/output_bams/${name}.bam end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to samtools index $elapsed seconds" @@ -826,8 +835,8 @@ task Merge_sort_analyze { start=$(date +%s) echo "Call allcools bam-to-allc from deduped.bams" /opt/conda/bin/allcools bam-to-allc \ - --bam_path /cromwell_root/output_bams/${name}.bam \ - --reference_fasta /cromwell_root/reference/~{genome_base} \ + --bam_path ~{cromwell_root_dir}/output_bams/${name}.bam \ + --reference_fasta ~{cromwell_root_dir}/reference/~{genome_base} \ --output_path "${sample_id}.allc.tsv.gz" \ --num_upstr_bases ~{num_upstr_bases} \ --num_downstr_bases ~{num_downstr_bases} \ @@ -842,7 +851,7 @@ task Merge_sort_analyze { echo "Call allcools extract-all" allcools extract-allc --strandness merge \ --allc_path ${sample_id}.allc.tsv.gz \ - --output_prefix /cromwell_root/allc-${mcg_context}/${sample_id} \ + --output_prefix ~{cromwell_root_dir}/allc-${mcg_context}/${sample_id} \ --mc_contexts ${mcg_context} \ --chrom_size_path ~{chromosome_sizes} end=$(date +%s) @@ -852,8 +861,8 @@ task Merge_sort_analyze { echo "Remove some bams" rm ${sample_id}.hisat3n_dna.all_reads.bam rm ${sample_id}.hisat3n_dna.all_reads.pos_sort.bam - rm 
/cromwell_root/${sample_id}.hisat3n_dna.split_reads.read_overlap.bam - rm /cromwell_root/${sample_id}.hisat3n_dna.unique_aligned.bam + rm ~{cromwell_root_dir}/${sample_id}.hisat3n_dna.split_reads.read_overlap.bam + rm ~{cromwell_root_dir}/${sample_id}.hisat3n_dna.unique_aligned.bam } # run 4 instances of task in parallel @@ -908,8 +917,8 @@ task Merge_sort_analyze { tar -cf - *.allc.tsv.gz | pigz > ~{plate_id}.allc.tsv.tar.gz tar -cf - *.allc.tsv.gz.tbi | pigz > ~{plate_id}.allc.tbi.tar.gz tar -cf - *.allc.tsv.gz.count.csv | pigz > ~{plate_id}.allc.count.tar.gz - tar -cf - /cromwell_root/allc-${mcg_context}/*.gz | pigz > ~{plate_id}.extract-allc.tar.gz - tar -cf - /cromwell_root/allc-${mcg_context}/*.tbi | pigz > ~{plate_id}.extract-allc_tbi.tar.gz + tar -cf - ~{cromwell_root_dir}allc-${mcg_context}/*.gz | pigz > ~{plate_id}.extract-allc.tar.gz + tar -cf - ~{cromwell_root_dir}/allc-${mcg_context}/*.tbi | pigz > ~{plate_id}.extract-allc_tbi.tar.gz >>> runtime { @@ -946,6 +955,7 @@ task Summary { Array[File] allc_uniq_reads_stats Array[File] unique_reads_cgn_extraction_tbi String plate_id + String cromwell_root_dir String docker Int disk_size = 80 @@ -956,10 +966,10 @@ task Summary { command <<< set -euo pipefail - mkdir /cromwell_root/fastq - mkdir /cromwell_root/bam - mkdir /cromwell_root/allc - mkdir /cromwell_root/hic + mkdir ~{cromwell_root_dir}/fastq + mkdir ~{cromwell_root_dir}/bam + mkdir ~{cromwell_root_dir}/allc + mkdir ~{cromwell_root_dir}/hic extract_and_remove() { if [ $# -eq 0 ]; @@ -982,12 +992,12 @@ task Summary { extract_and_remove ~{sep=' ' allc_uniq_reads_stats} extract_and_remove ~{sep=' ' unique_reads_cgn_extraction_tbi} - mv *.trimmed.stats.txt /cromwell_root/fastq - mv *.hisat3n_dna_summary.txt *.hisat3n_dna_split_reads_summary.R1.txt *.hisat3n_dna_split_reads_summary.R2.txt /cromwell_root/bam - mv output_bams/*.hisat3n_dna.all_reads.deduped.matrix.txt /cromwell_root/bam - mv *.hisat3n_dna.all_reads.contact_stats.csv /cromwell_root/hic - mv 
*.allc.tsv.gz.count.csv /cromwell_root/allc - mv cromwell_root/allc-CGN/*.allc.tsv.gz.tbi /cromwell_root/allc + mv *.trimmed.stats.txt ~{cromwell_root_dir}/fastq + mv *.hisat3n_dna_summary.txt *.hisat3n_dna_split_reads_summary.R1.txt *.hisat3n_dna_split_reads_summary.R2.txt ~{cromwell_root_dir}/bam + mv output_bams/*.hisat3n_dna.all_reads.deduped.matrix.txt ~{cromwell_root_dir}/bam + mv *.hisat3n_dna.all_reads.contact_stats.csv ~{cromwell_root_dir}/hic + mv *.allc.tsv.gz.count.csv ~{cromwell_root_dir}/allc + mv ~{cromwell_root_dir}/allc-CGN/*.allc.tsv.gz.tbi ~{cromwell_root_dir}/allc python3 -c 'from cemba_data.hisat3n import *;snm3c_summary()' mv MappingSummary.csv.gz ~{plate_id}_MappingSummary.csv.gz From be700227245282ff69692faf65873c158b51ab45 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 29 Apr 2024 11:55:06 -0400 Subject: [PATCH 055/186] wrap param name in quotations --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index e628c39091..3abe45ac29 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -187,7 +187,7 @@ task Demultiplexing { adapter_counts[adapter_name] = trimmed_count # Removing fastq files with trimmed reads greater than 30 - directory_path = ~{cromwell_root_dir} + directory_path = '~{cromwell_root_dir}' threshold = 10000000 for filename in os.listdir(directory_path): From 73b2e90278beda985fa72a4efc3b0a38dfe893a0 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 29 Apr 2024 14:29:02 -0400 Subject: [PATCH 056/186] provide absolute path --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 3abe45ac29..b757719f7a 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -165,7 +165,7 @@ task Demultiplexing { -p 
~{plate_id}-{name}-R2.fq.gz \ r1.fastq.gz \ r2.fastq.gz \ - > ~{plate_id}.stats.txt + > ~{cromwell_root_dir}/~{plate_id}.stats.txt # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz rm *-unknown-R{1,2}.fq.gz From 0df475b2ab992ab2644e969cddb725f794e72145 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 29 Apr 2024 14:55:47 -0400 Subject: [PATCH 057/186] write out files using ls --- pipelines/skylab/snm3C/snm3C.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index b757719f7a..d5f7f78a6a 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -165,8 +165,8 @@ task Demultiplexing { -p ~{plate_id}-{name}-R2.fq.gz \ r1.fastq.gz \ r2.fastq.gz \ - > ~{cromwell_root_dir}/~{plate_id}.stats.txt - + > ~{plate_id}.stats.txt + ls -lh # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz rm *-unknown-R{1,2}.fq.gz From 8297ab24694235d5a0cff95c8dad757dc541f906 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Wed, 1 May 2024 12:55:57 -0400 Subject: [PATCH 058/186] add print for working dir --- pipelines/skylab/snm3C/snm3C.wdl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index d5f7f78a6a..0af7533eed 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -154,6 +154,10 @@ task Demultiplexing { command <<< set -euo pipefail + ls -lR + pwd + + # Cat files for each r1, r2 cat ~{sep=' ' fastq_input_read1} > r1.fastq.gz cat ~{sep=' ' fastq_input_read2} > r2.fastq.gz @@ -166,7 +170,7 @@ task Demultiplexing { r1.fastq.gz \ r2.fastq.gz \ > ~{plate_id}.stats.txt - ls -lh + # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz rm *-unknown-R{1,2}.fq.gz From 7a255c38aeb05f2134ac2ac9e2bad36f94e8fa1f Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Wed, 1 May 2024 
14:20:15 -0400 Subject: [PATCH 059/186] provide absolute paths for demultiplexing --- pipelines/skylab/snm3C/snm3C.wdl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 0af7533eed..a22b66a7f0 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -159,8 +159,8 @@ task Demultiplexing { # Cat files for each r1, r2 - cat ~{sep=' ' fastq_input_read1} > r1.fastq.gz - cat ~{sep=' ' fastq_input_read2} > r2.fastq.gz + cat ~{sep=' ' fastq_input_read1} > ~{cromwell_root_dir}/r1.fastq.gz + cat ~{sep=' ' fastq_input_read2} > ~{cromwell_root_dir}/r2.fastq.gz # Run cutadapt /opt/conda/bin/cutadapt -Z -e 0.01 --no-indels -j 8 \ @@ -169,10 +169,10 @@ task Demultiplexing { -p ~{plate_id}-{name}-R2.fq.gz \ r1.fastq.gz \ r2.fastq.gz \ - > ~{plate_id}.stats.txt + > ~{cromwell_root_dir}/~{plate_id}.stats.txt # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz - rm *-unknown-R{1,2}.fq.gz + rm ~{cromwell_root_dir}/*-unknown-R{1,2}.fq.gz python3 < ~{plate_id}.${i}.cutadapt_output_files.tar.gz + tar -cf - ~{cromwell_root_dir}/batch${i}/*.fq.gz | pigz > ~{cromwell_root_dir}/~{plate_id}.${i}.cutadapt_output_files.tar.gz done echo "TAR files created successfully." 
>>> From 49b84a3e20fa30707717e308b3f6320cef87005b Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 2 May 2024 13:17:21 -0400 Subject: [PATCH 060/186] testing --- pipelines/skylab/snm3C/snm3C.wdl | 156 ++++++++++++++++--------------- 1 file changed, 80 insertions(+), 76 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index a22b66a7f0..6493d31661 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -152,88 +152,92 @@ task Demultiplexing { } command <<< + echo "TEST" + + set -euo pipefail ls -lR pwd - # Cat files for each r1, r2 - cat ~{sep=' ' fastq_input_read1} > ~{cromwell_root_dir}/r1.fastq.gz - cat ~{sep=' ' fastq_input_read2} > ~{cromwell_root_dir}/r2.fastq.gz - - # Run cutadapt - /opt/conda/bin/cutadapt -Z -e 0.01 --no-indels -j 8 \ - -g file:~{random_primer_indexes} \ - -o ~{plate_id}-{name}-R1.fq.gz \ - -p ~{plate_id}-{name}-R2.fq.gz \ - r1.fastq.gz \ - r2.fastq.gz \ - > ~{cromwell_root_dir}/~{plate_id}.stats.txt - - # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz - rm ~{cromwell_root_dir}/*-unknown-R{1,2}.fq.gz - - python3 < threshold: - os.remove(file_path) - print(f'Removed file: {filename}') - CODE - - # Batch the fastq files into folders of batch_number size - batch_number=~{batch_number} - for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion - mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs - done - - # Counter for the folder index - folder_index=1 - - # Define lists of r1 and r2 fq files - R1_files=($(ls ~{cromwell_root_dir} | grep "\-R1.fq.gz")) - R2_files=($(ls ~{cromwell_root_dir} | grep "\-R2.fq.gz")) - - # Distribute the FASTQ files and create TAR files - for file in "${R1_files[@]}"; do - sample_id=$(basename "$file" "-R1.fq.gz") - r2_file="${sample_id}-R2.fq.gz" - mv ~{cromwell_root_dir}/$file batch$((folder_index))/$file - mv ~{cromwell_root_dir}/$r2_file 
batch$((folder_index))/$r2_file - # Increment the counter - folder_index=$(( (folder_index % $batch_number) + 1 )) - done - - # Tar up files per batch - echo "TAR files" - for i in $(seq 1 "${batch_number}"); do - tar -cf - ~{cromwell_root_dir}/batch${i}/*.fq.gz | pigz > ~{cromwell_root_dir}/~{plate_id}.${i}.cutadapt_output_files.tar.gz - done - echo "TAR files created successfully." +# # Cat files for each r1, r2 +# cat ~{sep=' ' fastq_input_read1} > ~{cromwell_root_dir}/r1.fastq.gz +# cat ~{sep=' ' fastq_input_read2} > ~{cromwell_root_dir}/r2.fastq.gz +# +# # Run cutadapt +# /opt/conda/bin/cutadapt -Z -e 0.01 --no-indels -j 8 \ +# -g file:~{random_primer_indexes} \ +# -o ~{plate_id}-{name}-R1.fq.gz \ +# -p ~{plate_id}-{name}-R2.fq.gz \ +# r1.fastq.gz \ +# r2.fastq.gz \ +# > ~{cromwell_root_dir}/~{plate_id}.stats.txt +# +# # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz +# rm ~{cromwell_root_dir}/*-unknown-R{1,2}.fq.gz +# +# python3 < threshold: +# os.remove(file_path) +# print(f'Removed file: {filename}') +# CODE +# +# # Batch the fastq files into folders of batch_number size +# batch_number=~{batch_number} +# for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion +# mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs +# done +# +# # Counter for the folder index +# folder_index=1 +# +# # Define lists of r1 and r2 fq files +# R1_files=($(ls ~{cromwell_root_dir} | grep "\-R1.fq.gz")) +# R2_files=($(ls ~{cromwell_root_dir} | grep "\-R2.fq.gz")) +# +# # Distribute the FASTQ files and create TAR files +# for file in "${R1_files[@]}"; do +# sample_id=$(basename "$file" "-R1.fq.gz") +# r2_file="${sample_id}-R2.fq.gz" +# mv ~{cromwell_root_dir}/$file batch$((folder_index))/$file +# mv ~{cromwell_root_dir}/$r2_file batch$((folder_index))/$r2_file +# # Increment the counter +# folder_index=$(( (folder_index % $batch_number) + 1 )) +# done +# +# # Tar up files per batch +# echo "TAR files" +# for i in 
$(seq 1 "${batch_number}"); do +# tar -cf - ~{cromwell_root_dir}/batch${i}/*.fq.gz | pigz > ~{cromwell_root_dir}/~{plate_id}.${i}.cutadapt_output_files.tar.gz +# done +# echo "TAR files created successfully." + >>> runtime { From 0f8c6e376c5ef3ef8ec239fabaf3312922050f1c Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 2 May 2024 13:33:47 -0400 Subject: [PATCH 061/186] more testing --- pipelines/skylab/snm3C/snm3C.wdl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 6493d31661..5a2b56786d 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -156,7 +156,7 @@ task Demultiplexing { set -euo pipefail - + touch test.txt ls -lR pwd @@ -249,8 +249,9 @@ task Demultiplexing { } output { - Array[File] tarred_demultiplexed_fastqs = glob("*.tar.gz") - File stats = "~{plate_id}.stats.txt" + #Array[File] tarred_demultiplexed_fastqs = glob("*.tar.gz") + #File stats = "~{plate_id}.stats.txt" + File test = "test.txt" } } From 7eca252e95c961727e2561c1abdeac559f5d8002 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 2 May 2024 13:44:29 -0400 Subject: [PATCH 062/186] more testing --- pipelines/skylab/snm3C/snm3C.wdl | 67 -------------------------------- 1 file changed, 67 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 5a2b56786d..70d75baf0a 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -57,80 +57,13 @@ workflow snm3C { cromwell_root_dir = cromwell_root_dir } - scatter(tar in Demultiplexing.tarred_demultiplexed_fastqs) { - call Hisat_paired_end as Hisat_paired_end { - input: - tarred_demultiplexed_fastqs = tar, - tarred_index_files = tarred_index_files, - genome_fa = genome_fa, - chromosome_sizes = chromosome_sizes, - min_read_length = min_read_length, - r1_adapter = r1_adapter, - r2_adapter = r2_adapter, - r1_left_cut = r1_left_cut, - 
r1_right_cut = r1_right_cut, - r2_left_cut = r2_left_cut, - r2_right_cut = r2_right_cut, - plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir - } - - call Hisat_single_end as Hisat_single_end { - input: - split_fq_tar = Hisat_paired_end.split_fq_tar, - tarred_index_files = tarred_index_files, - genome_fa = genome_fa, - plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir - } - - call Merge_sort_analyze as Merge_sort_analyze { - input: - paired_end_unique_tar = Hisat_paired_end.unique_bam_tar, - read_overlap_tar = Hisat_single_end.remove_overlaps_output_bam_tar, - genome_fa = genome_fa, - num_upstr_bases = num_upstr_bases, - num_downstr_bases = num_downstr_bases, - compress_level = compress_level, - chromosome_sizes = chromosome_sizes, - plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir - } - } - call Summary { - input: - trimmed_stats = Hisat_paired_end.trim_stats_tar, - hisat3n_stats = Hisat_paired_end.hisat3n_paired_end_stats_tar, - r1_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R1_tar, - r2_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R2_tar, - dedup_stats = Merge_sort_analyze.dedup_stats_tar, - chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats, - allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, - unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, - plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir - } meta { allowNestedInputs: true } output { - File MappingSummary = Summary.mapping_summary - Array[File] name_sorted_bams = Merge_sort_analyze.name_sorted_bam - Array[File] unique_reads_cgn_extraction_allc= Merge_sort_analyze.allc - Array[File] unique_reads_cgn_extraction_tbi = Merge_sort_analyze.tbi - Array[File] 
reference_version = Hisat_paired_end.reference_version - Array[File] all_reads_dedup_contacts = Merge_sort_analyze.all_reads_dedup_contacts - Array[File] all_reads_3C_contacts = Merge_sort_analyze.all_reads_3C_contacts - Array[File] chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats - Array[File] unique_reads_cgn_extraction_allc_extract = Merge_sort_analyze.extract_allc_output_allc_tar - Array[File] unique_reads_cgn_extraction_tbi_extract = Merge_sort_analyze.extract_allc_output_tbi_tar } } From abc62cd0c156e65b873d680c1362d1a4d84c02d9 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 2 May 2024 14:05:54 -0400 Subject: [PATCH 063/186] put things back --- pipelines/skylab/snm3C/snm3C.wdl | 228 ++++++++++++++++++++----------- 1 file changed, 146 insertions(+), 82 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 70d75baf0a..0ba3f9bc10 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -57,13 +57,80 @@ workflow snm3C { cromwell_root_dir = cromwell_root_dir } + scatter(tar in Demultiplexing.tarred_demultiplexed_fastqs) { + call Hisat_paired_end as Hisat_paired_end { + input: + tarred_demultiplexed_fastqs = tar, + tarred_index_files = tarred_index_files, + genome_fa = genome_fa, + chromosome_sizes = chromosome_sizes, + min_read_length = min_read_length, + r1_adapter = r1_adapter, + r2_adapter = r2_adapter, + r1_left_cut = r1_left_cut, + r1_right_cut = r1_right_cut, + r2_left_cut = r2_left_cut, + r2_right_cut = r2_right_cut, + plate_id = plate_id, + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir + } + + call Hisat_single_end as Hisat_single_end { + input: + split_fq_tar = Hisat_paired_end.split_fq_tar, + tarred_index_files = tarred_index_files, + genome_fa = genome_fa, + plate_id = plate_id, + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir + } + call Merge_sort_analyze as 
Merge_sort_analyze { + input: + paired_end_unique_tar = Hisat_paired_end.unique_bam_tar, + read_overlap_tar = Hisat_single_end.remove_overlaps_output_bam_tar, + genome_fa = genome_fa, + num_upstr_bases = num_upstr_bases, + num_downstr_bases = num_downstr_bases, + compress_level = compress_level, + chromosome_sizes = chromosome_sizes, + plate_id = plate_id, + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir + } + } + + call Summary { + input: + trimmed_stats = Hisat_paired_end.trim_stats_tar, + hisat3n_stats = Hisat_paired_end.hisat3n_paired_end_stats_tar, + r1_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R1_tar, + r2_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R2_tar, + dedup_stats = Merge_sort_analyze.dedup_stats_tar, + chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats, + allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, + unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, + plate_id = plate_id, + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir + } meta { allowNestedInputs: true } output { + File MappingSummary = Summary.mapping_summary + Array[File] name_sorted_bams = Merge_sort_analyze.name_sorted_bam + Array[File] unique_reads_cgn_extraction_allc= Merge_sort_analyze.allc + Array[File] unique_reads_cgn_extraction_tbi = Merge_sort_analyze.tbi + Array[File] reference_version = Hisat_paired_end.reference_version + Array[File] all_reads_dedup_contacts = Merge_sort_analyze.all_reads_dedup_contacts + Array[File] all_reads_3C_contacts = Merge_sort_analyze.all_reads_3C_contacts + Array[File] chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats + Array[File] unique_reads_cgn_extraction_allc_extract = Merge_sort_analyze.extract_allc_output_allc_tar + Array[File] unique_reads_cgn_extraction_tbi_extract = Merge_sort_analyze.extract_allc_output_tbi_tar } } @@ -86,90 +153,88 @@ 
task Demultiplexing { command <<< echo "TEST" - - set -euo pipefail - touch test.txt + ls -lR pwd -# # Cat files for each r1, r2 -# cat ~{sep=' ' fastq_input_read1} > ~{cromwell_root_dir}/r1.fastq.gz -# cat ~{sep=' ' fastq_input_read2} > ~{cromwell_root_dir}/r2.fastq.gz -# -# # Run cutadapt -# /opt/conda/bin/cutadapt -Z -e 0.01 --no-indels -j 8 \ -# -g file:~{random_primer_indexes} \ -# -o ~{plate_id}-{name}-R1.fq.gz \ -# -p ~{plate_id}-{name}-R2.fq.gz \ -# r1.fastq.gz \ -# r2.fastq.gz \ -# > ~{cromwell_root_dir}/~{plate_id}.stats.txt -# -# # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz -# rm ~{cromwell_root_dir}/*-unknown-R{1,2}.fq.gz -# -# python3 < threshold: -# os.remove(file_path) -# print(f'Removed file: {filename}') -# CODE -# -# # Batch the fastq files into folders of batch_number size -# batch_number=~{batch_number} -# for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion -# mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs -# done -# -# # Counter for the folder index -# folder_index=1 -# -# # Define lists of r1 and r2 fq files -# R1_files=($(ls ~{cromwell_root_dir} | grep "\-R1.fq.gz")) -# R2_files=($(ls ~{cromwell_root_dir} | grep "\-R2.fq.gz")) -# -# # Distribute the FASTQ files and create TAR files -# for file in "${R1_files[@]}"; do -# sample_id=$(basename "$file" "-R1.fq.gz") -# r2_file="${sample_id}-R2.fq.gz" -# mv ~{cromwell_root_dir}/$file batch$((folder_index))/$file -# mv ~{cromwell_root_dir}/$r2_file batch$((folder_index))/$r2_file -# # Increment the counter -# folder_index=$(( (folder_index % $batch_number) + 1 )) -# done -# -# # Tar up files per batch -# echo "TAR files" -# for i in $(seq 1 "${batch_number}"); do -# tar -cf - ~{cromwell_root_dir}/batch${i}/*.fq.gz | pigz > ~{cromwell_root_dir}/~{plate_id}.${i}.cutadapt_output_files.tar.gz -# done -# echo "TAR files created successfully." 
+ # Cat files for each r1, r2 + cat ~{sep=' ' fastq_input_read1} > ~{cromwell_root_dir}/r1.fastq.gz + cat ~{sep=' ' fastq_input_read2} > ~{cromwell_root_dir}/r2.fastq.gz + + # Run cutadapt + /opt/conda/bin/cutadapt -Z -e 0.01 --no-indels -j 8 \ + -g file:~{random_primer_indexes} \ + -o ~{plate_id}-{name}-R1.fq.gz \ + -p ~{plate_id}-{name}-R2.fq.gz \ + r1.fastq.gz \ + r2.fastq.gz \ + > ~{cromwell_root_dir}/~{plate_id}.stats.txt + + # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz + rm ~{cromwell_root_dir}/*-unknown-R{1,2}.fq.gz + + python3 < threshold: + os.remove(file_path) + print(f'Removed file: {filename}') + CODE + + # Batch the fastq files into folders of batch_number size + batch_number=~{batch_number} + for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion + mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs + done + + # Counter for the folder index + folder_index=1 + + # Define lists of r1 and r2 fq files + R1_files=($(ls ~{cromwell_root_dir} | grep "\-R1.fq.gz")) + R2_files=($(ls ~{cromwell_root_dir} | grep "\-R2.fq.gz")) + + # Distribute the FASTQ files and create TAR files + for file in "${R1_files[@]}"; do + sample_id=$(basename "$file" "-R1.fq.gz") + r2_file="${sample_id}-R2.fq.gz" + mv ~{cromwell_root_dir}/$file batch$((folder_index))/$file + mv ~{cromwell_root_dir}/$r2_file batch$((folder_index))/$r2_file + # Increment the counter + folder_index=$(( (folder_index % $batch_number) + 1 )) + done + + # Tar up files per batch + echo "TAR files" + for i in $(seq 1 "${batch_number}"); do + tar -cf - ~{cromwell_root_dir}/batch${i}/*.fq.gz | pigz > ~{cromwell_root_dir}/~{plate_id}.${i}.cutadapt_output_files.tar.gz + done + echo "TAR files created successfully." 
>>> @@ -182,9 +247,8 @@ task Demultiplexing { } output { - #Array[File] tarred_demultiplexed_fastqs = glob("*.tar.gz") - #File stats = "~{plate_id}.stats.txt" - File test = "test.txt" + Array[File] tarred_demultiplexed_fastqs = glob("*.tar.gz") + File stats = "~{plate_id}.stats.txt" } } From c465b4fc2eaab5fbde3aadc6f8b50fd6c6c40edb Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 3 May 2024 11:10:10 -0400 Subject: [PATCH 064/186] add lots more logging --- pipelines/skylab/snm3C/snm3C.wdl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 0ba3f9bc10..427a37deff 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -172,9 +172,13 @@ task Demultiplexing { r2.fastq.gz \ > ~{cromwell_root_dir}/~{plate_id}.stats.txt + echo "RAN CUT ADAPT" + # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz rm ~{cromwell_root_dir}/*-unknown-R{1,2}.fq.gz + echo "REMOVED FILES" + python3 < Date: Fri, 3 May 2024 13:33:30 -0400 Subject: [PATCH 065/186] only run cutadapt --- pipelines/skylab/snm3C/snm3C.wdl | 98 ++++++++++++++++---------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 427a37deff..cf210940d9 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -179,72 +179,72 @@ task Demultiplexing { echo "REMOVED FILES" - python3 < threshold: - os.remove(file_path) - print(f'Removed file: {filename}') - CODE - - echo "RAN PYTHON SNIPPET" + #directory_path = '~{cromwell_root_dir}' + #threshold = 10000000 + + #for filename in os.listdir(directory_path): + # if filename.endswith('.fq.gz'): + # file_path = os.path.join(directory_path, filename) + # adapter_name = re.search(r'A(\d+)-R', filename) + # if adapter_name: + # adapter_name = 'A' + adapter_name.group(1) + # if adapter_name in adapter_counts and 
adapter_counts[adapter_name] > threshold: + # os.remove(file_path) + # print(f'Removed file: {filename}') + #CODE + + #echo "RAN PYTHON SNIPPET" # Batch the fastq files into folders of batch_number size - batch_number=~{batch_number} - for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion - mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs - done + #batch_number=~{batch_number} + #for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion + # mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs + #done - echo "BATCHED FASTQ FILES INTO FOLDERS" + #echo "BATCHED FASTQ FILES INTO FOLDERS" # Counter for the folder index - folder_index=1 + #folder_index=1 # Define lists of r1 and r2 fq files - R1_files=($(ls ~{cromwell_root_dir} | grep "\-R1.fq.gz")) - R2_files=($(ls ~{cromwell_root_dir} | grep "\-R2.fq.gz")) + #R1_files=($(ls ~{cromwell_root_dir} | grep "\-R1.fq.gz")) + #R2_files=($(ls ~{cromwell_root_dir} | grep "\-R2.fq.gz")) - echo "STARTING TAR JOB" + #echo "STARTING TAR JOB" # Distribute the FASTQ files and create TAR files - for file in "${R1_files[@]}"; do - sample_id=$(basename "$file" "-R1.fq.gz") - r2_file="${sample_id}-R2.fq.gz" - mv ~{cromwell_root_dir}/$file batch$((folder_index))/$file - mv ~{cromwell_root_dir}/$r2_file batch$((folder_index))/$r2_file + #for file in "${R1_files[@]}"; do + # sample_id=$(basename "$file" "-R1.fq.gz") + # r2_file="${sample_id}-R2.fq.gz" + # mv ~{cromwell_root_dir}/$file batch$((folder_index))/$file + # mv ~{cromwell_root_dir}/$r2_file batch$((folder_index))/$r2_file # Increment the counter - folder_index=$(( (folder_index % $batch_number) + 1 )) - done + # folder_index=$(( (folder_index % $batch_number) + 1 )) + #done # Tar up files per batch - echo "TAR files" - for i in $(seq 1 "${batch_number}"); do - tar -cf - ~{cromwell_root_dir}/batch${i}/*.fq.gz | pigz > ~{cromwell_root_dir}/~{plate_id}.${i}.cutadapt_output_files.tar.gz - done - 
echo "TAR files created successfully." + #echo "TAR files" + #for i in $(seq 1 "${batch_number}"); do + # tar -cf - ~{cromwell_root_dir}/batch${i}/*.fq.gz | pigz > ~{cromwell_root_dir}/~{plate_id}.${i}.cutadapt_output_files.tar.gz + #done + #echo "TAR files created successfully." >>> @@ -257,7 +257,7 @@ task Demultiplexing { } output { - Array[File] tarred_demultiplexed_fastqs = glob("*.tar.gz") + #Array[File] tarred_demultiplexed_fastqs = glob("*.tar.gz") File stats = "~{plate_id}.stats.txt" } } From f545666541eb2bd08bf25b3c0169923c88a8e4be Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 3 May 2024 13:37:09 -0400 Subject: [PATCH 066/186] only run cutadapt --- pipelines/skylab/snm3C/snm3C.wdl | 152 +++++++++++++++---------------- 1 file changed, 76 insertions(+), 76 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index cf210940d9..708acb018f 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -57,82 +57,82 @@ workflow snm3C { cromwell_root_dir = cromwell_root_dir } - scatter(tar in Demultiplexing.tarred_demultiplexed_fastqs) { - call Hisat_paired_end as Hisat_paired_end { - input: - tarred_demultiplexed_fastqs = tar, - tarred_index_files = tarred_index_files, - genome_fa = genome_fa, - chromosome_sizes = chromosome_sizes, - min_read_length = min_read_length, - r1_adapter = r1_adapter, - r2_adapter = r2_adapter, - r1_left_cut = r1_left_cut, - r1_right_cut = r1_right_cut, - r2_left_cut = r2_left_cut, - r2_right_cut = r2_right_cut, - plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir - } - - call Hisat_single_end as Hisat_single_end { - input: - split_fq_tar = Hisat_paired_end.split_fq_tar, - tarred_index_files = tarred_index_files, - genome_fa = genome_fa, - plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir - } - - call Merge_sort_analyze as Merge_sort_analyze 
{ - input: - paired_end_unique_tar = Hisat_paired_end.unique_bam_tar, - read_overlap_tar = Hisat_single_end.remove_overlaps_output_bam_tar, - genome_fa = genome_fa, - num_upstr_bases = num_upstr_bases, - num_downstr_bases = num_downstr_bases, - compress_level = compress_level, - chromosome_sizes = chromosome_sizes, - plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir - } - } - - call Summary { - input: - trimmed_stats = Hisat_paired_end.trim_stats_tar, - hisat3n_stats = Hisat_paired_end.hisat3n_paired_end_stats_tar, - r1_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R1_tar, - r2_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R2_tar, - dedup_stats = Merge_sort_analyze.dedup_stats_tar, - chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats, - allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, - unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, - plate_id = plate_id, - docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir - } - - meta { - allowNestedInputs: true - } - - output { - File MappingSummary = Summary.mapping_summary - Array[File] name_sorted_bams = Merge_sort_analyze.name_sorted_bam - Array[File] unique_reads_cgn_extraction_allc= Merge_sort_analyze.allc - Array[File] unique_reads_cgn_extraction_tbi = Merge_sort_analyze.tbi - Array[File] reference_version = Hisat_paired_end.reference_version - Array[File] all_reads_dedup_contacts = Merge_sort_analyze.all_reads_dedup_contacts - Array[File] all_reads_3C_contacts = Merge_sort_analyze.all_reads_3C_contacts - Array[File] chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats - Array[File] unique_reads_cgn_extraction_allc_extract = Merge_sort_analyze.extract_allc_output_allc_tar - Array[File] unique_reads_cgn_extraction_tbi_extract = Merge_sort_analyze.extract_allc_output_tbi_tar - - } + #scatter(tar in 
Demultiplexing.tarred_demultiplexed_fastqs) { + # call Hisat_paired_end as Hisat_paired_end { + # input: + # tarred_demultiplexed_fastqs = tar, + # tarred_index_files = tarred_index_files, + # genome_fa = genome_fa, + # chromosome_sizes = chromosome_sizes, + # min_read_length = min_read_length, + # r1_adapter = r1_adapter, + # r2_adapter = r2_adapter, + # r1_left_cut = r1_left_cut, + # r1_right_cut = r1_right_cut, + # r2_left_cut = r2_left_cut, + # r2_right_cut = r2_right_cut, + # plate_id = plate_id, + # docker = docker_prefix + m3c_yap_hisat_docker, + # cromwell_root_dir = cromwell_root_dir + # } + + # call Hisat_single_end as Hisat_single_end { + # input: + # split_fq_tar = Hisat_paired_end.split_fq_tar, + # tarred_index_files = tarred_index_files, + # genome_fa = genome_fa, + # plate_id = plate_id, + # docker = docker_prefix + m3c_yap_hisat_docker, + # cromwell_root_dir = cromwell_root_dir + # } + + # call Merge_sort_analyze as Merge_sort_analyze { + # input: + # paired_end_unique_tar = Hisat_paired_end.unique_bam_tar, + # read_overlap_tar = Hisat_single_end.remove_overlaps_output_bam_tar, + # genome_fa = genome_fa, + # num_upstr_bases = num_upstr_bases, + # num_downstr_bases = num_downstr_bases, + # compress_level = compress_level, + # chromosome_sizes = chromosome_sizes, + # plate_id = plate_id, + # docker = docker_prefix + m3c_yap_hisat_docker, + # cromwell_root_dir = cromwell_root_dir + #} + #} + + #call Summary { + # input: + # trimmed_stats = Hisat_paired_end.trim_stats_tar, + # hisat3n_stats = Hisat_paired_end.hisat3n_paired_end_stats_tar, + # r1_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R1_tar, + # r2_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R2_tar, + # dedup_stats = Merge_sort_analyze.dedup_stats_tar, + # chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats, + # allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, + # unique_reads_cgn_extraction_tbi = 
Merge_sort_analyze.extract_allc_output_tbi_tar, + # plate_id = plate_id, + # docker = docker_prefix + m3c_yap_hisat_docker, + # cromwell_root_dir = cromwell_root_dir + #} + + #meta { + # allowNestedInputs: true + #} + + #output { + # File MappingSummary = Summary.mapping_summary + ## Array[File] name_sorted_bams = Merge_sort_analyze.name_sorted_bam + # Array[File] unique_reads_cgn_extraction_allc= Merge_sort_analyze.allc + # Array[File] unique_reads_cgn_extraction_tbi = Merge_sort_analyze.tbi + # Array[File] reference_version = Hisat_paired_end.reference_version + # Array[File] all_reads_dedup_contacts = Merge_sort_analyze.all_reads_dedup_contacts + # Array[File] all_reads_3C_contacts = Merge_sort_analyze.all_reads_3C_contacts + # Array[File] chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats + # Array[File] unique_reads_cgn_extraction_allc_extract = Merge_sort_analyze.extract_allc_output_allc_tar + # Array[File] unique_reads_cgn_extraction_tbi_extract = Merge_sort_analyze.extract_allc_output_tbi_tar + + #} } task Demultiplexing { From 6de5e9e780e6a52cdbe1ebdd416288a3f53144f6 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 6 May 2024 12:50:33 -0400 Subject: [PATCH 067/186] write files to current working dir --- pipelines/skylab/snm3C/snm3C.wdl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 708acb018f..3bf1175ab6 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -157,11 +157,16 @@ task Demultiplexing { ls -lR pwd - + working_directory = `pwd` + echo $working_directory # Cat files for each r1, r2 - cat ~{sep=' ' fastq_input_read1} > ~{cromwell_root_dir}/r1.fastq.gz - cat ~{sep=' ' fastq_input_read2} > ~{cromwell_root_dir}/r2.fastq.gz + cat ~{sep=' ' fastq_input_read1} > $working_directory/r1.fastq.gz + cat ~{sep=' ' fastq_input_read2} > $working_directory/r2.fastq.gz + + echo "successfully catted 
files" + pwd + ls # Run cutadapt /opt/conda/bin/cutadapt -Z -e 0.01 --no-indels -j 8 \ @@ -170,7 +175,7 @@ task Demultiplexing { -p ~{plate_id}-{name}-R2.fq.gz \ r1.fastq.gz \ r2.fastq.gz \ - > ~{cromwell_root_dir}/~{plate_id}.stats.txt + > $working_directory/~{plate_id}.stats.txt echo "RAN CUT ADAPT" From bee07db56845850bef96f1a47c106a340184c1bd Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 6 May 2024 13:11:47 -0400 Subject: [PATCH 068/186] write files to current working dir --- pipelines/skylab/snm3C/snm3C.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 3bf1175ab6..f9826ed4cc 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -157,7 +157,8 @@ task Demultiplexing { ls -lR pwd - working_directory = `pwd` + echo "setting directory" + working_directory=`pwd` echo $working_directory # Cat files for each r1, r2 From 19a95560b09db90e23b3eaae0afb6eeeea21a38f Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 6 May 2024 13:27:13 -0400 Subject: [PATCH 069/186] write files to current working dir --- pipelines/skylab/snm3C/snm3C.wdl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index f9826ed4cc..f5aae4b2d3 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -158,12 +158,12 @@ task Demultiplexing { ls -lR pwd echo "setting directory" - working_directory=`pwd` - echo $working_directory + WORKING_DIR=`pwd` + echo $WORKING_DIR # Cat files for each r1, r2 - cat ~{sep=' ' fastq_input_read1} > $working_directory/r1.fastq.gz - cat ~{sep=' ' fastq_input_read2} > $working_directory/r2.fastq.gz + cat ~{sep=' ' fastq_input_read1} > $WORKING_DIR/r1.fastq.gz + cat ~{sep=' ' fastq_input_read2} > $WORKING_DIR/r2.fastq.gz echo "successfully catted files" pwd @@ -174,14 +174,14 @@ task Demultiplexing { -g 
file:~{random_primer_indexes} \ -o ~{plate_id}-{name}-R1.fq.gz \ -p ~{plate_id}-{name}-R2.fq.gz \ - r1.fastq.gz \ - r2.fastq.gz \ - > $working_directory/~{plate_id}.stats.txt + $WORKING_DIR/r1.fastq.gz \ + $WORKING_DIR/r2.fastq.gz \ + > $WORKING_DIR/~{plate_id}.stats.txt echo "RAN CUT ADAPT" # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz - rm ~{cromwell_root_dir}/*-unknown-R{1,2}.fq.gz + rm $WORKING_DIR/*-unknown-R{1,2}.fq.gz echo "REMOVED FILES" From 0605e65303708b9941a45f153cb1e82159edde58 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 6 May 2024 13:55:54 -0400 Subject: [PATCH 070/186] add some of demultiplexing steps back in --- pipelines/skylab/snm3C/snm3C.wdl | 105 ++++++++++++++++--------------- 1 file changed, 55 insertions(+), 50 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index f5aae4b2d3..ee83007f16 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -185,72 +185,77 @@ task Demultiplexing { echo "REMOVED FILES" - #python3 < threshold: - # os.remove(file_path) - # print(f'Removed file: {filename}') - #CODE - - #echo "RAN PYTHON SNIPPET" + threshold = 10000000 + + # TODO remove these prints: + all_fastqs = os.listdir(working_dir) + print(f"all fastq files: {all_fastqs}" + + for filename in os.listdir(working_dir): + if filename.endswith('.fq.gz'): + file_path = os.path.join(working_dir, filename) + adapter_name = re.search(r'A(\d+)-R', filename) + if adapter_name: + adapter_name = 'A' + adapter_name.group(1) + if adapter_name in adapter_counts and adapter_counts[adapter_name] > threshold: + os.remove(file_path) + print(f'Removed file: {filename}') + CODE + + echo "RAN PYTHON SNIPPET" # Batch the fastq files into folders of batch_number size - #batch_number=~{batch_number} - #for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion - # mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs - #done 
+ batch_number=~{batch_number} + for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion + mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs + done - #echo "BATCHED FASTQ FILES INTO FOLDERS" + echo "BATCHED FASTQ FILES INTO FOLDERS" # Counter for the folder index - #folder_index=1 + folder_index=1 + WORKING_DIR=`pwd` # Define lists of r1 and r2 fq files - #R1_files=($(ls ~{cromwell_root_dir} | grep "\-R1.fq.gz")) - #R2_files=($(ls ~{cromwell_root_dir} | grep "\-R2.fq.gz")) + R1_files=($(ls $WORKING_DIR | grep "\-R1.fq.gz")) + R2_files=($(ls $WORKING_DIR | grep "\-R2.fq.gz")) - #echo "STARTING TAR JOB" + echo "STARTING TAR JOB" # Distribute the FASTQ files and create TAR files - #for file in "${R1_files[@]}"; do - # sample_id=$(basename "$file" "-R1.fq.gz") - # r2_file="${sample_id}-R2.fq.gz" - # mv ~{cromwell_root_dir}/$file batch$((folder_index))/$file - # mv ~{cromwell_root_dir}/$r2_file batch$((folder_index))/$r2_file + for file in "${R1_files[@]}"; do + sample_id=$(basename "$file" "-R1.fq.gz") + r2_file="${sample_id}-R2.fq.gz" + mv $WORKING_DIR/$file batch$((folder_index))/$file + mv $WORKING_DIR/$r2_file batch$((folder_index))/$r2_file # Increment the counter - # folder_index=$(( (folder_index % $batch_number) + 1 )) - #done + folder_index=$(( (folder_index % $batch_number) + 1 )) + done # Tar up files per batch - #echo "TAR files" - #for i in $(seq 1 "${batch_number}"); do - # tar -cf - ~{cromwell_root_dir}/batch${i}/*.fq.gz | pigz > ~{cromwell_root_dir}/~{plate_id}.${i}.cutadapt_output_files.tar.gz - #done - #echo "TAR files created successfully." + echo "TAR files" + for i in $(seq 1 "${batch_number}"); do + tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > $WORKING_DIR/~{plate_id}.${i}.cutadapt_output_files.tar.gz + done + echo "TAR files created successfully." 
>>> @@ -263,7 +268,7 @@ task Demultiplexing { } output { - #Array[File] tarred_demultiplexed_fastqs = glob("*.tar.gz") + Array[File] tarred_demultiplexed_fastqs = glob("*.tar.gz") File stats = "~{plate_id}.stats.txt" } } From e2f98c85dc78478b19da78eca007f1170f54b0ff Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 6 May 2024 14:04:31 -0400 Subject: [PATCH 071/186] typo --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index ee83007f16..e74e651d05 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -191,7 +191,7 @@ task Demultiplexing { # Parsing stats.txt file working_dir = os.getcwd() - stats_file_path = os.path.join(working_dir, ~{plate_id}.stats.txt') + stats_file_path = os.path.join(working_dir, '~{plate_id}.stats.txt') adapter_counts = {} with open(stats_file_path, 'r') as file: content = file.read() From 1d8745cabf1664e246f42fe24103c89f216f5917 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 6 May 2024 14:14:54 -0400 Subject: [PATCH 072/186] typo --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index e74e651d05..4b381407e5 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -195,7 +195,7 @@ task Demultiplexing { adapter_counts = {} with open(stats_file_path, 'r') as file: content = file.read() - print("opened stats file) + print("opened stats file") adapter_matches = re.findall(r'=== First read: Adapter (\w+) ===\n\nSequence: .+; Type: .+; Length: \d+; Trimmed: (\d+) times', content) for adapter_match in adapter_matches: adapter_name = adapter_match[0] From e672e3de0a7a96734b223eb62c7223e173e31cb9 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 6 May 2024 14:24:44 -0400 Subject: [PATCH 073/186] typo --- 
pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 4b381407e5..8e8df191ca 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -207,7 +207,7 @@ task Demultiplexing { # TODO remove these prints: all_fastqs = os.listdir(working_dir) - print(f"all fastq files: {all_fastqs}" + print(f"all fastq files: {all_fastqs}") for filename in os.listdir(working_dir): if filename.endswith('.fq.gz'): From 14b6958247dd0573d495ceb7aeb9b13d2966db51 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 6 May 2024 14:29:20 -0400 Subject: [PATCH 074/186] remove print --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 8e8df191ca..47bfb38483 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -207,7 +207,7 @@ task Demultiplexing { # TODO remove these prints: all_fastqs = os.listdir(working_dir) - print(f"all fastq files: {all_fastqs}") + for filename in os.listdir(working_dir): if filename.endswith('.fq.gz'): From 84a8777b5e05c2afdfddb898d9d184afff05ec8c Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 6 May 2024 15:10:43 -0400 Subject: [PATCH 075/186] uncomment rest of workflow --- pipelines/skylab/snm3C/snm3C.wdl | 196 +++++++++++++------------------ 1 file changed, 84 insertions(+), 112 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 47bfb38483..059e80a897 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -54,85 +54,84 @@ workflow snm3C { plate_id = plate_id, batch_number = batch_number, docker = docker_prefix + m3c_yap_hisat_docker, + } + + scatter(tar in Demultiplexing.tarred_demultiplexed_fastqs) { + call Hisat_paired_end as Hisat_paired_end { + input: + 
tarred_demultiplexed_fastqs = tar, + tarred_index_files = tarred_index_files, + genome_fa = genome_fa, + chromosome_sizes = chromosome_sizes, + min_read_length = min_read_length, + r1_adapter = r1_adapter, + r2_adapter = r2_adapter, + r1_left_cut = r1_left_cut, + r1_right_cut = r1_right_cut, + r2_left_cut = r2_left_cut, + r2_right_cut = r2_right_cut, + plate_id = plate_id, + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir + } + + call Hisat_single_end as Hisat_single_end { + input: + split_fq_tar = Hisat_paired_end.split_fq_tar, + tarred_index_files = tarred_index_files, + genome_fa = genome_fa, + plate_id = plate_id, + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir + } + + call Merge_sort_analyze as Merge_sort_analyze { + input: + paired_end_unique_tar = Hisat_paired_end.unique_bam_tar, + read_overlap_tar = Hisat_single_end.remove_overlaps_output_bam_tar, + genome_fa = genome_fa, + num_upstr_bases = num_upstr_bases, + num_downstr_bases = num_downstr_bases, + compress_level = compress_level, + chromosome_sizes = chromosome_sizes, + plate_id = plate_id, + docker = docker_prefix + m3c_yap_hisat_docker, + cromwell_root_dir = cromwell_root_dir + } + } + + call Summary { + input: + trimmed_stats = Hisat_paired_end.trim_stats_tar, + hisat3n_stats = Hisat_paired_end.hisat3n_paired_end_stats_tar, + r1_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R1_tar, + r2_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R2_tar, + dedup_stats = Merge_sort_analyze.dedup_stats_tar, + chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats, + allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, + unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, + plate_id = plate_id, + docker = docker_prefix + m3c_yap_hisat_docker, cromwell_root_dir = cromwell_root_dir } - #scatter(tar in Demultiplexing.tarred_demultiplexed_fastqs) { - 
# call Hisat_paired_end as Hisat_paired_end { - # input: - # tarred_demultiplexed_fastqs = tar, - # tarred_index_files = tarred_index_files, - # genome_fa = genome_fa, - # chromosome_sizes = chromosome_sizes, - # min_read_length = min_read_length, - # r1_adapter = r1_adapter, - # r2_adapter = r2_adapter, - # r1_left_cut = r1_left_cut, - # r1_right_cut = r1_right_cut, - # r2_left_cut = r2_left_cut, - # r2_right_cut = r2_right_cut, - # plate_id = plate_id, - # docker = docker_prefix + m3c_yap_hisat_docker, - # cromwell_root_dir = cromwell_root_dir - # } - - # call Hisat_single_end as Hisat_single_end { - # input: - # split_fq_tar = Hisat_paired_end.split_fq_tar, - # tarred_index_files = tarred_index_files, - # genome_fa = genome_fa, - # plate_id = plate_id, - # docker = docker_prefix + m3c_yap_hisat_docker, - # cromwell_root_dir = cromwell_root_dir - # } - - # call Merge_sort_analyze as Merge_sort_analyze { - # input: - # paired_end_unique_tar = Hisat_paired_end.unique_bam_tar, - # read_overlap_tar = Hisat_single_end.remove_overlaps_output_bam_tar, - # genome_fa = genome_fa, - # num_upstr_bases = num_upstr_bases, - # num_downstr_bases = num_downstr_bases, - # compress_level = compress_level, - # chromosome_sizes = chromosome_sizes, - # plate_id = plate_id, - # docker = docker_prefix + m3c_yap_hisat_docker, - # cromwell_root_dir = cromwell_root_dir - #} - #} - - #call Summary { - # input: - # trimmed_stats = Hisat_paired_end.trim_stats_tar, - # hisat3n_stats = Hisat_paired_end.hisat3n_paired_end_stats_tar, - # r1_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R1_tar, - # r2_hisat3n_stats = Hisat_single_end.hisat3n_dna_split_reads_summary_R2_tar, - # dedup_stats = Merge_sort_analyze.dedup_stats_tar, - # chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats, - # allc_uniq_reads_stats = Merge_sort_analyze.allc_uniq_reads_stats, - # unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, - # plate_id = plate_id, - 
# docker = docker_prefix + m3c_yap_hisat_docker, - # cromwell_root_dir = cromwell_root_dir - #} - - #meta { - # allowNestedInputs: true - #} - - #output { - # File MappingSummary = Summary.mapping_summary - ## Array[File] name_sorted_bams = Merge_sort_analyze.name_sorted_bam - # Array[File] unique_reads_cgn_extraction_allc= Merge_sort_analyze.allc - # Array[File] unique_reads_cgn_extraction_tbi = Merge_sort_analyze.tbi - # Array[File] reference_version = Hisat_paired_end.reference_version - # Array[File] all_reads_dedup_contacts = Merge_sort_analyze.all_reads_dedup_contacts - # Array[File] all_reads_3C_contacts = Merge_sort_analyze.all_reads_3C_contacts - # Array[File] chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats - # Array[File] unique_reads_cgn_extraction_allc_extract = Merge_sort_analyze.extract_allc_output_allc_tar - # Array[File] unique_reads_cgn_extraction_tbi_extract = Merge_sort_analyze.extract_allc_output_tbi_tar - - #} + meta { + allowNestedInputs: true + } + + output { + File MappingSummary = Summary.mapping_summary + Array[File] name_sorted_bams = Merge_sort_analyze.name_sorted_bam + Array[File] unique_reads_cgn_extraction_allc= Merge_sort_analyze.allc + Array[File] unique_reads_cgn_extraction_tbi = Merge_sort_analyze.tbi + Array[File] reference_version = Hisat_paired_end.reference_version + Array[File] all_reads_dedup_contacts = Merge_sort_analyze.all_reads_dedup_contacts + Array[File] all_reads_3C_contacts = Merge_sort_analyze.all_reads_3C_contacts + Array[File] chromatin_contact_stats = Merge_sort_analyze.chromatin_contact_stats + Array[File] unique_reads_cgn_extraction_allc_extract = Merge_sort_analyze.extract_allc_output_allc_tar + Array[File] unique_reads_cgn_extraction_tbi_extract = Merge_sort_analyze.extract_allc_output_tbi_tar + + } } task Demultiplexing { @@ -143,7 +142,6 @@ task Demultiplexing { String plate_id Int batch_number String docker - String cromwell_root_dir Int disk_size = 1000 Int mem_size = 10 @@ -152,23 
+150,13 @@ task Demultiplexing { } command <<< - echo "TEST" set -euo pipefail - - ls -lR - pwd - echo "setting directory" WORKING_DIR=`pwd` - echo $WORKING_DIR # Cat files for each r1, r2 cat ~{sep=' ' fastq_input_read1} > $WORKING_DIR/r1.fastq.gz cat ~{sep=' ' fastq_input_read2} > $WORKING_DIR/r2.fastq.gz - echo "successfully catted files" - pwd - ls - # Run cutadapt /opt/conda/bin/cutadapt -Z -e 0.01 --no-indels -j 8 \ -g file:~{random_primer_indexes} \ @@ -178,13 +166,9 @@ task Demultiplexing { $WORKING_DIR/r2.fastq.gz \ > $WORKING_DIR/~{plate_id}.stats.txt - echo "RAN CUT ADAPT" - # remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz rm $WORKING_DIR/*-unknown-R{1,2}.fq.gz - echo "REMOVED FILES" - python3 < $WORKING_DIR/~{plate_id}.${i}.cutadapt_output_files.tar.gz done - echo "TAR files created successfully." - >>> runtime { @@ -301,6 +272,7 @@ task Hisat_paired_end { set -euo pipefail set -x lscpu + WORKING_DIR=`pwd` # check genomic reference version and print to output txt file STRING=~{genome_fa} @@ -349,7 +321,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r1" - zcat ~{cromwell_root_dir}/batch*/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" + zcat $WORKING_DIR/batch*/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r1: $elapsed seconds" @@ -357,7 +329,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r2" - zcat ~{cromwell_root_dir}/batch*/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" + zcat $WORKING_DIR/batch*/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r2: $elapsed seconds" @@ -376,10 +348,10 @@ task Hisat_paired_end { -Z \ -m ${min_read_length}:${min_read_length} \ 
--pair-filter 'both' \ - -o ${sample_id}-R1_trimmed.fq.gz \ - -p ${sample_id}-R2_trimmed.fq.gz \ - ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq \ - > ${sample_id}.trimmed.stats.txt + -o $WORKING_DIR/${sample_id}-R1_trimmed.fq.gz \ + -p $WORKING_DIR/${sample_id}-R2_trimmed.fq.gz \ + $WORKING_DIR/${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq \ + > $WORKING_DIR/${sample_id}.trimmed.stats.txt end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run cutadapt: $elapsed seconds" @@ -574,7 +546,7 @@ task Hisat_single_end { end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar split_fq_tar: $elapsed seconds" - + # make directories mkdir -p ~{cromwell_root_dir}/merged_sort_bams mkdir -p ~{cromwell_root_dir}/read_overlap From a4386ce7bc4dbe59a1d9dd149f2f6ff702621ace Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 7 May 2024 09:22:01 -0400 Subject: [PATCH 076/186] add working dir to batch subdir --- pipelines/skylab/snm3C/snm3C.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 059e80a897..9636e3d13e 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -391,7 +391,7 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" - rm ~{cromwell_root_dir}/batch*/${sample_id}-R1.fq.gz ~{cromwell_root_dir}/batch*/${sample_id}-R2.fq.gz + rm $WORKING_DIR/batch*/${sample_id}-R1.fq.gz $WORKING_DIR/batch*/${sample_id}-R2.fq.gz rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq rm ${sample_id}-R1_trimmed.fq.gz ${sample_id}-R2_trimmed.fq.gz rm ${sample_id}.hisat3n_dna.unsort.bam ${sample_id}.hisat3n_dna.multi_aligned.bam @@ -399,8 +399,8 @@ task Hisat_paired_end { } # define lists of r1 and r2 fq files - R1_files=($(ls batch*/ | grep "\-R1.fq.gz")) - R2_files=($(ls batch*/ | grep "\-R2.fq.gz")) + R1_files=($(ls $WORKING_DIR/batch*/ | grep 
"\-R1.fq.gz")) + R2_files=($(ls $WORKING_DIR/batch*/ | grep "\-R2.fq.gz")) # for file in "${R1_files[@]}"; do # ( From 2098784872003ceec15b8c827ffe0d745dedf7b0 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 7 May 2024 10:55:15 -0400 Subject: [PATCH 077/186] ls cromwell root batch --- pipelines/skylab/snm3C/snm3C.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 9636e3d13e..0ccea08aee 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -399,8 +399,8 @@ task Hisat_paired_end { } # define lists of r1 and r2 fq files - R1_files=($(ls $WORKING_DIR/batch*/ | grep "\-R1.fq.gz")) - R2_files=($(ls $WORKING_DIR/batch*/ | grep "\-R2.fq.gz")) + R1_files=($(ls ~{cromwell_root_dir}/batch*/ | grep "\-R1.fq.gz")) + R2_files=($(ls ~{cromwell_root_dir}/batch*/ | grep "\-R2.fq.gz")) # for file in "${R1_files[@]}"; do # ( From 7704e7d1b5a7317b260aab73d4de037ecc3a27b8 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 7 May 2024 12:08:44 -0400 Subject: [PATCH 078/186] ls directories to find batch dir --- pipelines/skylab/snm3C/snm3C.wdl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 0ccea08aee..1e4b3d58ff 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -309,6 +309,11 @@ task Hisat_paired_end { end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar: $elapsed seconds" + + echo "lsing current dir:" + ls -lR + echo "lsing cromwell root:" + ls -lR ~{cromwell_root_dir} task() { local file=$1 From 63466c5087175010b2e06376b8a2ddc8ad899046 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 7 May 2024 13:11:56 -0400 Subject: [PATCH 079/186] update changelogs --- pipelines/skylab/multiome/Multiome.wdl | 2 +- pipelines/skylab/optimus/Optimus.wdl | 2 +- pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- 3 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 8bfd9c7222..e8b901fad9 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "3.4.3" + String pipeline_version = "3.4.4" input { String cloud_provider diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index d1965fdd9b..b4b0196f89 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -68,7 +68,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = "6.6.2" + String pipeline_version = "6.6.3" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 8005922895..0502a32fcd 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.1.5" + String pipeline_version = "3.1.6" input { Array[File] r1_fastq From 5ba8021c1572d2f64b3a8c1ee7aa887f3d81efa0 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 7 May 2024 13:29:46 -0400 Subject: [PATCH 080/186] set batch dir --- pipelines/skylab/snm3C/snm3C.wdl | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 1e4b3d58ff..15bd22d844 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -314,7 +314,16 @@ task Hisat_paired_end { ls -lR echo "lsing cromwell root:" ls -lR ~{cromwell_root_dir} - + + # define lists of r1 and r2 fq files + if [ ~{cromwell_root_dir} = "gcp" ]; then + batch_dir="batch*/" + else + 
batch_dir="/~{cromwell_root_dir}/*/*/call-Demultiplexing/execution/batch*/" + fi + echo "batchdirectory: $batch_dir" + + task() { local file=$1 sample_id=$(basename "$file" "-R1.fq.gz") @@ -326,7 +335,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r1" - zcat $WORKING_DIR/batch*/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" + zcat $batch_dir/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r1: $elapsed seconds" @@ -334,7 +343,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r2" - zcat $WORKING_DIR/batch*/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" + zcat $batch_dir/batch*/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r2: $elapsed seconds" @@ -396,30 +405,33 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" - rm $WORKING_DIR/batch*/${sample_id}-R1.fq.gz $WORKING_DIR/batch*/${sample_id}-R2.fq.gz + rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/batch*/${sample_id}-R2.fq.gz rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq rm ${sample_id}-R1_trimmed.fq.gz ${sample_id}-R2_trimmed.fq.gz rm ${sample_id}.hisat3n_dna.unsort.bam ${sample_id}.hisat3n_dna.multi_aligned.bam rm ${sample_id}.hisat3n_dna.unmapped.fastq } - # define lists of r1 and r2 fq files - R1_files=($(ls ~{cromwell_root_dir}/batch*/ | grep "\-R1.fq.gz")) - R2_files=($(ls ~{cromwell_root_dir}/batch*/ | grep "\-R2.fq.gz")) + + R1_files=($(ls $batch_dir | grep "\-R1.fq.gz")) + R2_files=($(ls $batch_dir | grep "\-R2.fq.gz")) + + echo "r1 files: $R1_files" + echo "r2 files: $R2_files" # for file in "${R1_files[@]}"; do # ( # echo "starting task $file.." 
# du -h batch*/$file # task "$file" - # ) + # ) # done # run 6 instances of task in parallel for file in "${R1_files[@]}"; do ( echo "starting task $file.." - du -h batch*/$file + du -h $batch_dir/$file task "$file" sleep $(( (RANDOM % 3) + 1)) ) & From 988a92048ba7568118b620c90ff4ac3c7cee9fe0 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 7 May 2024 13:55:27 -0400 Subject: [PATCH 081/186] remove extra leading slash --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 15bd22d844..22d8fdfcc6 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -319,7 +319,7 @@ task Hisat_paired_end { if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" else - batch_dir="/~{cromwell_root_dir}/*/*/call-Demultiplexing/execution/batch*/" + batch_dir="~{cromwell_root_dir}/*/*/call-Demultiplexing/execution/batch*/" fi echo "batchdirectory: $batch_dir" From 03f5870e20e627ba4131e77b8b3abf3e357bc667 Mon Sep 17 00:00:00 2001 From: npetrill Date: Wed, 8 May 2024 09:54:26 -0400 Subject: [PATCH 082/186] fixing what i messed up in resvoling conflicts --- tasks/skylab/StarAlign.wdl | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tasks/skylab/StarAlign.wdl b/tasks/skylab/StarAlign.wdl index 1ad3126d21..d6fe440302 100644 --- a/tasks/skylab/StarAlign.wdl +++ b/tasks/skylab/StarAlign.wdl @@ -344,8 +344,11 @@ task STARsoloFastq { then SoloDirectory="Solo.out/Gene/raw" echo "SoloDirectory is $SoloDirectory" - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" 
-print0 | xargs -0 -I{} mv {} /cromwell_root/ + echo "list matrix files in $SoloDirectory" + ls "$SoloDirectory"/*.mtx + mv $SoloDirectory/matrix.mtx matrix.mtx mv "Solo.out/Gene/raw/barcodes.tsv" barcodes.tsv mv "Solo.out/Gene/raw/features.tsv" features.tsv mv "Solo.out/Gene/CellReads.stats" CellReads.stats @@ -358,8 +361,11 @@ task STARsoloFastq { then SoloDirectory="Solo.out/GeneFull_Ex50pAS/raw" echo "SoloDirectory is $SoloDirectory" - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + echo "list matrix files in $SoloDirectory" + ls "$SoloDirectory"/*.mtx + mv $SoloDirectory/matrix.mtx matrix.mtx mv "Solo.out/GeneFull_Ex50pAS/raw/barcodes.tsv" barcodes.tsv mv "Solo.out/GeneFull_Ex50pAS/raw/features.tsv" features.tsv mv "Solo.out/GeneFull_Ex50pAS/CellReads.stats" CellReads.stats @@ -369,12 +375,18 @@ task STARsoloFastq { else SoloDirectory="Solo.out/GeneFull_Ex50pAS/raw" echo "SoloDirectory is $SoloDirectory" - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} echo mv {} /cromwell_root/ + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} mv {} /cromwell_root/ + echo "list matrix files in $SoloDirectory" + ls "$SoloDirectory"/*.mtx + mv $SoloDirectory/matrix.mtx matrix.mtx SoloDirectory="Solo.out/Gene/raw" echo "SoloDirectory is $SoloDirectory" - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 
-I{} sh -c 'new_name="$(basename {} .mtx)_sn_rna.mtx"; echo mv {} "/cromwell_root/$new_name"' - find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} sh -c 'new_name="$(basename {} .mtx)_sn_rna.mtx"; mv {} "/cromwell_root/$new_name"' + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} sh -c 'new_name="$(basename {} .mtx)_sn_rna.mtx"; echo mv {} "/cromwell_root/$new_name"' + #find "$SoloDirectory" -maxdepth 1 -type f -name "*.mtx" -print0 | xargs -0 -I{} sh -c 'new_name="$(basename {} .mtx)_sn_rna.mtx"; mv {} "/cromwell_root/$new_name"' + echo "list matrix files in $SoloDirectory" + ls "$SoloDirectory"/*.mtx + mv $SoloDirectory/matrix.mtx matrix_sn_rna.mtx mv "Solo.out/GeneFull_Ex50pAS/raw/barcodes.tsv" barcodes.tsv mv "Solo.out/GeneFull_Ex50pAS/raw/features.tsv" features.tsv mv "Solo.out/GeneFull_Ex50pAS/CellReads.stats" CellReads.stats From cd9d4608dff063c8df969bf7c8d28127671693ec Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Wed, 8 May 2024 16:26:40 -0400 Subject: [PATCH 083/186] fix batch dir --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 22d8fdfcc6..689bba7849 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -319,7 +319,7 @@ task Hisat_paired_end { if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" else - batch_dir="~{cromwell_root_dir}/*/*/call-Demultiplexing/execution/batch*/" + batch_dir="~{cromwell_root_dir}/*/*/*/*/*/~{cromwell_root_dir}/*/*/*/*/batch*/" fi echo "batchdirectory: $batch_dir" From e63ff3ddc3120690c944e3f46f1a31e761e1884c Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 9 May 2024 09:27:46 -0400 Subject: [PATCH 084/186] need to loop through array --- tasks/skylab/FastqProcessing.wdl | 45 ++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git 
a/tasks/skylab/FastqProcessing.wdl b/tasks/skylab/FastqProcessing.wdl index c7ae558cf2..20a7169d29 100644 --- a/tasks/skylab/FastqProcessing.wdl +++ b/tasks/skylab/FastqProcessing.wdl @@ -294,16 +294,39 @@ task FastqProcessATAC { echo $read1_fastq_files # Make downsample fq for barcode orientation check of R2 barcodes - mkdir input_fastq - mv $read1_fastq_files input_fastq/ - mv $read2_fastq_files input_fastq/ - mv $read3_fastq_files input_fastq/ + mkdir -p input_fastqs - #gcloud storage cp $read1_fastq_files /cromwell_root/input_fastq - #gcloud storage cp $read2_fastq_files /cromwell_root/input_fastq - #gcloud storage cp $read3_fastq_files /cromwell_root/input_fastq + # Function to move files into the input_fastqs directory + move_files_to_input_dir() { + local -n array=$1 # Reference to the array passed as argument + local destination_dir=$2 - path="input_fastq/" + for file in "${array[@]}"; do + if [ -f "$file" ]; then # Check if file exists + echo "Moving $file to $destination_dir" + mv "$file" "$destination_dir" + else + echo "File $file not found" + fi + done + } + + # Move files from FASTQ1_ARRAY to input_fastqs directory + move_files_to_input_dir FASTQ1_ARRAY input_fastqs + + # Move files from FASTQ2_ARRAY to input_fastqs directory + move_files_to_input_dir FASTQ2_ARRAY input_fastqs + + # Move files from FASTQ3_ARRAY to input_fastqs directory + move_files_to_input_dir FASTQ3_ARRAY input_fastqs + + echo "All files moved to input_fastqs directory" + + #gcloud storage cp $read1_fastq_files /cromwell_root/input_fastqs + #gcloud storage cp $read2_fastq_files /cromwell_root/input_fastqs + #gcloud storage cp $read3_fastq_files /cromwell_root/input_fastqs + + path="input_fastqs/" barcode_index="~{barcode_index1}" file="${path}${barcode_index}" zcat "$file" | sed -n '2~4p' | shuf -n 1000 > downsample.fq @@ -313,7 +336,7 @@ task FastqProcessATAC { for fastq in "${FASTQ2_ARRAY[@]}" do BASE=`basename $fastq` - BASE=`echo --R1 input_fastq/$BASE` + BASE=`echo --R1 
input_fastqs/$BASE` R1_FILES_CONCAT+="$BASE " done echo $R1_FILES_CONCAT @@ -323,7 +346,7 @@ task FastqProcessATAC { for fastq in "${FASTQ1_ARRAY[@]}" do BASE=`basename $fastq` - BASE=`echo --R2 input_fastq/$BASE` + BASE=`echo --R2 input_fastqs/$BASE` R2_FILES_CONCAT+="$BASE " done echo $R2_FILES_CONCAT @@ -333,7 +356,7 @@ task FastqProcessATAC { for fastq in "${FASTQ3_ARRAY[@]}" do BASE=`basename $fastq` - BASE=`echo --R3 input_fastq/$BASE` + BASE=`echo --R3 input_fastqs/$BASE` R3_FILES_CONCAT+="$BASE " done echo $R3_FILES_CONCAT From 49f6df10000f95a990ff45e257e0e8ee59697dd9 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 12:02:41 -0400 Subject: [PATCH 085/186] add lots of logging to batch logic --- pipelines/skylab/snm3C/snm3C.wdl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 689bba7849..f08eee5596 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -202,7 +202,9 @@ task Demultiplexing { # Batch the fastq files into folders of batch_number size batch_number=~{batch_number} + echo "batch number: $batch_number" for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion + echo "making batch directory: batch${i}" mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs done @@ -213,19 +215,28 @@ task Demultiplexing { # Define lists of r1 and r2 fq files R1_files=($(ls $WORKING_DIR | grep "\-R1.fq.gz")) R2_files=($(ls $WORKING_DIR | grep "\-R2.fq.gz")) + echo "R1 files: $R1_files" + echo "R2 files: $R2_files" # Distribute the FASTQ files and create TAR files + echo "starting loop of files" for file in "${R1_files[@]}"; do sample_id=$(basename "$file" "-R1.fq.gz") + echo "sampleId: $sample_id" r2_file="${sample_id}-R2.fq.gz" + echo "r2 file: $r2_file" mv $WORKING_DIR/$file batch$((folder_index))/$file + echo "moved $WORKING_DIR/$file to: batch$((folder_index))/$file" mv 
$WORKING_DIR/$r2_file batch$((folder_index))/$r2_file + echo "moved $WORKING_DIR/$r2_file to: batch$((folder_index))/$r2_file" # Increment the counter folder_index=$(( (folder_index % $batch_number) + 1 )) + echo "folder index is now: $folder_index" done # Tar up files per batch for i in $(seq 1 "${batch_number}"); do + echo "tarring $WORKING_DIR/batch${i}/*.fq.gz and outputting: $WORKING_DIR/~{plate_id}.${i}.cutadapt_output_files.tar.gz" tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > $WORKING_DIR/~{plate_id}.${i}.cutadapt_output_files.tar.gz done >>> From 9823190a708a88b044ece0d1beb938beb6f00388 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 12:05:19 -0400 Subject: [PATCH 086/186] add lots of logging to batch logic --- pipelines/skylab/snm3C/snm3C.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index f08eee5596..263aa2a370 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -236,6 +236,7 @@ task Demultiplexing { # Tar up files per batch for i in $(seq 1 "${batch_number}"); do + echo " working on batch: batch${i}" echo "tarring $WORKING_DIR/batch${i}/*.fq.gz and outputting: $WORKING_DIR/~{plate_id}.${i}.cutadapt_output_files.tar.gz" tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > $WORKING_DIR/~{plate_id}.${i}.cutadapt_output_files.tar.gz done From 76ef8e5500ce3b6308b7a6d6bfff5efa7ed899fd Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 13:18:31 -0400 Subject: [PATCH 087/186] change output dir of bams --- pipelines/skylab/snm3C/snm3C.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 263aa2a370..83dc7ded4e 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -404,7 +404,7 @@ task Hisat_paired_end { # call separate_unique_and_multi_align_reads start=$(date +%s) echo "Run 
separate_unique_and_multi_align_reads" - python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$WORKING_DIR"'/"'"$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$WORKING_DIR"'/"'"$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$WORKING_DIR"'/"'"$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run separate_unique_and_multi_align_reads: $elapsed seconds" @@ -412,7 +412,7 @@ task Hisat_paired_end { # call split_hisat3n_unmapped_reads start=$(date +%s) echo "Run split_hisat3n_unmapped_reads" - python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$WORKING_DIR"'/"'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" From 7c28296f398a1be1fa568a061baac57371cdb244 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 14:13:52 -0400 Subject: [PATCH 088/186] fix path of fastq --- 
pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 83dc7ded4e..aaf178b047 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -355,7 +355,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r2" - zcat $batch_dir/batch*/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" + zcat $batch_dir/$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r2: $elapsed seconds" From cf243676a86be7bd846bbfffbef91e2c22510749 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 14:51:51 -0400 Subject: [PATCH 089/186] fix path of fastq --- pipelines/skylab/snm3C/snm3C.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index aaf178b047..24374eae4e 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -404,7 +404,7 @@ task Hisat_paired_end { # call separate_unique_and_multi_align_reads start=$(date +%s) echo "Run separate_unique_and_multi_align_reads" - python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$WORKING_DIR"'/"'"$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$WORKING_DIR"'/"'"$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$WORKING_DIR"'/"'"$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$sample_id"'.hisat3n_dna.unsort.bam", 
out_unique_path="'"$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run separate_unique_and_multi_align_reads: $elapsed seconds" @@ -412,7 +412,7 @@ task Hisat_paired_end { # call split_hisat3n_unmapped_reads start=$(date +%s) echo "Run split_hisat3n_unmapped_reads" - python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$WORKING_DIR"'/"'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" From 4d0f214fd7330a30c1b24e2ad8f1c16f84a148b0 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 14:53:00 -0400 Subject: [PATCH 090/186] fix fastq remove path --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 24374eae4e..470ac66ee5 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -417,7 +417,7 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" - rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/batch*/${sample_id}-R2.fq.gz + rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/${sample_id}-R2.fq.gz rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq rm ${sample_id}-R1_trimmed.fq.gz ${sample_id}-R2_trimmed.fq.gz rm 
${sample_id}.hisat3n_dna.unsort.bam ${sample_id}.hisat3n_dna.multi_aligned.bam From 63ea99a94257cbce22f90a45aa0996a460e067ee Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 14:53:53 -0400 Subject: [PATCH 091/186] add more echo statements --- pipelines/skylab/snm3C/snm3C.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 470ac66ee5..69ad65b3a6 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -416,7 +416,9 @@ task Hisat_paired_end { end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" - + + + echo "removing files now!" rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/${sample_id}-R2.fq.gz rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq rm ${sample_id}-R1_trimmed.fq.gz ${sample_id}-R2_trimmed.fq.gz From 259f13fd5586d5c98e0860774357ddddf5d9033e Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 14:54:57 -0400 Subject: [PATCH 092/186] add more echo statements again --- pipelines/skylab/snm3C/snm3C.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 69ad65b3a6..27ceda7868 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -417,7 +417,6 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" - echo "removing files now!" 
rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/${sample_id}-R2.fq.gz rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq From f19fd649bb76ca6c0dba27cca2653b7bf0e9af4f Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 14:58:03 -0400 Subject: [PATCH 093/186] trigger update of wdl in workpace --- pipelines/skylab/snm3C/snm3C.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 27ceda7868..e2bd3508b3 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -429,8 +429,8 @@ task Hisat_paired_end { R1_files=($(ls $batch_dir | grep "\-R1.fq.gz")) R2_files=($(ls $batch_dir | grep "\-R2.fq.gz")) - echo "r1 files: $R1_files" - echo "r2 files: $R2_files" + echo "Found r1 files: $R1_files" + echo "Found r2 files: $R2_files" # for file in "${R1_files[@]}"; do # ( From 9c9155419cb8f7262b6b6e3d4c1982bb50498e63 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 9 May 2024 15:52:56 -0400 Subject: [PATCH 094/186] remove some logging --- pipelines/skylab/snm3C/snm3C.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index e2bd3508b3..596e77ce49 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -417,7 +417,6 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" - echo "removing files now!" 
rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/${sample_id}-R2.fq.gz rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq rm ${sample_id}-R1_trimmed.fq.gz ${sample_id}-R2_trimmed.fq.gz From 834f00babd52cfaafecc6143e647d243b172d9e1 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 10 May 2024 09:22:45 -0400 Subject: [PATCH 095/186] fix logging error syntax --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 596e77ce49..51e90ad865 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -474,7 +474,7 @@ task Hisat_paired_end { # Check if the count of FASTQ files matches the length of the array ${R1_files[@]} if [ "$fastq_counts" -ne "$((2 * array_length))" ]; then - echo "Error: Number of FASTQ files ($fastq_count) does not match the 2 * length of the array (${#R1_files[@]})." + echo "Error: Number of FASTQ files: $fastq_count does not match the 2 * length of the array: ${#R1_files[@]}." 
exit 1 fi From a82b92eb0b1ccf960bd2b07c889d89212ff48fbd Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 10 May 2024 11:08:27 -0400 Subject: [PATCH 096/186] add missing quote --- pipelines/skylab/snm3C/snm3C.wdl | 91 ++++++++++++++++---------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 51e90ad865..65a7567ea3 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -281,11 +281,14 @@ task Hisat_paired_end { } command <<< - set -euo pipefail + echo "Tar up stats" + start=$(date +%s) + tar -cf - *.trimmed.stats.txt | pigz > ~{plate_id}.trimmed_stats_files.tar.gz + tar -cf - *.hisat3n_set -euo pipefail set -x lscpu WORKING_DIR=`pwd` - + # check genomic reference version and print to output txt file STRING=~{genome_fa} BASE=$(basename $STRING .fa) @@ -295,31 +298,31 @@ task Hisat_paired_end { # untar the index files for hisat task start=$(date +%s) echo "Untarring tarred_index_files" - pigz -dc ~{tarred_index_files} | tar -xf - + pigz -dc ~{tarred_index_files} | tar -xf - rm ~{tarred_index_files} - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to untar tarred_index_files: $elapsed seconds" - + # get the basename of the genome_fa file cp ~{genome_fa} . 
genome_fa_basename=$(basename ~{genome_fa} .fa) - + start=$(date +%s) echo "samtools faidx $genome_fa_basename.fa" samtools faidx $genome_fa_basename.fa - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to samtools faidx: $elapsed seconds" min_read_length=~{min_read_length} - + # untar the demultiplexed fastqs for sort and trim task start=$(date +%s) echo "Untar demultiplexed fastqs" - pigz -dc ~{tarred_demultiplexed_fastqs} | tar -xf - - end=$(date +%s) - elapsed=$((end - start)) + pigz -dc ~{tarred_demultiplexed_fastqs} | tar -xf - + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to untar: $elapsed seconds" echo "lsing current dir:" @@ -343,23 +346,23 @@ task Hisat_paired_end { r2_file="${sample_id}-R2.fq.gz" r1_file="${sample_id}-R1.fq.gz" - - # sort + + # sort start=$(date +%s) echo "Run sort r1" zcat $batch_dir/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run sort r1: $elapsed seconds" - - # sort + + # sort start=$(date +%s) echo "Run sort r2" - zcat $batch_dir/$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" - end=$(date +%s) - elapsed=$((end - start)) + zcat $batch_dir/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run sort r2: $elapsed seconds" - + # trim using cutadapt start=$(date +%s) echo "Run cutadapt" @@ -378,10 +381,10 @@ task Hisat_paired_end { -p $WORKING_DIR/${sample_id}-R2_trimmed.fq.gz \ $WORKING_DIR/${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq \ > $WORKING_DIR/${sample_id}.trimmed.stats.txt - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run cutadapt: $elapsed seconds" - + # hisat run 
start=$(date +%s) echo "Run hisat" @@ -396,25 +399,25 @@ task Hisat_paired_end { -t \ --new-summary \ --summary-file ${sample_id}.hisat3n_dna_summary.txt \ - --threads 8 | samtools view -b -q 0 -o "${sample_id}.hisat3n_dna.unsort.bam" - end=$(date +%s) - elapsed=$((end - start)) + --threads 8 | samtools view -b -q 0 -o "${sample_id}.hisat3n_dna.unsort.bam" + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run hisat: $elapsed seconds" - + # call separate_unique_and_multi_align_reads start=$(date +%s) echo "Run separate_unique_and_multi_align_reads" python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run separate_unique_and_multi_align_reads: $elapsed seconds" - + # call split_hisat3n_unmapped_reads start=$(date +%s) echo "Run split_hisat3n_unmapped_reads" python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/${sample_id}-R2.fq.gz @@ -439,7 +442,7 @@ task Hisat_paired_end { # ) # done - # run 6 instances of task in parallel + # run 6 instances of task in parallel for file in "${R1_files[@]}"; do ( echo "starting task $file.." 
@@ -456,13 +459,13 @@ task Hisat_paired_end { wait echo "Tasks all done." du -h * - - #################################### + + #################################### ## make sure that the number of output bams equals the length of R1_files # Count the number of *.hisat3n_dna.unique_aligned.bam files bam_count=$(find . -maxdepth 1 -type f -name '*.hisat3n_dna.unique_aligned.bam' | wc -l) fastq_counts=$(find . -maxdepth 1 -type f -name '*.split_reads*.fastq' | wc -l) - + # Get the length of the array ${R1_files[@]} array_length=${#R1_files[@]} @@ -471,7 +474,7 @@ task Hisat_paired_end { echo "Error: Number of BAM files does not match the length of the array." exit 1 fi - + # Check if the count of FASTQ files matches the length of the array ${R1_files[@]} if [ "$fastq_counts" -ne "$((2 * array_length))" ]; then echo "Error: Number of FASTQ files: $fastq_count does not match the 2 * length of the array: ${#R1_files[@]}." @@ -479,13 +482,10 @@ task Hisat_paired_end { fi echo "Number of BAM and FASTQ files matches the length of the array." 
- #################################### + #################################### # tar up stats - echo "Tar up stats" - start=$(date +%s) - tar -cf - *.trimmed.stats.txt | pigz > ~{plate_id}.trimmed_stats_files.tar.gz - tar -cf - *.hisat3n_dna_summary.txt | pigz > ~{plate_id}.hisat3n_paired_end_stats_files.tar.gz +dna_summary.txt | pigz > ~{plate_id}.hisat3n_paired_end_stats_files.tar.gz end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run tar stats $elapsed seconds" @@ -505,7 +505,6 @@ task Hisat_paired_end { end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run tar fastqs $elapsed seconds" - >>> runtime { From e0f4863f373263a497d9b69b4f1c7739a9a7b893 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 10 May 2024 12:07:31 -0400 Subject: [PATCH 097/186] change working dir to batch dir --- pipelines/skylab/snm3C/snm3C.wdl | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 65a7567ea3..5e0c226536 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -350,7 +350,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r1" - zcat $batch_dir/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" + zcat $batch_dir/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "$batch_dir/${sample_id}-R1_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r1: $elapsed seconds" @@ -358,7 +358,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r2" - zcat $batch_dir/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" + zcat $batch_dir/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "$batch_dir/${sample_id}-R2_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r2: $elapsed seconds" @@ -377,10 +377,10 @@ task 
Hisat_paired_end { -Z \ -m ${min_read_length}:${min_read_length} \ --pair-filter 'both' \ - -o $WORKING_DIR/${sample_id}-R1_trimmed.fq.gz \ - -p $WORKING_DIR/${sample_id}-R2_trimmed.fq.gz \ - $WORKING_DIR/${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq \ - > $WORKING_DIR/${sample_id}.trimmed.stats.txt + -o $batch_dir/${sample_id}-R1_trimmed.fq.gz \ + -p $batch_dir/${sample_id}-R2_trimmed.fq.gz \ + $batch_dir/${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq \ + > $batch_dir/${sample_id}.trimmed.stats.txt end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run cutadapt: $elapsed seconds" @@ -390,16 +390,16 @@ task Hisat_paired_end { echo "Run hisat" hisat-3n ~{cromwell_root_dir}/$genome_fa_basename \ -q \ - -1 ${sample_id}-R1_trimmed.fq.gz \ - -2 ${sample_id}-R2_trimmed.fq.gz \ + -1 $batch_dir/${sample_id}-R1_trimmed.fq.gz \ + -2 $batch_dir/${sample_id}-R2_trimmed.fq.gz \ --directional-mapping-reverse --base-change C,T \ --no-repeat-index \ --no-spliced-alignment \ --no-temp-splicesite \ -t \ --new-summary \ - --summary-file ${sample_id}.hisat3n_dna_summary.txt \ - --threads 8 | samtools view -b -q 0 -o "${sample_id}.hisat3n_dna.unsort.bam" + --summary-file $batch_dir/${sample_id}.hisat3n_dna_summary.txt \ + --threads 8 | samtools view -b -q 0 -o $batch_dir/"${sample_id}.hisat3n_dna.unsort.bam" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run hisat: $elapsed seconds" @@ -407,7 +407,7 @@ task Hisat_paired_end { # call separate_unique_and_multi_align_reads start=$(date +%s) echo "Run separate_unique_and_multi_align_reads" - python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, 
qlen_cutoff='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$batch_dir/$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$batch_dir/$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$batch_dir/$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$batch_dir/$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run separate_unique_and_multi_align_reads: $elapsed seconds" @@ -415,16 +415,16 @@ task Hisat_paired_end { # call split_hisat3n_unmapped_reads start=$(date +%s) echo "Run split_hisat3n_unmapped_reads" - python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$batch_dir/$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/${sample_id}-R2.fq.gz - rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq - rm ${sample_id}-R1_trimmed.fq.gz ${sample_id}-R2_trimmed.fq.gz - rm ${sample_id}.hisat3n_dna.unsort.bam ${sample_id}.hisat3n_dna.multi_aligned.bam - rm ${sample_id}.hisat3n_dna.unmapped.fastq + rm $batch_dir${sample_id}-R1_sorted.fq $batch_dir${sample_id}-R2_sorted.fq + rm $batch_dir${sample_id}-R1_trimmed.fq.gz $batch_dir${sample_id}-R2_trimmed.fq.gz + rm $batch_dir${sample_id}.hisat3n_dna.unsort.bam $batch_dir${sample_id}.hisat3n_dna.multi_aligned.bam + rm $batch_dir${sample_id}.hisat3n_dna.unmapped.fastq } @@ 
-463,8 +463,8 @@ task Hisat_paired_end { #################################### ## make sure that the number of output bams equals the length of R1_files # Count the number of *.hisat3n_dna.unique_aligned.bam files - bam_count=$(find . -maxdepth 1 -type f -name '*.hisat3n_dna.unique_aligned.bam' | wc -l) - fastq_counts=$(find . -maxdepth 1 -type f -name '*.split_reads*.fastq' | wc -l) + bam_count=$(find $batch_dir -maxdepth 1 -type f -name '*.hisat3n_dna.unique_aligned.bam' | wc -l) + fastq_counts=$(find $batch_dir -maxdepth 1 -type f -name '*.split_reads*.fastq' | wc -l) # Get the length of the array ${R1_files[@]} array_length=${#R1_files[@]} @@ -485,7 +485,7 @@ task Hisat_paired_end { #################################### # tar up stats -dna_summary.txt | pigz > ~{plate_id}.hisat3n_paired_end_stats_files.tar.gz + dna_summary.txt | pigz > ~{plate_id}.hisat3n_paired_end_stats_files.tar.gz end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run tar stats $elapsed seconds" From 15269a1cad34b5e6b48fadfe6633a2c60982d784 Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 10 May 2024 13:46:08 -0400 Subject: [PATCH 098/186] add ls --- tasks/skylab/PairedTagUtils.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/tasks/skylab/PairedTagUtils.wdl b/tasks/skylab/PairedTagUtils.wdl index acc1678a58..8fc4d0d7a1 100644 --- a/tasks/skylab/PairedTagUtils.wdl +++ b/tasks/skylab/PairedTagUtils.wdl @@ -107,6 +107,7 @@ task PairedTagDemultiplex { elif [[ $COUNT == 24 && ~{preindex} == "false" ]] then echo "FASTQ has correct index length, no modification necessary" + ls -lh mv "~{input_id}_R2_prefix.fq.gz" "~{r2_base}.fq.gz" mv "~{input_id}_R1_prefix.fq.gz" "~{r1_base}.fq.gz" mv "~{input_id}_R3_prefix.fq.gz" "~{r3_base}.fq.gz" From 2d4626eb435fff1dcbe7538bf2d700caec775e04 Mon Sep 17 00:00:00 2001 From: John Scira Date: Fri, 10 May 2024 14:03:13 -0400 Subject: [PATCH 099/186] version change --- .../broad/dna_seq/germline/variant_calling/VariantCalling.wdl | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index e703fd99d6..a7fc887109 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -9,7 +9,7 @@ import "../../../../../tasks/broad/DragenTasks.wdl" as DragenTasks workflow VariantCalling { - String pipeline_version = "2.1.19" + String pipeline_version = "2.1.20" input { From 653cb5e05bf97593b872fd5992b8ecfb3fe40bf9 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 10 May 2024 14:17:43 -0400 Subject: [PATCH 100/186] fix paths --- pipelines/skylab/snm3C/snm3C.wdl | 47 ++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 5e0c226536..d7809b1b3c 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -281,10 +281,10 @@ task Hisat_paired_end { } command <<< - echo "Tar up stats" - start=$(date +%s) - tar -cf - *.trimmed.stats.txt | pigz > ~{plate_id}.trimmed_stats_files.tar.gz - tar -cf - *.hisat3n_set -euo pipefail + echo "Tar up stats" + start=$(date +%s) + tar -cf - *.trimmed.stats.txt | pigz > ~{plate_id}.trimmed_stats_files.tar.gz + tar -cf - *.hisat3n_set -euo pipefail set -x lscpu WORKING_DIR=`pwd` @@ -350,7 +350,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r1" - zcat $batch_dir/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "$batch_dir/${sample_id}-R1_sorted.fq" + zcat $batch_dir/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r1: $elapsed seconds" @@ -358,7 +358,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r2" - zcat $batch_dir/"$r2_file" | 
paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "$batch_dir/${sample_id}-R2_sorted.fq" + zcat $batch_dir/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r2: $elapsed seconds" @@ -377,10 +377,10 @@ task Hisat_paired_end { -Z \ -m ${min_read_length}:${min_read_length} \ --pair-filter 'both' \ - -o $batch_dir/${sample_id}-R1_trimmed.fq.gz \ - -p $batch_dir/${sample_id}-R2_trimmed.fq.gz \ - $batch_dir/${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq \ - > $batch_dir/${sample_id}.trimmed.stats.txt + -o $batch_dir${sample_id}-R1_trimmed.fq.gz \ + -p $batch_dir${sample_id}-R2_trimmed.fq.gz \ + $batch_dir${sample_id}-R1_sorted.fq $batch_dir${sample_id}-R2_sorted.fq \ + > $batch_dir${sample_id}.trimmed.stats.txt end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run cutadapt: $elapsed seconds" @@ -390,16 +390,16 @@ task Hisat_paired_end { echo "Run hisat" hisat-3n ~{cromwell_root_dir}/$genome_fa_basename \ -q \ - -1 $batch_dir/${sample_id}-R1_trimmed.fq.gz \ - -2 $batch_dir/${sample_id}-R2_trimmed.fq.gz \ + -1 $batch_dir${sample_id}-R1_trimmed.fq.gz \ + -2 $batch_dir${sample_id}-R2_trimmed.fq.gz \ --directional-mapping-reverse --base-change C,T \ --no-repeat-index \ --no-spliced-alignment \ --no-temp-splicesite \ -t \ --new-summary \ - --summary-file $batch_dir/${sample_id}.hisat3n_dna_summary.txt \ - --threads 8 | samtools view -b -q 0 -o $batch_dir/"${sample_id}.hisat3n_dna.unsort.bam" + --summary-file $batch_dir${sample_id}.hisat3n_dna_summary.txt \ + --threads 8 | samtools view -b -q 0 -o $batch_dir"${sample_id}.hisat3n_dna.unsort.bam" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run hisat: $elapsed seconds" @@ -407,7 +407,7 @@ task Hisat_paired_end { # call separate_unique_and_multi_align_reads start=$(date +%s) echo "Run separate_unique_and_multi_align_reads" - python3 -c 'from cemba_data.hisat3n import 
separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$batch_dir/$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$batch_dir/$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$batch_dir/$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$batch_dir/$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$batch_dir$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$batch_dir$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$batch_dir$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$batch_dir$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run separate_unique_and_multi_align_reads: $elapsed seconds" @@ -415,16 +415,16 @@ task Hisat_paired_end { # call split_hisat3n_unmapped_reads start=$(date +%s) echo "Run split_hisat3n_unmapped_reads" - python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$batch_dir/$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$batch_dir$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run split_hisat3n_unmapped_reads: $elapsed seconds" rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/${sample_id}-R2.fq.gz - rm $batch_dir${sample_id}-R1_sorted.fq $batch_dir${sample_id}-R2_sorted.fq - rm $batch_dir${sample_id}-R1_trimmed.fq.gz $batch_dir${sample_id}-R2_trimmed.fq.gz 
- rm $batch_dir${sample_id}.hisat3n_dna.unsort.bam $batch_dir${sample_id}.hisat3n_dna.multi_aligned.bam - rm $batch_dir${sample_id}.hisat3n_dna.unmapped.fastq + rm $batch_dir/${sample_id}-R1_sorted.fq $batch_dir/${sample_id}-R2_sorted.fq + rm $batch_dir/${sample_id}-R1_trimmed.fq.gz $batch_dir/${sample_id}-R2_trimmed.fq.gz + rm $batch_dir/${sample_id}.hisat3n_dna.unsort.bam $batch_dir/${sample_id}.hisat3n_dna.multi_aligned.bam + rm $batch_dir/${sample_id}.hisat3n_dna.unmapped.fastq } @@ -463,6 +463,13 @@ task Hisat_paired_end { #################################### ## make sure that the number of output bams equals the length of R1_files # Count the number of *.hisat3n_dna.unique_aligned.bam files + echo "lsing batch dir" + ls $batch_dir + echo "ls current dir" + ls + echo "lsing working dir" + echo $WORKING_DIR + bam_count=$(find $batch_dir -maxdepth 1 -type f -name '*.hisat3n_dna.unique_aligned.bam' | wc -l) fastq_counts=$(find $batch_dir -maxdepth 1 -type f -name '*.split_reads*.fastq' | wc -l) From b7e791c18550633c364a95d99bf2df90758f3a0c Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 10 May 2024 15:01:46 -0400 Subject: [PATCH 101/186] fix batch dir --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index d7809b1b3c..45e33b6cfc 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -334,7 +334,7 @@ task Hisat_paired_end { if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" else - batch_dir="~{cromwell_root_dir}/*/*/*/*/*/~{cromwell_root_dir}/*/*/*/*/batch*/" + batch_dir="~{cromwell_root_dir}/*/*/*/*/*~{cromwell_root_dir}/*/*/*/*/batch*/" fi echo "batchdirectory: $batch_dir" From a6fd83b868e51d4ed77093749342ba4fa0e4727c Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 10 May 2024 15:03:33 -0400 Subject: [PATCH 102/186] fix batch dir --- pipelines/skylab/snm3C/snm3C.wdl | 3 +++ 1 file 
changed, 3 insertions(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 45e33b6cfc..75694a4f7d 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -346,6 +346,9 @@ task Hisat_paired_end { r2_file="${sample_id}-R2.fq.gz" r1_file="${sample_id}-R1.fq.gz" + echo "r1 file: $r1_file" + echo "r2 file: $r2_file" + echo "batch dir: $batch_dir" # sort start=$(date +%s) From 81cea6a07174034ff2187e81eb62575cc218f608 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 10 May 2024 16:04:18 -0400 Subject: [PATCH 103/186] fix directories --- pipelines/skylab/snm3C/snm3C.wdl | 48 +++++++++++++++++--------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 75694a4f7d..b1aaa6b998 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -281,13 +281,15 @@ task Hisat_paired_end { } command <<< + WORKING_DIR=`pwd` echo "Tar up stats" + ls -lR start=$(date +%s) - tar -cf - *.trimmed.stats.txt | pigz > ~{plate_id}.trimmed_stats_files.tar.gz - tar -cf - *.hisat3n_set -euo pipefail + tar -cf - $WORKING_DIR/*.trimmed.stats.txt | pigz > ~{plate_id}.trimmed_stats_files.tar.gz + tar -cf - $WORKING_DIR/*.hisat3n_set -euo pipefail set -x lscpu - WORKING_DIR=`pwd` + # check genomic reference version and print to output txt file STRING=~{genome_fa} @@ -349,11 +351,13 @@ task Hisat_paired_end { echo "r1 file: $r1_file" echo "r2 file: $r2_file" echo "batch dir: $batch_dir" + cp $batch_dir/"$r1_file" . + cp $batch_dir/"$r2_file" . 
# sort start=$(date +%s) echo "Run sort r1" - zcat $batch_dir/"$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" + zcat "$r1_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r1: $elapsed seconds" @@ -361,7 +365,7 @@ task Hisat_paired_end { # sort start=$(date +%s) echo "Run sort r2" - zcat $batch_dir/"$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" + zcat "$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run sort r2: $elapsed seconds" @@ -380,10 +384,10 @@ task Hisat_paired_end { -Z \ -m ${min_read_length}:${min_read_length} \ --pair-filter 'both' \ - -o $batch_dir${sample_id}-R1_trimmed.fq.gz \ - -p $batch_dir${sample_id}-R2_trimmed.fq.gz \ - $batch_dir${sample_id}-R1_sorted.fq $batch_dir${sample_id}-R2_sorted.fq \ - > $batch_dir${sample_id}.trimmed.stats.txt + -o ${sample_id}-R1_trimmed.fq.gz \ + -p ${sample_id}-R2_trimmed.fq.gz \ + ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq \ + > ${sample_id}.trimmed.stats.txt end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run cutadapt: $elapsed seconds" @@ -393,16 +397,16 @@ task Hisat_paired_end { echo "Run hisat" hisat-3n ~{cromwell_root_dir}/$genome_fa_basename \ -q \ - -1 $batch_dir${sample_id}-R1_trimmed.fq.gz \ - -2 $batch_dir${sample_id}-R2_trimmed.fq.gz \ + -1 ${sample_id}-R1_trimmed.fq.gz \ + -2 ${sample_id}-R2_trimmed.fq.gz \ --directional-mapping-reverse --base-change C,T \ --no-repeat-index \ --no-spliced-alignment \ --no-temp-splicesite \ -t \ --new-summary \ - --summary-file $batch_dir${sample_id}.hisat3n_dna_summary.txt \ - --threads 8 | samtools view -b -q 0 -o $batch_dir"${sample_id}.hisat3n_dna.unsort.bam" + --summary-file ${sample_id}.hisat3n_dna_summary.txt \ + --threads 8 | samtools view 
-b -q 0 -o "${sample_id}.hisat3n_dna.unsort.bam" end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run hisat: $elapsed seconds" @@ -410,7 +414,7 @@ task Hisat_paired_end { # call separate_unique_and_multi_align_reads start=$(date +%s) echo "Run separate_unique_and_multi_align_reads" - python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$batch_dir$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$batch_dir$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$batch_dir$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$batch_dir$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import separate_unique_and_multi_align_reads;separate_unique_and_multi_align_reads(in_bam_path="'"$sample_id"'.hisat3n_dna.unsort.bam", out_unique_path="'"$sample_id"'.hisat3n_dna.unique_aligned.bam", out_multi_path="'"$sample_id"'.hisat3n_dna.multi_aligned.bam", out_unmappable_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq", unmappable_format="fastq", mapq_cutoff=10, qlen_cutoff='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run separate_unique_and_multi_align_reads: $elapsed seconds" @@ -418,16 +422,16 @@ task Hisat_paired_end { # call split_hisat3n_unmapped_reads start=$(date +%s) echo "Run split_hisat3n_unmapped_reads" - python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$batch_dir$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' + python3 -c 'from cemba_data.hisat3n import *;split_hisat3n_unmapped_reads(fastq_path="'"$sample_id"'.hisat3n_dna.unmapped.fastq",output_prefix="'"$sample_id"'.hisat3n_dna.split_reads",min_length='"$min_read_length"')' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed 
time to run split_hisat3n_unmapped_reads: $elapsed seconds" - rm $batch_dir/${sample_id}-R1.fq.gz $batch_dir/${sample_id}-R2.fq.gz - rm $batch_dir/${sample_id}-R1_sorted.fq $batch_dir/${sample_id}-R2_sorted.fq - rm $batch_dir/${sample_id}-R1_trimmed.fq.gz $batch_dir/${sample_id}-R2_trimmed.fq.gz - rm $batch_dir/${sample_id}.hisat3n_dna.unsort.bam $batch_dir/${sample_id}.hisat3n_dna.multi_aligned.bam - rm $batch_dir/${sample_id}.hisat3n_dna.unmapped.fastq + rm ${sample_id}-R1.fq.gz ${sample_id}-R2.fq.gz + rm ${sample_id}-R1_sorted.fq ${sample_id}-R2_sorted.fq + rm ${sample_id}-R1_trimmed.fq.gz ${sample_id}-R2_trimmed.fq.gz + rm ${sample_id}.hisat3n_dna.unsort.bam ${sample_id}.hisat3n_dna.multi_aligned.bam + rm ${sample_id}.hisat3n_dna.unmapped.fastq } @@ -473,8 +477,8 @@ task Hisat_paired_end { echo "lsing working dir" echo $WORKING_DIR - bam_count=$(find $batch_dir -maxdepth 1 -type f -name '*.hisat3n_dna.unique_aligned.bam' | wc -l) - fastq_counts=$(find $batch_dir -maxdepth 1 -type f -name '*.split_reads*.fastq' | wc -l) + bam_count=$(find . -maxdepth 1 -type f -name '*.hisat3n_dna.unique_aligned.bam' | wc -l) + fastq_counts=$(find . 
-maxdepth 1 -type f -name '*.split_reads*.fastq' | wc -l) # Get the length of the array ${R1_files[@]} array_length=${#R1_files[@]} From 1cbdd46615f40220fb95a8a6529c6a23fd626000 Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 13 May 2024 11:10:48 -0400 Subject: [PATCH 104/186] moving around inputs --- pipelines/skylab/snm3C/snm3C.wdl | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index b1aaa6b998..b13ccd721d 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -281,15 +281,19 @@ task Hisat_paired_end { } command <<< + + WORKING_DIR=`pwd` - echo "Tar up stats" - ls -lR - start=$(date +%s) - tar -cf - $WORKING_DIR/*.trimmed.stats.txt | pigz > ~{plate_id}.trimmed_stats_files.tar.gz - tar -cf - $WORKING_DIR/*.hisat3n_set -euo pipefail - set -x - lscpu + mkdir -p $WORKING_DIR/pipeline_inputs/ + + mv ~{tarred_demultiplexed_fastqs} $WORKING_DIR/pipeline_inputs/ + mv ~{tarred_index_files} $WORKING_DIR/pipeline_inputs/ + mv ~{genome_fa} $WORKING_DIR/pipeline_inputs/ + mv ~{chromosome_sizes} $WORKING_DIR/pipeline_inputs/ + cd $WORKING_DIR/pipeline_inputs/ + + ls -l # check genomic reference version and print to output txt file STRING=~{genome_fa} @@ -297,6 +301,9 @@ task Hisat_paired_end { echo "The reference is $BASE" > ~{plate_id}.reference_version.txt + echo "the path to tarred_index_files is:" + echo ~{tarred_index_files} + # untar the index files for hisat task start=$(date +%s) echo "Untarring tarred_index_files" From ea0458c2613c42690562ed85bbec6774335de811 Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 13 May 2024 11:25:14 -0400 Subject: [PATCH 105/186] remove working dir from name of tar file --- pipelines/skylab/snm3C/snm3C.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index b13ccd721d..3aab7f355f 100644 --- 
a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -237,8 +237,8 @@ task Demultiplexing { # Tar up files per batch for i in $(seq 1 "${batch_number}"); do echo " working on batch: batch${i}" - echo "tarring $WORKING_DIR/batch${i}/*.fq.gz and outputting: $WORKING_DIR/~{plate_id}.${i}.cutadapt_output_files.tar.gz" - tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > $WORKING_DIR/~{plate_id}.${i}.cutadapt_output_files.tar.gz + echo "tarring $WORKING_DIR/batch${i}/*.fq.gz and outputting: ~{plate_id}.${i}.cutadapt_output_files.tar.gz" + tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > ~{plate_id}.${i}.cutadapt_output_files.tar.gz done >>> From 91b4b7c595f2a3bf5d96333719536edb0d500934 Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 13 May 2024 11:48:53 -0400 Subject: [PATCH 106/186] take basenames --- pipelines/skylab/snm3C/snm3C.wdl | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 3aab7f355f..c58c50c4bb 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -301,20 +301,20 @@ task Hisat_paired_end { echo "The reference is $BASE" > ~{plate_id}.reference_version.txt - echo "the path to tarred_index_files is:" - echo ~{tarred_index_files} - # untar the index files for hisat task start=$(date +%s) echo "Untarring tarred_index_files" - pigz -dc ~{tarred_index_files} | tar -xf - - rm ~{tarred_index_files} + + #take the basename of the demultiplexed fastq tar file + index_basename=$(basename ~{tarred_index_files}) + pigz -dc $index_basename | tar -xf - + rm $index_basename + end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar tarred_index_files: $elapsed seconds" # get the basename of the genome_fa file - cp ~{genome_fa} . 
genome_fa_basename=$(basename ~{genome_fa} .fa) start=$(date +%s) @@ -329,21 +329,22 @@ task Hisat_paired_end { # untar the demultiplexed fastqs for sort and trim task start=$(date +%s) echo "Untar demultiplexed fastqs" - pigz -dc ~{tarred_demultiplexed_fastqs} | tar -xf - + #take the basename of the demultiplexed fastq tar file + demultiplexed_basename=$(basename ~{tarred_demultiplexed_fastqs}) + + pigz -dc $demultiplexed_basename | tar -xf - end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar: $elapsed seconds" echo "lsing current dir:" ls -lR - echo "lsing cromwell root:" - ls -lR ~{cromwell_root_dir} # define lists of r1 and r2 fq files if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" else - batch_dir="~{cromwell_root_dir}/*/*/*/*/*~{cromwell_root_dir}/*/*/*/*/batch*/" + batch_dir="~{cromwell_root_dir}/*/*/*/*/batch*/" fi echo "batchdirectory: $batch_dir" From 60274b942d9af13aeaacb1a5c13f0151095d8897 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 13 May 2024 12:27:32 -0400 Subject: [PATCH 107/186] recursively ls the root to find the batch dirs --- pipelines/skylab/snm3C/snm3C.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index c58c50c4bb..49b9d1b0cd 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -281,8 +281,6 @@ task Hisat_paired_end { } command <<< - - WORKING_DIR=`pwd` mkdir -p $WORKING_DIR/pipeline_inputs/ @@ -340,6 +338,9 @@ task Hisat_paired_end { echo "lsing current dir:" ls -lR + echo "lsing root dir:" + ls -lR ~{cromwell_root_dir} + # define lists of r1 and r2 fq files if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" From 5d2c646b146fd74775713d68e3aa95385185827e Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 13 May 2024 13:13:42 -0400 Subject: [PATCH 108/186] add echo statement --- pipelines/skylab/snm3C/snm3C.wdl | 2 ++ 1 file changed, 2 insertions(+) diff 
--git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 49b9d1b0cd..7275720e77 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -328,7 +328,9 @@ task Hisat_paired_end { start=$(date +%s) echo "Untar demultiplexed fastqs" #take the basename of the demultiplexed fastq tar file + demultiplexed_basename=$(basename ~{tarred_demultiplexed_fastqs}) + echo "the basename of the tarred_demultiplexed_fastqs is" pigz -dc $demultiplexed_basename | tar -xf - end=$(date +%s) From f24adf7a62729fceed203a4b4f0abd80de9bad8b Mon Sep 17 00:00:00 2001 From: John Scira Date: Mon, 13 May 2024 14:22:02 -0400 Subject: [PATCH 109/186] added docker parameter for DragenTasks.CalibrateDragstrModel supporting azure --- .../broad/dna_seq/germline/variant_calling/VariantCalling.wdl | 3 ++- tasks/broad/DragenTasks.wdl | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index a7fc887109..31cd1a02ed 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -65,7 +65,8 @@ workflow VariantCalling { ref_dict = ref_dict, alignment = input_bam, alignment_index = input_bam_index, - str_table_file = select_first([ref_str]) + str_table_file = select_first([ref_str]), + docker = gatk_docker } } diff --git a/tasks/broad/DragenTasks.wdl b/tasks/broad/DragenTasks.wdl index 149eb5fd12..7e28b793bd 100644 --- a/tasks/broad/DragenTasks.wdl +++ b/tasks/broad/DragenTasks.wdl @@ -24,7 +24,7 @@ task CalibrateDragstrModel { File str_table_file File alignment ## can handle cram or bam. File alignment_index - String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + String docker Int preemptible_tries = 3 Int threads = 4 Int? 
memory_mb From ab4ac4abd0a38034bc13cb64b4c89481ff6d80d7 Mon Sep 17 00:00:00 2001 From: John Scira Date: Mon, 13 May 2024 14:30:47 -0400 Subject: [PATCH 110/186] added docker parameter for Utils.ScatterIntervalList to support azure --- .../dna_seq/germline/variant_calling/VariantCalling.wdl | 7 ++++++- tasks/broad/Utilities.wdl | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 31cd1a02ed..6cbe58a952 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -44,6 +44,10 @@ workflow VariantCalling { String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure + String picard_cloud_docker_gcp = "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039" + String picard_cloud_docker_azure = "dsppipelinedev.azurecr.io/picard-python:1.0.0-2.26.10-1663951039" + String picard_cloud_docker = if cloud_provider == "gcp" then picard_cloud_docker_gcp else picard_cloud_docker_azure + # make sure either gcp or azr is supplied as cloud_provider input if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { call Utils.ErrorWithMessage as ErrorMessageIncorrectInput { @@ -77,7 +81,8 @@ workflow VariantCalling { input: interval_list = calling_interval_list, scatter_count = haplotype_scatter_count, - break_bands_at_multiples_of = break_bands_at_multiples_of + break_bands_at_multiples_of = break_bands_at_multiples_of, + docker = picard_cloud_docker } # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. 
diff --git a/tasks/broad/Utilities.wdl b/tasks/broad/Utilities.wdl index ce6c101368..3ad524c90d 100644 --- a/tasks/broad/Utilities.wdl +++ b/tasks/broad/Utilities.wdl @@ -79,6 +79,7 @@ task ScatterIntervalList { File interval_list Int scatter_count Int break_bands_at_multiples_of + String docker } command <<< @@ -110,7 +111,7 @@ task ScatterIntervalList { Int interval_count = read_int(stdout()) } runtime { - docker: "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039" + docker: docker memory: "2000 MiB" } } From 0dbf9eace4488bc27000b3847efa81923c63d8ac Mon Sep 17 00:00:00 2001 From: John Scira Date: Mon, 13 May 2024 14:33:16 -0400 Subject: [PATCH 111/186] added docker parameter for Calling.HaplotypeCaller_GATK4_VCF to support azure --- .../broad/dna_seq/germline/variant_calling/VariantCalling.wdl | 3 ++- tasks/broad/GermlineVariantDiscovery.wdl | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 6cbe58a952..eb2dfc6d8a 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -106,7 +106,8 @@ workflow VariantCalling { ref_fasta_index = ref_fasta_index, contamination = contamination, preemptible_tries = agg_preemptible_tries, - hc_scatter = hc_divisor + hc_scatter = hc_divisor, + docker = gatk_docker } } diff --git a/tasks/broad/GermlineVariantDiscovery.wdl b/tasks/broad/GermlineVariantDiscovery.wdl index 7294f2d0b5..3fc8201af0 100644 --- a/tasks/broad/GermlineVariantDiscovery.wdl +++ b/tasks/broad/GermlineVariantDiscovery.wdl @@ -27,6 +27,7 @@ task HaplotypeCaller_GATK35_GVCF { Float? 
contamination Int preemptible_tries Int hc_scatter + String docker } parameter_meta { @@ -66,7 +67,7 @@ task HaplotypeCaller_GATK35_GVCF { --read_filter OverclippedRead } runtime { - docker: "us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384" + docker: docker preemptible: preemptible_tries memory: "10000 MiB" cpu: "1" From 33023643b1abf83bfb3d42e7a3ddc4f84f3868c1 Mon Sep 17 00:00:00 2001 From: John Scira Date: Mon, 13 May 2024 14:34:31 -0400 Subject: [PATCH 112/186] added docker parameter for Calling.HaplotypeCaller_GATK4_VCF to support azure --- .../broad/dna_seq/germline/variant_calling/VariantCalling.wdl | 4 +++- tasks/broad/GermlineVariantDiscovery.wdl | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index eb2dfc6d8a..68be8dba04 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -130,7 +130,9 @@ workflow VariantCalling { use_dragen_hard_filtering = use_dragen_hard_filtering, use_spanning_event_genotyping = use_spanning_event_genotyping, dragstr_model = DragstrAutoCalibration.dragstr_model, - preemptible_tries = agg_preemptible_tries + preemptible_tries = agg_preemptible_tries, + gatk_docker = gatk_docker + } if (use_dragen_hard_filtering) { diff --git a/tasks/broad/GermlineVariantDiscovery.wdl b/tasks/broad/GermlineVariantDiscovery.wdl index 3fc8201af0..1ae34a58dc 100644 --- a/tasks/broad/GermlineVariantDiscovery.wdl +++ b/tasks/broad/GermlineVariantDiscovery.wdl @@ -97,7 +97,7 @@ task HaplotypeCaller_GATK4_VCF { Boolean use_dragen_hard_filtering = false Boolean use_spanning_event_genotyping = true File? 
dragstr_model - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + String gatk_docker Int memory_multiplier = 1 } From 3e350bd07cbfb9f0811e79633203847321996bcb Mon Sep 17 00:00:00 2001 From: John Scira Date: Mon, 13 May 2024 14:39:26 -0400 Subject: [PATCH 113/186] added docker parameter to support azure --- .../germline/variant_calling/VariantCalling.wdl | 12 ++++++++---- tasks/broad/BamProcessing.wdl | 3 ++- tasks/broad/GermlineVariantDiscovery.wdl | 5 +++-- tasks/broad/Qc.wdl | 3 ++- 4 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 68be8dba04..ca0c0de091 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -142,7 +142,8 @@ workflow VariantCalling { input_vcf_index = HaplotypeCallerGATK4.output_vcf_index, make_gvcf = make_gvcf, vcf_basename = base_file_name, - preemptible_tries = agg_preemptible_tries + preemptible_tries = agg_preemptible_tries, + gatk_docker = gatk_docker } } @@ -153,7 +154,8 @@ workflow VariantCalling { input_bam = HaplotypeCallerGATK4.bamout, output_bam_basename = final_vcf_base_name, preemptible_tries = agg_preemptible_tries, - compression_level = 2 + compression_level = 2, + docker = picard_cloud_docker } } } @@ -170,7 +172,8 @@ workflow VariantCalling { input_vcfs = vcfs_to_merge, input_vcfs_indexes = vcf_indices_to_merge, output_vcf_name = final_vcf_base_name + hard_filter_suffix + merge_suffix, - preemptible_tries = agg_preemptible_tries + preemptible_tries = agg_preemptible_tries, + docker = picard_cloud_docker } if (make_gvcf && !skip_reblocking) { @@ -222,7 +225,8 @@ workflow VariantCalling { ref_dict = ref_dict, evaluation_interval_list = evaluation_interval_list, is_gvcf = make_gvcf, - preemptible_tries = agg_preemptible_tries + preemptible_tries = 
agg_preemptible_tries, + docker = picard_cloud_docker } output { diff --git a/tasks/broad/BamProcessing.wdl b/tasks/broad/BamProcessing.wdl index e5ae21039a..13d88c4f5f 100644 --- a/tasks/broad/BamProcessing.wdl +++ b/tasks/broad/BamProcessing.wdl @@ -24,6 +24,7 @@ task SortSam { Int compression_level Int additional_disk = 20 Int memory_multiplier = 1 + String docker } # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier @@ -46,7 +47,7 @@ task SortSam { } runtime { - docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + docker: docker disks: "local-disk " + disk_size + " HDD" cpu: "1" memory: "${machine_mem_mb} MiB" diff --git a/tasks/broad/GermlineVariantDiscovery.wdl b/tasks/broad/GermlineVariantDiscovery.wdl index 1ae34a58dc..d6bcb77298 100644 --- a/tasks/broad/GermlineVariantDiscovery.wdl +++ b/tasks/broad/GermlineVariantDiscovery.wdl @@ -171,6 +171,7 @@ task MergeVCFs { Array[File] input_vcfs_indexes String output_vcf_name Int preemptible_tries = 3 + String docker } Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10 @@ -184,7 +185,7 @@ task MergeVCFs { OUTPUT=~{output_vcf_name} } runtime { - docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + docker: docker preemptible: preemptible_tries memory: "3000 MiB" disks: "local-disk ~{disk_size} HDD" @@ -293,7 +294,7 @@ task DragenHardFilterVcf { Boolean make_gvcf String vcf_basename Int preemptible_tries - String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + String gatk_docker } Int disk_size = ceil(2 * size(input_vcf, "GiB")) + 20 diff --git a/tasks/broad/Qc.wdl b/tasks/broad/Qc.wdl index 58c94f46e9..12d3208d86 100644 --- a/tasks/broad/Qc.wdl +++ b/tasks/broad/Qc.wdl @@ -677,6 +677,7 @@ task CollectVariantCallingMetrics { File evaluation_interval_list Boolean is_gvcf = true Int 
preemptible_tries + String docker } Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB")) + 20 @@ -692,7 +693,7 @@ task CollectVariantCallingMetrics { ~{true="GVCF_INPUT=true" false="" is_gvcf} } runtime { - docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + docker: docker preemptible: preemptible_tries memory: "3000 MiB" disks: "local-disk " + disk_size + " HDD" From ee114955d2c39a990e722f40307b48f93105ad6d Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 13 May 2024 15:15:15 -0400 Subject: [PATCH 114/186] add echo statement --- pipelines/skylab/snm3C/snm3C.wdl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 7275720e77..a940d8fa77 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -281,6 +281,8 @@ task Hisat_paired_end { } command <<< + set -euo pipefail + WORKING_DIR=`pwd` mkdir -p $WORKING_DIR/pipeline_inputs/ @@ -331,6 +333,9 @@ task Hisat_paired_end { demultiplexed_basename=$(basename ~{tarred_demultiplexed_fastqs}) echo "the basename of the tarred_demultiplexed_fastqs is" + echo $demultiplexed_basename + echo "this is the wdl variable path:" + echo ~{tarred_demultiplexed_fastqs} pigz -dc $demultiplexed_basename | tar -xf - end=$(date +%s) From 8b4fe34730fb58db3779876a8bc5a6f792d8d7d2 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 13 May 2024 17:05:53 -0400 Subject: [PATCH 115/186] fix batch dir --- pipelines/skylab/snm3C/snm3C.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index a940d8fa77..049f8033aa 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -353,6 +353,7 @@ task Hisat_paired_end { batch_dir="batch*/" else batch_dir="~{cromwell_root_dir}/*/*/*/*/batch*/" + fi echo "batchdirectory: $batch_dir" From 0ebc00f859bb3f9eaf6d79b70525fdc500d49b3f Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: 
Mon, 13 May 2024 17:38:07 -0400 Subject: [PATCH 116/186] fix batch dir --- pipelines/skylab/snm3C/snm3C.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 049f8033aa..191d21b28f 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -352,8 +352,7 @@ task Hisat_paired_end { if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" else - batch_dir="~{cromwell_root_dir}/*/*/*/*/batch*/" - + batch_dir="~{cromwell_root_dir}/*/*/*/*/*/*/*/*/*/*/*/batch*/" fi echo "batchdirectory: $batch_dir" From ab63a12e21fc705a92045bdb825f030c0d514ed1 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 13 May 2024 19:42:17 -0400 Subject: [PATCH 117/186] fix path to hisat index files --- pipelines/skylab/snm3C/snm3C.wdl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 191d21b28f..82d3651529 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -411,7 +411,14 @@ task Hisat_paired_end { # hisat run start=$(date +%s) echo "Run hisat" - hisat-3n ~{cromwell_root_dir}/$genome_fa_basename \ + if [ ~{cromwell_root_dir} = "gcp" ]; then + hisat_index_file_dir="~{cromwell_root_dir}/$genome_fa_basename" + else + hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" + fi + echo "hisat_index_file_dir: $hisat_index_file_dir" + + hisat-3n $hisat_index_file_dir \ -q \ -1 ${sample_id}-R1_trimmed.fq.gz \ -2 ${sample_id}-R2_trimmed.fq.gz \ From c01f2aa167c6bd3dffacaa809e8c4558370a93c0 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 13 May 2024 20:44:51 -0400 Subject: [PATCH 118/186] add pipline_inputs subdir to location of index files --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 
82d3651529..ebebc74a6f 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -412,7 +412,7 @@ task Hisat_paired_end { start=$(date +%s) echo "Run hisat" if [ ~{cromwell_root_dir} = "gcp" ]; then - hisat_index_file_dir="~{cromwell_root_dir}/$genome_fa_basename" + hisat_index_file_dir="~{cromwell_root_dir}/pipeline_inputs/$genome_fa_basename" else hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" fi From 352c7afc0d512834f2249dad5f68b295834d5c3f Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 13 May 2024 21:09:48 -0400 Subject: [PATCH 119/186] add pipline_inputs subdir to location of index files again --- pipelines/skylab/snm3C/snm3C.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index ebebc74a6f..1f40a2d392 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -412,9 +412,9 @@ task Hisat_paired_end { start=$(date +%s) echo "Run hisat" if [ ~{cromwell_root_dir} = "gcp" ]; then - hisat_index_file_dir="~{cromwell_root_dir}/pipeline_inputs/$genome_fa_basename" + hisat_index_file_dir="~{cromwell_root_dir}/$genome_fa_basename" else - hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" + hisat_index_file_dir="$WORKING_DIR/pipeline_inputs/$genome_fa_basename" fi echo "hisat_index_file_dir: $hisat_index_file_dir" From 797ba497f002972d4eb2bff623209c6d074e4d87 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Mon, 13 May 2024 21:41:26 -0400 Subject: [PATCH 120/186] fix tar command --- pipelines/skylab/snm3C/snm3C.wdl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 1f40a2d392..981c16619d 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -522,7 +522,10 @@ task Hisat_paired_end { #################################### # tar up stats - dna_summary.txt | pigz > 
~{plate_id}.hisat3n_paired_end_stats_files.tar.gz + echo "Tar up stats" + start=$(date +%s) + tar -cf - *.trimmed.stats.txt | pigz > ~{plate_id}.trimmed_stats_files.tar.gz + tar -cf - *.hisat3n_dna_summary.txt | pigz > ~{plate_id}.hisat3n_paired_end_stats_files.tar.gz end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run tar stats $elapsed seconds" From a1a952bed9a6dfb79e503e96546ab24ce7431228 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 09:14:52 -0400 Subject: [PATCH 121/186] some clean up, remove pipeline inputs subdir, edit single end task --- pipelines/skylab/snm3C/snm3C.wdl | 93 ++++++++------------------------ 1 file changed, 21 insertions(+), 72 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 981c16619d..d4c70e2d5c 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -197,14 +197,11 @@ task Demultiplexing { adapter_name = 'A' + adapter_name.group(1) if adapter_name in adapter_counts and adapter_counts[adapter_name] > threshold: os.remove(file_path) - print(f'Removed file: {filename}') CODE # Batch the fastq files into folders of batch_number size batch_number=~{batch_number} - echo "batch number: $batch_number" for i in $(seq 1 "${batch_number}"); do # Use seq for reliable brace expansion - echo "making batch directory: batch${i}" mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs done @@ -215,29 +212,19 @@ task Demultiplexing { # Define lists of r1 and r2 fq files R1_files=($(ls $WORKING_DIR | grep "\-R1.fq.gz")) R2_files=($(ls $WORKING_DIR | grep "\-R2.fq.gz")) - echo "R1 files: $R1_files" - echo "R2 files: $R2_files" # Distribute the FASTQ files and create TAR files - echo "starting loop of files" for file in "${R1_files[@]}"; do sample_id=$(basename "$file" "-R1.fq.gz") - echo "sampleId: $sample_id" r2_file="${sample_id}-R2.fq.gz" - echo "r2 file: $r2_file" mv $WORKING_DIR/$file batch$((folder_index))/$file - 
echo "moved $WORKING_DIR/$file to: batch$((folder_index))/$file" mv $WORKING_DIR/$r2_file batch$((folder_index))/$r2_file - echo "moved $WORKING_DIR/$r2_file to: batch$((folder_index))/$r2_file" # Increment the counter folder_index=$(( (folder_index % $batch_number) + 1 )) - echo "folder index is now: $folder_index" done # Tar up files per batch for i in $(seq 1 "${batch_number}"); do - echo " working on batch: batch${i}" - echo "tarring $WORKING_DIR/batch${i}/*.fq.gz and outputting: ~{plate_id}.${i}.cutadapt_output_files.tar.gz" tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > ~{plate_id}.${i}.cutadapt_output_files.tar.gz done >>> @@ -282,18 +269,7 @@ task Hisat_paired_end { command <<< set -euo pipefail - WORKING_DIR=`pwd` - mkdir -p $WORKING_DIR/pipeline_inputs/ - - mv ~{tarred_demultiplexed_fastqs} $WORKING_DIR/pipeline_inputs/ - mv ~{tarred_index_files} $WORKING_DIR/pipeline_inputs/ - mv ~{genome_fa} $WORKING_DIR/pipeline_inputs/ - mv ~{chromosome_sizes} $WORKING_DIR/pipeline_inputs/ - - cd $WORKING_DIR/pipeline_inputs/ - - ls -l # check genomic reference version and print to output txt file STRING=~{genome_fa} @@ -304,11 +280,8 @@ task Hisat_paired_end { # untar the index files for hisat task start=$(date +%s) echo "Untarring tarred_index_files" - - #take the basename of the demultiplexed fastq tar file - index_basename=$(basename ~{tarred_index_files}) - pigz -dc $index_basename | tar -xf - - rm $index_basename + pigz -dc ~{tarred_index_files} | tar -xf - + rm ~{tarred_index_files} end=$(date +%s) elapsed=$((end - start)) @@ -329,33 +302,17 @@ task Hisat_paired_end { # untar the demultiplexed fastqs for sort and trim task start=$(date +%s) echo "Untar demultiplexed fastqs" - #take the basename of the demultiplexed fastq tar file - - demultiplexed_basename=$(basename ~{tarred_demultiplexed_fastqs}) - echo "the basename of the tarred_demultiplexed_fastqs is" - echo $demultiplexed_basename - echo "this is the wdl variable path:" - echo 
~{tarred_demultiplexed_fastqs} - - pigz -dc $demultiplexed_basename | tar -xf - + pigz -dc ~{tarred_demultiplexed_fastqs} | tar -xf - end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar: $elapsed seconds" - echo "lsing current dir:" - ls -lR - - echo "lsing root dir:" - ls -lR ~{cromwell_root_dir} - # define lists of r1 and r2 fq files if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" else batch_dir="~{cromwell_root_dir}/*/*/*/*/*/*/*/*/*/*/*/batch*/" fi - echo "batchdirectory: $batch_dir" - task() { local file=$1 @@ -364,9 +321,6 @@ task Hisat_paired_end { r2_file="${sample_id}-R2.fq.gz" r1_file="${sample_id}-R1.fq.gz" - echo "r1 file: $r1_file" - echo "r2 file: $r2_file" - echo "batch dir: $batch_dir" cp $batch_dir/"$r1_file" . cp $batch_dir/"$r2_file" . @@ -414,9 +368,8 @@ task Hisat_paired_end { if [ ~{cromwell_root_dir} = "gcp" ]; then hisat_index_file_dir="~{cromwell_root_dir}/$genome_fa_basename" else - hisat_index_file_dir="$WORKING_DIR/pipeline_inputs/$genome_fa_basename" + hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" fi - echo "hisat_index_file_dir: $hisat_index_file_dir" hisat-3n $hisat_index_file_dir \ -q \ @@ -461,17 +414,6 @@ task Hisat_paired_end { R1_files=($(ls $batch_dir | grep "\-R1.fq.gz")) R2_files=($(ls $batch_dir | grep "\-R2.fq.gz")) - echo "Found r1 files: $R1_files" - echo "Found r2 files: $R2_files" - - # for file in "${R1_files[@]}"; do - # ( - # echo "starting task $file.." - # du -h batch*/$file - # task "$file" - # ) - # done - # run 6 instances of task in parallel for file in "${R1_files[@]}"; do ( @@ -493,13 +435,6 @@ task Hisat_paired_end { #################################### ## make sure that the number of output bams equals the length of R1_files # Count the number of *.hisat3n_dna.unique_aligned.bam files - echo "lsing batch dir" - ls $batch_dir - echo "ls current dir" - ls - echo "lsing working dir" - echo $WORKING_DIR - bam_count=$(find . 
-maxdepth 1 -type f -name '*.hisat3n_dna.unique_aligned.bam' | wc -l) fastq_counts=$(find . -maxdepth 1 -type f -name '*.split_reads*.fastq' | wc -l) @@ -585,6 +520,7 @@ task Hisat_single_end { set -euo pipefail set -x lscpu + WORKING_DIR=`pwd` # untar the tarred index files echo "Untar tarred_index_files" @@ -623,16 +559,27 @@ task Hisat_single_end { R1_files=($(ls | grep "\.hisat3n_dna.split_reads.R1.fastq")) R2_files=($(ls | grep "\.hisat3n_dna.split_reads.R2.fastq")) + echo "Found R1 files: $R1_files" + echo "Found R2 files: $R2_files" + + task() { BASE=$(basename "$file" ".hisat3n_dna.split_reads.R1.fastq") echo $BASE echo "Running hisat on sample_id_R1" $BASE echo "Hisat 3n R1" - start=$(date +%s) + start=$(date +%s) + + if [ ~{cromwell_root_dir} = "gcp" ]; then + hisat_index_file_dir="~{cromwell_root_dir}/$genome_fa_basename" + else + hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" + fi + # hisat on R1 single end - hisat-3n ~{cromwell_root_dir}/$genome_fa_basename \ + hisat-3n $hisat_index_file_dir \ -q \ -U ${BASE}.hisat3n_dna.split_reads.R1.fastq \ -S ${BASE}.hisat3n_dna.split_reads.R1.sam --directional-mapping-reverse --base-change C,T \ @@ -654,7 +601,7 @@ task Hisat_single_end { echo "Running hisat on sample_id_R2" $BASE # hisat on R2 single end - hisat-3n ~{cromwell_root_dir}/$genome_fa_basename \ + hisat-3n $hisat_index_file_dir \ -q \ -U ${BASE}.hisat3n_dna.split_reads.R2.fastq \ -S ${BASE}.hisat3n_dna.split_reads.R2.sam --directional-mapping --base-change C,T \ @@ -695,6 +642,8 @@ task Hisat_single_end { echo "Elapsed time to run samtools -q 10 $elapsed seconds" # remove_overlap_read_parts + echo "recusively ls cromwell root" + ls -lR ~{cromwell_root_dir} echo "call remove_overlap_read_parts" start=$(date +%s) python3 -c 'from cemba_data.hisat3n import *;import 
os;remove_overlap_read_parts(in_bam_path=os.path.join(os.path.sep,~{cromwell_root_dir},"'"$BASE"'.name_sorted.filtered.bam"),out_bam_path=os.path.join(os.path.sep,~{cromwell_root_dir},"'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam"))' From 5162ae40125d70fcc3460e5fd7538d130d3d0a5c Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 10:16:49 -0400 Subject: [PATCH 122/186] copy fa file again --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index d4c70e2d5c..5866bdb24d 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -282,12 +282,12 @@ task Hisat_paired_end { echo "Untarring tarred_index_files" pigz -dc ~{tarred_index_files} | tar -xf - rm ~{tarred_index_files} - end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to untar tarred_index_files: $elapsed seconds" # get the basename of the genome_fa file + cp ~{genome_fa} . 
genome_fa_basename=$(basename ~{genome_fa} .fa) start=$(date +%s) From 33b28b7e47634430a593cbb6161f8d210c5ff2f3 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 14 May 2024 10:41:38 -0400 Subject: [PATCH 123/186] rename some inputs for paired tag demultiplexing --- tasks/skylab/PairedTagUtils.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tasks/skylab/PairedTagUtils.wdl b/tasks/skylab/PairedTagUtils.wdl index 8fc4d0d7a1..0fe171b60e 100644 --- a/tasks/skylab/PairedTagUtils.wdl +++ b/tasks/skylab/PairedTagUtils.wdl @@ -108,9 +108,9 @@ task PairedTagDemultiplex { then echo "FASTQ has correct index length, no modification necessary" ls -lh - mv "~{input_id}_R2_prefix.fq.gz" "~{r2_base}.fq.gz" - mv "~{input_id}_R1_prefix.fq.gz" "~{r1_base}.fq.gz" - mv "~{input_id}_R3_prefix.fq.gz" "~{r3_base}.fq.gz" + mv "~{input_id}_R2.fq.gz" "~{r2_base}.fq.gz" + mv "~{input_id}_R1.fq.gz" "~{r1_base}.fq.gz" + mv "~{input_id}_R3.fq.gz" "~{r3_base}.fq.gz" elif [[ $COUNT == 24 && ~{preindex} == "true" ]] then pass="false" From c0dd6265a78cc5211f307c15953c8cb25e7e5b6a Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 14 May 2024 11:05:09 -0400 Subject: [PATCH 124/186] rename some intermediate inputs for paired tag demultiplexing --- tasks/skylab/PairedTagUtils.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tasks/skylab/PairedTagUtils.wdl b/tasks/skylab/PairedTagUtils.wdl index a34b3c3187..ca5b6cf885 100644 --- a/tasks/skylab/PairedTagUtils.wdl +++ b/tasks/skylab/PairedTagUtils.wdl @@ -106,9 +106,9 @@ task PairedTagDemultiplex { elif [[ $COUNT == 24 && ~{preindex} == "false" ]] then echo "FASTQ has correct index length, no modification necessary" - mv "~{input_id}_R2_prefix.fq.gz" "~{r2_base}.fq.gz" - mv "~{input_id}_R1_prefix.fq.gz" "~{r1_base}.fq.gz" - mv "~{input_id}_R3_prefix.fq.gz" "~{r3_base}.fq.gz" + mv "~{input_id}_R2.fq.gz" "~{r2_base}.fq.gz" + mv "~{input_id}_R1.fq.gz" "~{r1_base}.fq.gz" + mv "~{input_id}_R3.fq.gz" 
"~{r3_base}.fq.gz" elif [[ $COUNT == 24 && ~{preindex} == "true" ]] then pass="false" From a2ebd1b59914a1ef324a349db5d3b2da83176f8d Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 11:34:39 -0400 Subject: [PATCH 125/186] list cromwell root --- pipelines/skylab/snm3C/snm3C.wdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 5866bdb24d..bd0a46e169 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -307,6 +307,9 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to untar: $elapsed seconds" + echo "recursively list cromwell roo" + ls -lR ~{cromwell_root_dir} + # define lists of r1 and r2 fq files if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" From 9704a24644d7a649434f767f97ac28b261853e49 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 12:05:37 -0400 Subject: [PATCH 126/186] fix batch dir --- pipelines/skylab/snm3C/snm3C.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index bd0a46e169..7aca66de5f 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -314,7 +314,8 @@ task Hisat_paired_end { if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" else - batch_dir="~{cromwell_root_dir}/*/*/*/*/*/*/*/*/*/*/*/batch*/" + batch_dir="~{cromwell_root_dir}/*/*/*/*/*~{cromwell_root_dir}/*/*/*/*/batch*/" + fi task() { From aff8a9ca458c6a08cf7e1f228898f469573959f7 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 13:30:01 -0400 Subject: [PATCH 127/186] add conditional for input and output bams --- pipelines/skylab/snm3C/snm3C.wdl | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 7aca66de5f..e339e276b7 100644 --- 
a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -307,15 +307,11 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to untar: $elapsed seconds" - echo "recursively list cromwell roo" - ls -lR ~{cromwell_root_dir} - # define lists of r1 and r2 fq files if [ ~{cromwell_root_dir} = "gcp" ]; then batch_dir="batch*/" else batch_dir="~{cromwell_root_dir}/*/*/*/*/*~{cromwell_root_dir}/*/*/*/*/batch*/" - fi task() { @@ -648,9 +644,21 @@ task Hisat_single_end { # remove_overlap_read_parts echo "recusively ls cromwell root" ls -lR ~{cromwell_root_dir} + + if [ ~{cromwell_root_dir} = "gcp" ]; then + filtered_bam_path="~{cromwell_root_dir}/$BASE.name_sorted.filtered.bam" + read_overlap_bam_path="~{cromwell_root_dir}/$BASE.hisat3n_dna.split_reads.read_overlap.bam" + else + filtered_bam_path="$WORKING_DIR/$BASE.name_sorted.filtered.bam" + read_overlap_bam_path="$WORKING_DIR/$BASE.hisat3n_dna.split_reads.read_overlap.bam" + fi + + echo "filtered bam path: $filtered_bam_path" + echo "read overlap bam path: $read_overlap_bam_path" + echo "call remove_overlap_read_parts" start=$(date +%s) - python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path=os.path.join(os.path.sep,~{cromwell_root_dir},"'"$BASE"'.name_sorted.filtered.bam"),out_bam_path=os.path.join(os.path.sep,~{cromwell_root_dir},"'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam"))' + python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path=$filtered_bam_path,out_bam_path=$read_overlap_bam_path)' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run remove overlap $elapsed seconds" @@ -846,7 +854,7 @@ task Merge_sort_analyze { echo "Elapsed time to samtools index $elapsed seconds" start=$(date +%s) - echo "Call chromatin contacts from name sorted bams" + echo "Call chromatin contacts from name sorted bams" python3 -c 'from cemba_data.hisat3n import *;import os;import 
glob;call_chromatin_contacts(bam_path="'"$sample_id"'.hisat3n_dna.all_reads.name_sort.bam",contact_prefix="'"$sample_id"'.hisat3n_dna.all_reads",save_raw=False,save_hic_format=True)' end=$(date +%s) elapsed=$((end - start)) From f5ef9ad9cbd37bcf28e54aa59d0574248fce8821 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 14:03:14 -0400 Subject: [PATCH 128/186] fix logic for cloud provider vs. crowell root dir --- pipelines/skylab/snm3C/snm3C.wdl | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index e339e276b7..5f8195bd02 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -72,7 +72,8 @@ workflow snm3C { r2_right_cut = r2_right_cut, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir + cromwell_root_dir = cromwell_root_dir, + cloud_provider = cloud_provider, } call Hisat_single_end as Hisat_single_end { @@ -82,7 +83,8 @@ workflow snm3C { genome_fa = genome_fa, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir + cromwell_root_dir = cromwell_root_dir, + cloud_provider = cloud_provider } call Merge_sort_analyze as Merge_sort_analyze { @@ -252,6 +254,7 @@ task Hisat_paired_end { String plate_id String docker String cromwell_root_dir + String cloud_provider String r1_adapter String r2_adapter @@ -308,7 +311,7 @@ task Hisat_paired_end { echo "Elapsed time to untar: $elapsed seconds" # define lists of r1 and r2 fq files - if [ ~{cromwell_root_dir} = "gcp" ]; then + if [ ~{cloud_provider} = "gcp" ]; then batch_dir="batch*/" else batch_dir="~{cromwell_root_dir}/*/*/*/*/*~{cromwell_root_dir}/*/*/*/*/batch*/" @@ -365,7 +368,7 @@ task Hisat_paired_end { # hisat run start=$(date +%s) echo "Run hisat" - if [ ~{cromwell_root_dir} = "gcp" ]; then + if [ ~{cloud_provider} = "gcp" ]; then 
hisat_index_file_dir="~{cromwell_root_dir}/$genome_fa_basename" else hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" @@ -508,6 +511,7 @@ task Hisat_single_end { String plate_id String docker String cromwell_root_dir + String cloud_provider Int disk_size = 1000 Int mem_size = 64 @@ -571,7 +575,7 @@ task Hisat_single_end { echo "Hisat 3n R1" start=$(date +%s) - if [ ~{cromwell_root_dir} = "gcp" ]; then + if [ ~{cloud_provider} = "gcp" ]; then hisat_index_file_dir="~{cromwell_root_dir}/$genome_fa_basename" else hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" @@ -645,7 +649,7 @@ task Hisat_single_end { echo "recusively ls cromwell root" ls -lR ~{cromwell_root_dir} - if [ ~{cromwell_root_dir} = "gcp" ]; then + if [ ~{cloud_provider} = "gcp" ]; then filtered_bam_path="~{cromwell_root_dir}/$BASE.name_sorted.filtered.bam" read_overlap_bam_path="~{cromwell_root_dir}/$BASE.hisat3n_dna.split_reads.read_overlap.bam" else From e12283aee0a1f58e7d5f3cdce61b8c506fd6aaef Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 14:12:05 -0400 Subject: [PATCH 129/186] add ls --- pipelines/skylab/snm3C/snm3C.wdl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 5f8195bd02..60910bf37d 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -310,9 +310,12 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to untar: $elapsed seconds" + echo "lsing cromwell root dir" + ls -lR ~{cromwell_root_dir} + # define lists of r1 and r2 fq files if [ ~{cloud_provider} = "gcp" ]; then - batch_dir="batch*/" + batch_dir="~{cromwell_root_dir}/batch*/" else batch_dir="~{cromwell_root_dir}/*/*/*/*/*~{cromwell_root_dir}/*/*/*/*/batch*/" fi From c6a12161edba87f32a41b61177384f91092719e6 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 14:22:29 -0400 Subject: [PATCH 130/186] fix batch dir in gcp --- 
pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 60910bf37d..5a9e4ffb8f 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -315,7 +315,7 @@ task Hisat_paired_end { # define lists of r1 and r2 fq files if [ ~{cloud_provider} = "gcp" ]; then - batch_dir="~{cromwell_root_dir}/batch*/" + batch_dir="~{cromwell_root_dir}~{cromwell_root_dir}/batch*/" else batch_dir="~{cromwell_root_dir}/*/*/*/*/*~{cromwell_root_dir}/*/*/*/*/batch*/" fi From 106347500d8b30645d7d5f94f8e759d6be6d63eb Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 14:33:51 -0400 Subject: [PATCH 131/186] fix sytaxt error --- pipelines/skylab/snm3C/snm3C.wdl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 5a9e4ffb8f..ae9bf86051 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -653,11 +653,9 @@ task Hisat_single_end { ls -lR ~{cromwell_root_dir} if [ ~{cloud_provider} = "gcp" ]; then - filtered_bam_path="~{cromwell_root_dir}/$BASE.name_sorted.filtered.bam" - read_overlap_bam_path="~{cromwell_root_dir}/$BASE.hisat3n_dna.split_reads.read_overlap.bam" + bam_path_prefix="~{cromwell_root_dir}" else - filtered_bam_path="$WORKING_DIR/$BASE.name_sorted.filtered.bam" - read_overlap_bam_path="$WORKING_DIR/$BASE.hisat3n_dna.split_reads.read_overlap.bam" + bam_path_prefix=$WORKING_DIR fi echo "filtered bam path: $filtered_bam_path" @@ -665,7 +663,7 @@ task Hisat_single_end { echo "call remove_overlap_read_parts" start=$(date +%s) - python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path=$filtered_bam_path,out_bam_path=$read_overlap_bam_path)' + python3 -c 'from cemba_data.hisat3n import *;import 
os;remove_overlap_read_parts(in_bam_path=os.path.join(os.path.sep,'"$bam_path_prefix"',"'"$BASE"'.name_sorted.filtered.bam"),out_bam_path=os.path.join(os.path.sep,"$bam_path_prefix","'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam"))' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run remove overlap $elapsed seconds" From fefba082c98c2e79181096a967612ddfaef5dd12 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 15:39:18 -0400 Subject: [PATCH 132/186] ugh remove echo statements referencing variable that doesnt exist anymore --- pipelines/skylab/snm3C/snm3C.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index ae9bf86051..07ad57de59 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -658,8 +658,7 @@ task Hisat_single_end { bam_path_prefix=$WORKING_DIR fi - echo "filtered bam path: $filtered_bam_path" - echo "read overlap bam path: $read_overlap_bam_path" + echo "bam_path_prefix $bam_path_prefix" echo "call remove_overlap_read_parts" start=$(date +%s) From 30a6b9368dba835903fe5c96f76bc5f1bbd90885 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 15:53:06 -0400 Subject: [PATCH 133/186] attempt to fix syntax error --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 07ad57de59..918d3fb3bc 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -662,7 +662,7 @@ task Hisat_single_end { echo "call remove_overlap_read_parts" start=$(date +%s) - python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path=os.path.join(os.path.sep,'"$bam_path_prefix"',"'"$BASE"'.name_sorted.filtered.bam"),out_bam_path=os.path.join(os.path.sep,"$bam_path_prefix","'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam"))' + python3 
-c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path="'"$BASE"'.name_sorted.filtered.bam",out_bam_path="'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam")' end=$(date +%s) elapsed=$((end - start)) echo "Elapsed time to run remove overlap $elapsed seconds" From ebb3c39381932e25350a7aafcb34a325716869a1 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Tue, 14 May 2024 16:31:20 -0400 Subject: [PATCH 134/186] add missing forward slash --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 918d3fb3bc..554a5f4ad0 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -812,7 +812,7 @@ task Merge_sort_analyze { # make directories mkdir ~{cromwell_root_dir}/output_bams mkdir ~{cromwell_root_dir}temp - mkdir ~{cromwell_root_dir}allc-${mcg_context} + mkdir ~{cromwell_root_dir}/allc-${mcg_context} task() { local file=$1 From 36a2ce931028754aac4423d3535b8230bf986b60 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Wed, 15 May 2024 10:16:10 -0400 Subject: [PATCH 135/186] fix directories, add ls commands --- pipelines/skylab/snm3C/snm3C.wdl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 554a5f4ad0..58005e9f64 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -811,7 +811,7 @@ task Merge_sort_analyze { # make directories mkdir ~{cromwell_root_dir}/output_bams - mkdir ~{cromwell_root_dir}temp + mkdir ~{cromwell_root_dir}/temp mkdir ~{cromwell_root_dir}/allc-${mcg_context} task() { @@ -864,6 +864,9 @@ task Merge_sort_analyze { elapsed=$((end - start)) echo "Elapsed time to chromatin contacts $elapsed seconds" + echo "recursively ls cromwell root" + ls -lR ~{cromwell_root_dir} + start=$(date +%s) echo "Call allcools bam-to-allc from deduped.bams" 
/opt/conda/bin/allcools bam-to-allc \ @@ -936,8 +939,11 @@ task Merge_sort_analyze { echo "Number of output files matches the length of the array." #################################### + echo "recursively ls'sing cromwell root again" + ls -lR ~{cromwell_root_dir} + echo "Tar files." - tar -cf - output_bams/*.matrix.txt | pigz > ~{plate_id}.dedup_unique_bam_and_index_unique_bam_stats.tar.gz + tar -cf - ~{cromwell_root_dir}/output_bams/*.matrix.txt | pigz > ~{plate_id}.dedup_unique_bam_and_index_unique_bam_stats.tar.gz tar -cf - *.hisat3n_dna.all_reads.name_sort.bam | pigz > ~{plate_id}.hisat3n_dna.all_reads.name_sort.tar.gz # tar outputs of call_chromatin_contacts @@ -949,7 +955,7 @@ task Merge_sort_analyze { tar -cf - *.allc.tsv.gz | pigz > ~{plate_id}.allc.tsv.tar.gz tar -cf - *.allc.tsv.gz.tbi | pigz > ~{plate_id}.allc.tbi.tar.gz tar -cf - *.allc.tsv.gz.count.csv | pigz > ~{plate_id}.allc.count.tar.gz - tar -cf - ~{cromwell_root_dir}allc-${mcg_context}/*.gz | pigz > ~{plate_id}.extract-allc.tar.gz + tar -cf - ~{cromwell_root_dir}/allc-${mcg_context}/*.gz | pigz > ~{plate_id}.extract-allc.tar.gz tar -cf - ~{cromwell_root_dir}/allc-${mcg_context}/*.tbi | pigz > ~{plate_id}.extract-allc_tbi.tar.gz >>> From 18ad64cff0f16ff42b242ccede18f8849843b051 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Wed, 15 May 2024 13:12:36 -0400 Subject: [PATCH 136/186] add more ls commands --- pipelines/skylab/snm3C/snm3C.wdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 58005e9f64..f5a350abd2 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -1004,6 +1004,9 @@ task Summary { command <<< set -euo pipefail + echo "recursively ls'sing cromwell root in summary task" + ls -lR ~{cromwell_root_dir} + mkdir ~{cromwell_root_dir}/fastq mkdir ~{cromwell_root_dir}/bam mkdir ~{cromwell_root_dir}/allc From cd4f876f255e99ed76a72156bdde3e0b963739cb Mon Sep 17 00:00:00 2001 From: 
Nareh Sahakian Date: Wed, 15 May 2024 13:13:33 -0400 Subject: [PATCH 137/186] add more ls commands --- pipelines/skylab/snm3C/snm3C.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index f5a350abd2..f18714036c 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -1004,6 +1004,7 @@ task Summary { command <<< set -euo pipefail + echo "recursively ls'sing cromwell root in summary task" ls -lR ~{cromwell_root_dir} From 6b2889e152f28dfc87221807de8f35a4a45270f8 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Wed, 15 May 2024 13:13:44 -0400 Subject: [PATCH 138/186] add more ls commands again --- pipelines/skylab/snm3C/snm3C.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index f18714036c..f5a350abd2 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -1004,7 +1004,6 @@ task Summary { command <<< set -euo pipefail - echo "recursively ls'sing cromwell root in summary task" ls -lR ~{cromwell_root_dir} From 2c6d56b23ed2d9e8bd19a4a16b5add327a4d3f5f Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Wed, 15 May 2024 15:39:46 -0400 Subject: [PATCH 139/186] fix fasta file path --- pipelines/skylab/snm3C/snm3C.wdl | 262 ++++++++++++++++--------------- 1 file changed, 137 insertions(+), 125 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index f5a350abd2..ce858a19a9 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -98,7 +98,8 @@ workflow snm3C { chromosome_sizes = chromosome_sizes, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir + cromwell_root_dir = cromwell_root_dir, + cloud_provider = cloud_provider } } @@ -467,24 +468,24 @@ task Hisat_paired_end { start=$(date +%s) tar -cf - *.trimmed.stats.txt | pigz > 
~{plate_id}.trimmed_stats_files.tar.gz tar -cf - *.hisat3n_dna_summary.txt | pigz > ~{plate_id}.hisat3n_paired_end_stats_files.tar.gz - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run tar stats $elapsed seconds" # tar up the uniqe bams echo "Tar up unique bams" start=$(date +%s) tar -cf - *.hisat3n_dna.unique_aligned.bam | pigz > ~{plate_id}.hisat3n_paired_end_unique_bam_files.tar.gz - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run tar unique bams $elapsed seconds" # tar up the split fastq files echo "Tar up fastqs" start=$(date +%s) tar -cf - *.split_reads*.fastq | pigz > ~{plate_id}.hisat3n_paired_end_split_fastq_files.tar.gz - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run tar fastqs $elapsed seconds" >>> @@ -516,11 +517,11 @@ task Hisat_single_end { String cromwell_root_dir String cloud_provider - Int disk_size = 1000 - Int mem_size = 64 + Int disk_size = 1000 + Int mem_size = 64 Int cpu = 32 Int preemptible_tries = 2 - String cpu_platform = "Intel Ice Lake" + String cpu_platform = "Intel Ice Lake" } command <<< @@ -528,40 +529,40 @@ task Hisat_single_end { set -x lscpu WORKING_DIR=`pwd` - + # untar the tarred index files echo "Untar tarred_index_files" - start=$(date +%s) - pigz -dc ~{tarred_index_files} | tar -xf - + start=$(date +%s) + pigz -dc ~{tarred_index_files} | tar -xf - rm ~{tarred_index_files} - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to untar tarred_index_files: $elapsed seconds" - + cp ~{genome_fa} . 
#get the basename of the genome_fa file echo "samtools faidx" - start=$(date +%s) + start=$(date +%s) genome_fa_basename=$(basename ~{genome_fa} .fa) samtools faidx $genome_fa_basename.fa - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to samtools faidx: $elapsed seconds" - + # untar the unmapped fastq files echo "Untar split_fq_tar" - start=$(date +%s) - pigz -dc ~{split_fq_tar} | tar -xf - + start=$(date +%s) + pigz -dc ~{split_fq_tar} | tar -xf - rm ~{split_fq_tar} - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to untar split_fq_tar: $elapsed seconds" - # make directories + # make directories mkdir -p ~{cromwell_root_dir}/merged_sort_bams mkdir -p ~{cromwell_root_dir}/read_overlap - + # define lists of r1 and r2 fq files R1_files=($(ls | grep "\.hisat3n_dna.split_reads.R1.fastq")) R2_files=($(ls | grep "\.hisat3n_dna.split_reads.R2.fastq")) @@ -574,8 +575,8 @@ task Hisat_single_end { BASE=$(basename "$file" ".hisat3n_dna.split_reads.R1.fastq") echo $BASE echo "Running hisat on sample_id_R1" $BASE - - echo "Hisat 3n R1" + + echo "Hisat 3n R1" start=$(date +%s) if [ ~{cloud_provider} = "gcp" ]; then @@ -584,7 +585,7 @@ task Hisat_single_end { hisat_index_file_dir="$WORKING_DIR/$genome_fa_basename" fi - + # hisat on R1 single end hisat-3n $hisat_index_file_dir \ -q \ @@ -596,15 +597,15 @@ task Hisat_single_end { -t \ --new-summary \ --summary-file ${BASE}.hisat3n_dna_split_reads_summary.R1.txt \ - --threads 8 - - end=$(date +%s) - elapsed=$((end - start)) + --threads 8 + + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run $elapsed seconds" echo "Finish running hisat on sample_id_R1" $BASE - - echo "Hisat 3n R2" - start=$(date +%s) + + echo "Hisat 3n R2" + start=$(date +%s) echo "Running hisat on sample_id_R2" $BASE # hisat on R2 single end @@ -619,33 +620,33 @@ task Hisat_single_end { --summary-file 
${BASE}.hisat3n_dna_split_reads_summary.R2.txt \ --threads 8 - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run $elapsed seconds" echo "Finish running hisat on sample_id_R2" $BASE - + # samtools merge - echo "samtools merge R1 and R2" - start=$(date +%s) + echo "samtools merge R1 and R2" + start=$(date +%s) samtools merge -o ${BASE}.name_merged.sam ${BASE}.hisat3n_dna.split_reads.R1.sam ${BASE}.hisat3n_dna.split_reads.R2.sam -@8 - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run samtools merge $elapsed seconds" - - # samtools sort - echo "samtools sort R1 and R2" - start=$(date +%s) + + # samtools sort + echo "samtools sort R1 and R2" + start=$(date +%s) samtools sort -n -@8 -m1g ${BASE}.name_merged.sam -o ${BASE}.name_sorted.bam - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run samtools sort $elapsed seconds" # samtools filter bam - echo "samtools -q 10" - start=$(date +%s) + echo "samtools -q 10" + start=$(date +%s) samtools view -q 10 ${BASE}.name_sorted.bam -o ${BASE}.name_sorted.filtered.bam - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run samtools -q 10 $elapsed seconds" # remove_overlap_read_parts @@ -660,14 +661,14 @@ task Hisat_single_end { echo "bam_path_prefix $bam_path_prefix" - echo "call remove_overlap_read_parts" - start=$(date +%s) + echo "call remove_overlap_read_parts" + start=$(date +%s) python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path="'"$BASE"'.name_sorted.filtered.bam",out_bam_path="'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam")' - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run remove overlap $elapsed seconds" - - # remove files + + # remove files rm 
${BASE}.hisat3n_dna.split_reads.R1.fastq ${BASE}.hisat3n_dna.split_reads.R2.fastq rm ${BASE}.hisat3n_dna.split_reads.R1.sam ${BASE}.hisat3n_dna.split_reads.R2.sam rm ${BASE}.name_merged.sam @@ -695,10 +696,10 @@ task Hisat_single_end { ## make sure that the number of output bams equals the length of R1_files # Count the number of bam files bam_count=$(find . -maxdepth 1 -type f -name '*read_overlap.bam' | wc -l) - + # Get the length of the array ${R1_files[@]} array_length=${#R1_files[@]} - + # Check if the count of bams matches the length of the array ${R1_files[@]} if [ "$bam_count" -ne "$array_length" ]; then echo "Error: Number of BAM files does not match the length of the array." @@ -712,16 +713,16 @@ task Hisat_single_end { # tar up the r1 and r2 stats files -p to set number of threads tar -cf - *.hisat3n_dna_split_reads_summary.R1.txt | pigz > ~{plate_id}.hisat3n_dna_split_reads_summary.R1.tar.gz tar -cf - *.hisat3n_dna_split_reads_summary.R2.txt | pigz > ~{plate_id}.hisat3n_dna_split_reads_summary.R2.tar.gz - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run tar summary text files $elapsed seconds" - + # tar up read overlap files echo "Tar up read_overlap bams" start=$(date +%s) tar -cf - *read_overlap.bam | pigz > ~{plate_id}.remove_overlap_read_parts.tar.gz - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to tar read_overlap bams $elapsed seconds" >>> @@ -738,10 +739,10 @@ task Hisat_single_end { File hisat3n_dna_split_reads_summary_R1_tar = "~{plate_id}.hisat3n_dna_split_reads_summary.R1.tar.gz" File hisat3n_dna_split_reads_summary_R2_tar = "~{plate_id}.hisat3n_dna_split_reads_summary.R2.tar.gz" File remove_overlaps_output_bam_tar = "~{plate_id}.remove_overlap_read_parts.tar.gz" - + } } - + task Merge_sort_analyze { input { String plate_id @@ -749,6 +750,7 @@ task Merge_sort_analyze { File read_overlap_tar String docker String 
cromwell_root_dir + String cloud_provider #input for allcools bam-to-allc File genome_fa @@ -769,33 +771,35 @@ task Merge_sort_analyze { set -euo pipefail set -x lscpu - + + WORKING_DIR=`pwd` + # unzip tars echo "Untar paired_end_unique_tar" - start=$(date +%s) - pigz -dc ~{paired_end_unique_tar} | tar -xf - + start=$(date +%s) + pigz -dc ~{paired_end_unique_tar} | tar -xf - rm ~{paired_end_unique_tar} - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to untar paired_end_unique_tar: $elapsed seconds" echo "Untar read_overlap_tar" - start=$(date +%s) - pigz -dc ~{read_overlap_tar} | tar -xf - + start=$(date +%s) + pigz -dc ~{read_overlap_tar} | tar -xf - rm ~{read_overlap_tar} - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to untar read_overlap_tar: $elapsed seconds" - - # reference and index - start=$(date +%s) + + # reference and index + start=$(date +%s) echo "Reference and index fasta" mkdir reference cp ~{genome_fa} reference ls reference samtools faidx reference/*.fa - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to index fasta $elapsed seconds" # define lists of r1 and r2 fq files @@ -813,94 +817,102 @@ task Merge_sort_analyze { mkdir ~{cromwell_root_dir}/output_bams mkdir ~{cromwell_root_dir}/temp mkdir ~{cromwell_root_dir}/allc-${mcg_context} - + task() { local file=$1 sample_id=$(basename "$file" ".hisat3n_dna.unique_aligned.bam") echo $sample_id - start=$(date +%s) + start=$(date +%s) echo "Merge all unique_aligned and read_overlap" samtools merge -f "${sample_id}.hisat3n_dna.all_reads.bam" "${sample_id}.hisat3n_dna.unique_aligned.bam" "${sample_id}.hisat3n_dna.split_reads.read_overlap.bam" -@4 - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run merge $elapsed seconds" - start=$(date +%s) + start=$(date +%s) echo 
"Sort all reads by name" - samtools sort -n -@4 -m1g -o "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" "${sample_id}.hisat3n_dna.all_reads.bam" - end=$(date +%s) - elapsed=$((end - start)) + samtools sort -n -@4 -m1g -o "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" "${sample_id}.hisat3n_dna.all_reads.bam" + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run sort by name $elapsed seconds" - - start=$(date +%s) + + start=$(date +%s) echo "Sort all reads by position" - samtools sort -O BAM -@4 -m1g -o "${sample_id}.hisat3n_dna.all_reads.pos_sort.bam" "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" - end=$(date +%s) - elapsed=$((end - start)) + samtools sort -O BAM -@4 -m1g -o "${sample_id}.hisat3n_dna.all_reads.pos_sort.bam" "${sample_id}.hisat3n_dna.all_reads.name_sort.bam" + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run sort by pos $elapsed seconds" - - start=$(date +%s) + + start=$(date +%s) echo "Call Picard remove duplicates" name=${sample_id}.hisat3n_dna.all_reads.deduped picard MarkDuplicates I=${sample_id}.hisat3n_dna.all_reads.pos_sort.bam O=~{cromwell_root_dir}/output_bams/${name}.bam \ M=~{cromwell_root_dir}/output_bams/${name}.matrix.txt \ REMOVE_DUPLICATES=true TMP_DIR=~{cromwell_root_dir}/temp - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to run picard $elapsed seconds" - - start=$(date +%s) + + start=$(date +%s) echo "Call samtools index" samtools index ~{cromwell_root_dir}/output_bams/${name}.bam - end=$(date +%s) - elapsed=$((end - start)) - echo "Elapsed time to samtools index $elapsed seconds" - - start=$(date +%s) + end=$(date +%s) + elapsed=$((end - start)) + echo "Elapsed time to samtools index $elapsed seconds" + + start=$(date +%s) echo "Call chromatin contacts from name sorted bams" python3 -c 'from cemba_data.hisat3n import *;import os;import 
glob;call_chromatin_contacts(bam_path="'"$sample_id"'.hisat3n_dna.all_reads.name_sort.bam",contact_prefix="'"$sample_id"'.hisat3n_dna.all_reads",save_raw=False,save_hic_format=True)' - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to chromatin contacts $elapsed seconds" echo "recursively ls cromwell root" ls -lR ~{cromwell_root_dir} - start=$(date +%s) - echo "Call allcools bam-to-allc from deduped.bams" + if [ ~{cloud_provider} = "gcp" ]; then + reference_fasta="~{cromwell_root_dir}/reference/~{genome_base}" + else + reference_fasta="$WORKING_DIR/reference/~{genome_base}" + fi + + echo "reference fast location: $reference_fasta" + + start=$(date +%s) + echo "Call allcools bam-to-allc from deduped.bams" /opt/conda/bin/allcools bam-to-allc \ --bam_path ~{cromwell_root_dir}/output_bams/${name}.bam \ - --reference_fasta ~{cromwell_root_dir}/reference/~{genome_base} \ + --reference_fasta $reference_fasta \ --output_path "${sample_id}.allc.tsv.gz" \ --num_upstr_bases ~{num_upstr_bases} \ --num_downstr_bases ~{num_downstr_bases} \ --compress_level ~{compress_level} \ --save_count_df \ --convert_bam_strandness - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to allcools bam-to-allc $elapsed seconds" - start=$(date +%s) - echo "Call allcools extract-all" + start=$(date +%s) + echo "Call allcools extract-all" allcools extract-allc --strandness merge \ --allc_path ${sample_id}.allc.tsv.gz \ --output_prefix ~{cromwell_root_dir}/allc-${mcg_context}/${sample_id} \ --mc_contexts ${mcg_context} \ --chrom_size_path ~{chromosome_sizes} - end=$(date +%s) - elapsed=$((end - start)) + end=$(date +%s) + elapsed=$((end - start)) echo "Elapsed time to allcools extract-all $elapsed seconds" - + echo "Remove some bams" rm ${sample_id}.hisat3n_dna.all_reads.bam rm ${sample_id}.hisat3n_dna.all_reads.pos_sort.bam rm 
~{cromwell_root_dir}/${sample_id}.hisat3n_dna.split_reads.read_overlap.bam rm ~{cromwell_root_dir}/${sample_id}.hisat3n_dna.unique_aligned.bam } - - # run 4 instances of task in parallel + + # run 4 instances of task in parallel for file in "${UNIQUE_BAMS[@]}"; do ( echo "starting task $file.." @@ -922,7 +934,7 @@ task Merge_sort_analyze { # Count the number of *.hisat3n_dna.unique_aligned.bam files bam_count=$(find . -maxdepth 1 -type f -name '*.hisat3n_dna.all_reads.name_sort.bam' | wc -l) contact_count=$(find . -maxdepth 1 -type f -name '*.hisat3n_dna.all_reads.3C.contact.tsv.gz' | wc -l) - + # Get the length of the array ${UNIQUE_BAMS[@]} array_length=${#UNIQUE_BAMS[@]} @@ -942,15 +954,15 @@ task Merge_sort_analyze { echo "recursively ls'sing cromwell root again" ls -lR ~{cromwell_root_dir} - echo "Tar files." + echo "Tar files." tar -cf - ~{cromwell_root_dir}/output_bams/*.matrix.txt | pigz > ~{plate_id}.dedup_unique_bam_and_index_unique_bam_stats.tar.gz tar -cf - *.hisat3n_dna.all_reads.name_sort.bam | pigz > ~{plate_id}.hisat3n_dna.all_reads.name_sort.tar.gz - + # tar outputs of call_chromatin_contacts tar -cf - *.hisat3n_dna.all_reads.3C.contact.tsv.gz | pigz > ~{plate_id}.hisat3n_dna.all_reads.3C.contact.tar.gz tar -cf - *.hisat3n_dna.all_reads.dedup_contacts.tsv.gz | pigz > ~{plate_id}.hisat3n_dna.all_reads.dedup_contacts.tar.gz tar -cf - *.hisat3n_dna.all_reads.contact_stats.csv | pigz > ~{plate_id}.chromatin_contact_stats.tar.gz - + # tar outputs of allcools tar -cf - *.allc.tsv.gz | pigz > ~{plate_id}.allc.tsv.tar.gz tar -cf - *.allc.tsv.gz.tbi | pigz > ~{plate_id}.allc.tbi.tar.gz @@ -967,7 +979,7 @@ task Merge_sort_analyze { cpuPlatform: cpu_platform preemptible: preemptible_tries } - + output { File allc = "~{plate_id}.allc.tsv.tar.gz" File tbi = "~{plate_id}.allc.tbi.tar.gz" From edf0460b9024bee59454e50de4d07d13c874a4c3 Mon Sep 17 00:00:00 2001 From: John Scira Date: Wed, 15 May 2024 15:52:20 -0400 Subject: [PATCH 140/186] changing docker inputs to 
allow for tests to pass against pipelines that have not yet been azurized --- .../germline/variant_calling/VariantCalling.wdl | 13 +++++++++---- tasks/broad/BamProcessing.wdl | 3 ++- tasks/broad/DragenTasks.wdl | 3 ++- tasks/broad/GermlineVariantDiscovery.wdl | 5 +++-- tasks/broad/Qc.wdl | 6 ++++-- tasks/broad/Utilities.wdl | 3 ++- 6 files changed, 22 insertions(+), 11 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index ca0c0de091..049ab2836d 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -44,9 +44,14 @@ workflow VariantCalling { String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure - String picard_cloud_docker_gcp = "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039" - String picard_cloud_docker_azure = "dsppipelinedev.azurecr.io/picard-python:1.0.0-2.26.10-1663951039" + String picard_python_docker_gcp = "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039" + String picard_python_docker_azure = "dsppipelinedev.azurecr.io/picard-python:1.0.0-2.26.10-1663951039" + String picard_python_docker = if cloud_provider == "gcp" then picard_python_docker_gcp else picard_python_docker_azure + + String picard_cloud_docker_gcp = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + String picard_cloud_docker_azure = "dsppipelinedev.azurecr.io/picard-cloud:2.26.10" String picard_cloud_docker = if cloud_provider == "gcp" then picard_cloud_docker_gcp else picard_cloud_docker_azure + # make sure either gcp or azr is supplied as cloud_provider input if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { @@ -82,7 +87,7 @@ workflow VariantCalling { interval_list = calling_interval_list, scatter_count = 
haplotype_scatter_count, break_bands_at_multiples_of = break_bands_at_multiples_of, - docker = picard_cloud_docker + docker = picard_python_docker } # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. @@ -210,7 +215,7 @@ workflow VariantCalling { calling_interval_list = calling_interval_list, is_gvcf = make_gvcf, extra_args = if (skip_reblocking == false) then "--no-overlaps" else "", - docker_path = gatk_docker, + docker_path = picard_cloud_docker, preemptible_tries = agg_preemptible_tries } diff --git a/tasks/broad/BamProcessing.wdl b/tasks/broad/BamProcessing.wdl index 13d88c4f5f..cf4ff4d4e2 100644 --- a/tasks/broad/BamProcessing.wdl +++ b/tasks/broad/BamProcessing.wdl @@ -24,7 +24,8 @@ task SortSam { Int compression_level Int additional_disk = 20 Int memory_multiplier = 1 - String docker + #Setting default docker value for workflows that haven't yet been azurized. + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" } # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs # more disk space. Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier diff --git a/tasks/broad/DragenTasks.wdl b/tasks/broad/DragenTasks.wdl index 7e28b793bd..95b27f2fcc 100644 --- a/tasks/broad/DragenTasks.wdl +++ b/tasks/broad/DragenTasks.wdl @@ -24,7 +24,8 @@ task CalibrateDragstrModel { File str_table_file File alignment ## can handle cram or bam. File alignment_index - String docker + #Setting default docker value for workflows that haven't yet been azurized. + String docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int preemptible_tries = 3 Int threads = 4 Int? 
memory_mb diff --git a/tasks/broad/GermlineVariantDiscovery.wdl b/tasks/broad/GermlineVariantDiscovery.wdl index d6bcb77298..fb224e2fe7 100644 --- a/tasks/broad/GermlineVariantDiscovery.wdl +++ b/tasks/broad/GermlineVariantDiscovery.wdl @@ -27,7 +27,8 @@ task HaplotypeCaller_GATK35_GVCF { Float? contamination Int preemptible_tries Int hc_scatter - String docker + #Setting default docker value for workflows that haven't yet been azurized. + String docker = "us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384" } parameter_meta { @@ -97,7 +98,7 @@ task HaplotypeCaller_GATK4_VCF { Boolean use_dragen_hard_filtering = false Boolean use_spanning_event_genotyping = true File? dragstr_model - String gatk_docker + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int memory_multiplier = 1 } diff --git a/tasks/broad/Qc.wdl b/tasks/broad/Qc.wdl index 12d3208d86..56a37e7973 100644 --- a/tasks/broad/Qc.wdl +++ b/tasks/broad/Qc.wdl @@ -622,7 +622,8 @@ task ValidateVCF { Int preemptible_tries = 3 Boolean is_gvcf = true String? extra_args - String docker_path + #Setting default docker value for workflows that haven't yet been azurized. + String docker_path = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" Int machine_mem_mb = 7000 } @@ -677,7 +678,8 @@ task CollectVariantCallingMetrics { File evaluation_interval_list Boolean is_gvcf = true Int preemptible_tries - String docker + #Setting default docker value for workflows that haven't yet been azurized. + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" } Int disk_size = ceil(size(input_vcf, "GiB") + size(dbsnp_vcf, "GiB")) + 20 diff --git a/tasks/broad/Utilities.wdl b/tasks/broad/Utilities.wdl index 3ad524c90d..e6a1aeec17 100644 --- a/tasks/broad/Utilities.wdl +++ b/tasks/broad/Utilities.wdl @@ -79,7 +79,8 @@ task ScatterIntervalList { File interval_list Int scatter_count Int break_bands_at_multiples_of - String docker + #Setting default docker value for workflows that haven't yet been azurized. 
+ String docker = "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039" } command <<< From 429b63576128dc470773fbc2b5e603a93625b6be Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Wed, 15 May 2024 16:58:41 -0400 Subject: [PATCH 141/186] add more logging, put in cromwell root for move rather than local dir --- pipelines/skylab/snm3C/snm3C.wdl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index ce858a19a9..c4fc7bdf83 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -1016,8 +1016,11 @@ task Summary { command <<< set -euo pipefail - echo "recursively ls'sing cromwell root in summary task" + echo "recursively lsing cromwell root in summary task" ls -lR ~{cromwell_root_dir} + echo "lsing current dir" + ls -lrt + mkdir ~{cromwell_root_dir}/fastq mkdir ~{cromwell_root_dir}/bam @@ -1045,9 +1048,12 @@ task Summary { extract_and_remove ~{sep=' ' allc_uniq_reads_stats} extract_and_remove ~{sep=' ' unique_reads_cgn_extraction_tbi} + echo "lsing cromwell root again" + ls -lrt ~{cromwell_root_dir} + mv *.trimmed.stats.txt ~{cromwell_root_dir}/fastq mv *.hisat3n_dna_summary.txt *.hisat3n_dna_split_reads_summary.R1.txt *.hisat3n_dna_split_reads_summary.R2.txt ~{cromwell_root_dir}/bam - mv output_bams/*.hisat3n_dna.all_reads.deduped.matrix.txt ~{cromwell_root_dir}/bam + mv ~{cromwell_root_dir}/output_bams/*.hisat3n_dna.all_reads.deduped.matrix.txt ~{cromwell_root_dir}/bam mv *.hisat3n_dna.all_reads.contact_stats.csv ~{cromwell_root_dir}/hic mv *.allc.tsv.gz.count.csv ~{cromwell_root_dir}/allc mv ~{cromwell_root_dir}/allc-CGN/*.allc.tsv.gz.tbi ~{cromwell_root_dir}/allc From 2c3a6c5072a205261f3eee75f5600f414b275310 Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 16 May 2024 09:46:29 -0400 Subject: [PATCH 142/186] fix to docker task inputs to allow default values --- tasks/broad/GermlineVariantDiscovery.wdl | 4 +++- 1 file 
changed, 3 insertions(+), 1 deletion(-) diff --git a/tasks/broad/GermlineVariantDiscovery.wdl b/tasks/broad/GermlineVariantDiscovery.wdl index fb224e2fe7..d6b6e55cc3 100644 --- a/tasks/broad/GermlineVariantDiscovery.wdl +++ b/tasks/broad/GermlineVariantDiscovery.wdl @@ -98,6 +98,7 @@ task HaplotypeCaller_GATK4_VCF { Boolean use_dragen_hard_filtering = false Boolean use_spanning_event_genotyping = true File? dragstr_model + #Setting default docker value for workflows that haven't yet been azurized. String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int memory_multiplier = 1 } @@ -172,7 +173,8 @@ task MergeVCFs { Array[File] input_vcfs_indexes String output_vcf_name Int preemptible_tries = 3 - String docker + #Setting default docker value for workflows that haven't yet been azurized. + String docker = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" } Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10 From ca4867a1581338ae7c05f50fc85001265fee71c7 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 16 May 2024 11:10:14 -0400 Subject: [PATCH 143/186] add more logging, add more ls, try tarring files in current dir rather than previous subdir --- pipelines/skylab/snm3C/snm3C.wdl | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index c4fc7bdf83..898bfdf012 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -1034,7 +1034,9 @@ task Summary { return fi for tar in "${@}"; do - tar -xf "$tar" + echo "unstarring this file now: $tar" + tar -xfv "$tar" + echo "removing this tar file now: $tar" rm "$tar" done } @@ -1049,14 +1051,17 @@ task Summary { extract_and_remove ~{sep=' ' unique_reads_cgn_extraction_tbi} echo "lsing cromwell root again" - ls -lrt ~{cromwell_root_dir} + ls -lRt ~{cromwell_root_dir} + + echo "lsing current directory again" + ls -lRt mv *.trimmed.stats.txt ~{cromwell_root_dir}/fastq mv *.hisat3n_dna_summary.txt 
*.hisat3n_dna_split_reads_summary.R1.txt *.hisat3n_dna_split_reads_summary.R2.txt ~{cromwell_root_dir}/bam - mv ~{cromwell_root_dir}/output_bams/*.hisat3n_dna.all_reads.deduped.matrix.txt ~{cromwell_root_dir}/bam + mv *.hisat3n_dna.all_reads.deduped.matrix.txt ~{cromwell_root_dir}/bam mv *.hisat3n_dna.all_reads.contact_stats.csv ~{cromwell_root_dir}/hic mv *.allc.tsv.gz.count.csv ~{cromwell_root_dir}/allc - mv ~{cromwell_root_dir}/allc-CGN/*.allc.tsv.gz.tbi ~{cromwell_root_dir}/allc + mv *.allc.tsv.gz.tbi ~{cromwell_root_dir}/allc python3 -c 'from cemba_data.hisat3n import *;snm3c_summary()' mv MappingSummary.csv.gz ~{plate_id}_MappingSummary.csv.gz From 0a4ac85a8e3840760cd48bb86b1823ba7fa6bf9a Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 16 May 2024 12:18:38 -0400 Subject: [PATCH 144/186] use correct syntax for verbose untarring --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 898bfdf012..36eddce6eb 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -1035,7 +1035,7 @@ task Summary { fi for tar in "${@}"; do echo "unstarring this file now: $tar" - tar -xfv "$tar" + tar -xvf "$tar" echo "removing this tar file now: $tar" rm "$tar" done From b6ea9a5b423b58d8122b952e553b8f4f7928d3b8 Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 16 May 2024 14:59:20 -0400 Subject: [PATCH 145/186] fix to docker version --- .../broad/dna_seq/germline/variant_calling/VariantCalling.wdl | 2 +- tasks/broad/Qc.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 049ab2836d..00d40172f8 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -215,7 
+215,7 @@ workflow VariantCalling { calling_interval_list = calling_interval_list, is_gvcf = make_gvcf, extra_args = if (skip_reblocking == false) then "--no-overlaps" else "", - docker_path = picard_cloud_docker, + docker_path = gatk_docker, preemptible_tries = agg_preemptible_tries } diff --git a/tasks/broad/Qc.wdl b/tasks/broad/Qc.wdl index 56a37e7973..847a9b4683 100644 --- a/tasks/broad/Qc.wdl +++ b/tasks/broad/Qc.wdl @@ -623,7 +623,7 @@ task ValidateVCF { Boolean is_gvcf = true String? extra_args #Setting default docker value for workflows that haven't yet been azurized. - String docker_path = "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + String docker_path = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int machine_mem_mb = 7000 } From 0db052e84e2022044ef2459a7962400a160049df Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Thu, 16 May 2024 16:21:27 -0400 Subject: [PATCH 146/186] set correct base dirs that are nested --- pipelines/skylab/snm3C/snm3C.wdl | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 36eddce6eb..87aa1ecf43 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -115,7 +115,8 @@ workflow snm3C { unique_reads_cgn_extraction_tbi = Merge_sort_analyze.extract_allc_output_tbi_tar, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir + cromwell_root_dir = cromwell_root_dir, + cloud_provider = cloud_provider } meta { @@ -1006,6 +1007,7 @@ task Summary { Array[File] unique_reads_cgn_extraction_tbi String plate_id String cromwell_root_dir + String cloud_provider String docker Int disk_size = 80 @@ -1056,12 +1058,24 @@ task Summary { echo "lsing current directory again" ls -lRt + WORKING_DIR=`pwd` + + if [ ~{cloud_provider} = "gcp" ]; then + matrix_files_dir="~{cromwell_root_dir}~{cromwell_root_dir}/output_bams" + 
allc_index_dir="~{cromwell_root_dir}~{cromwell_root_dir}/allc-*" + else + matrix_files_dir="$WORKING_DIR~{cromwell_root_dir}/output_bams" + allc_index_dir="$WORKING_DIR~{cromwell_root_dir}/allc-*" + fi + echo "matrix files dir: $matrix_files_dir" + echo "allc_index_dir: $allc_index_dir" + mv *.trimmed.stats.txt ~{cromwell_root_dir}/fastq mv *.hisat3n_dna_summary.txt *.hisat3n_dna_split_reads_summary.R1.txt *.hisat3n_dna_split_reads_summary.R2.txt ~{cromwell_root_dir}/bam - mv *.hisat3n_dna.all_reads.deduped.matrix.txt ~{cromwell_root_dir}/bam + mv $matrix_files_dir/*.hisat3n_dna.all_reads.deduped.matrix.txt ~{cromwell_root_dir}/bam mv *.hisat3n_dna.all_reads.contact_stats.csv ~{cromwell_root_dir}/hic mv *.allc.tsv.gz.count.csv ~{cromwell_root_dir}/allc - mv *.allc.tsv.gz.tbi ~{cromwell_root_dir}/allc + mv $allc_index_dir/*.allc.tsv.gz.tbi ~{cromwell_root_dir}/allc python3 -c 'from cemba_data.hisat3n import *;snm3c_summary()' mv MappingSummary.csv.gz ~{plate_id}_MappingSummary.csv.gz From 8878e2f525227d97f1a6808d0f845b56bb1bbc8a Mon Sep 17 00:00:00 2001 From: John Scira Date: Fri, 17 May 2024 12:00:02 -0400 Subject: [PATCH 147/186] change to how cromwell root dir is set --- pipelines/skylab/snm3C/snm3C.wdl | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 689bba7849..076e8f1486 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -72,7 +72,6 @@ workflow snm3C { r2_right_cut = r2_right_cut, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir } call Hisat_single_end as Hisat_single_end { @@ -82,7 +81,6 @@ workflow snm3C { genome_fa = genome_fa, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir } call Merge_sort_analyze as Merge_sort_analyze { @@ -252,7 +250,6 @@ task Hisat_paired_end { File chromosome_sizes String 
plate_id String docker - String cromwell_root_dir String r1_adapter String r2_adapter @@ -267,7 +264,10 @@ task Hisat_paired_end { Int preemptible_tries = 2 String cpu_platform = "Intel Ice Lake" } - + + cromwell_root_dir=$(pwd) + batch_dir=$cromwell_root_dir/batch* + command <<< set -euo pipefail set -x @@ -315,13 +315,6 @@ task Hisat_paired_end { echo "lsing cromwell root:" ls -lR ~{cromwell_root_dir} - # define lists of r1 and r2 fq files - if [ ~{cromwell_root_dir} = "gcp" ]; then - batch_dir="batch*/" - else - batch_dir="~{cromwell_root_dir}/*/*/*/*/*/~{cromwell_root_dir}/*/*/*/*/batch*/" - fi - echo "batchdirectory: $batch_dir" task() { From 5d6aaf4a083828d7f402960b8d766e4a7daf9ffe Mon Sep 17 00:00:00 2001 From: John Scira Date: Fri, 17 May 2024 12:03:29 -0400 Subject: [PATCH 148/186] correction to docker image being used --- .../dna_seq/germline/variant_calling/VariantCalling.wdl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 00d40172f8..a7a53b444a 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -43,6 +43,10 @@ workflow VariantCalling { String gatk_docker_gcp = "us.gcr.io/broad-gatk/gatk:4.5.0.0" String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure + + String gatk_1_3_docker_gcp = us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384 + String gatk_1_3_docker_azure = us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384 + String gatk_1_3_docker if cloud_provider == "gcp" then gatk_1_3_docker_gcp else gatk_1_3_docker_azure String picard_python_docker_gcp = "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039" String picard_python_docker_azure = 
"dsppipelinedev.azurecr.io/picard-python:1.0.0-2.26.10-1663951039" @@ -112,7 +116,7 @@ workflow VariantCalling { contamination = contamination, preemptible_tries = agg_preemptible_tries, hc_scatter = hc_divisor, - docker = gatk_docker + docker = gatk_1_3_docker } } From 2fe6f9230c46ec4e87294e9ea1f43316b9a9ecff Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 17 May 2024 13:57:23 -0400 Subject: [PATCH 149/186] add debugging --- pipelines/skylab/snm3C/snm3C.wdl | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 87aa1ecf43..96cdb69e03 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -1077,7 +1077,33 @@ task Summary { mv *.allc.tsv.gz.count.csv ~{cromwell_root_dir}/allc mv $allc_index_dir/*.allc.tsv.gz.tbi ~{cromwell_root_dir}/allc - python3 -c 'from cemba_data.hisat3n import *;snm3c_summary()' + cwd=`pwd` + echo "current working dir is: $cwd" + + + python3 <>> From ff3e6b27bec5b7c259311d002c6a1abb2c7b63de Mon Sep 17 00:00:00 2001 From: John Scira Date: Fri, 17 May 2024 14:17:17 -0400 Subject: [PATCH 150/186] formatting fix --- .../dna_seq/germline/variant_calling/VariantCalling.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index a7a53b444a..98b9fb77ee 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -44,9 +44,9 @@ workflow VariantCalling { String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure - String gatk_1_3_docker_gcp = us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384 - String gatk_1_3_docker_azure = 
us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384 - String gatk_1_3_docker if cloud_provider == "gcp" then gatk_1_3_docker_gcp else gatk_1_3_docker_azure + String gatk_1_3_docker_gcp = "us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384" + String gatk_1_3_docker_azure = "us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384" + String gatk_1_3_docker = if cloud_provider == "gcp" then gatk_1_3_docker_gcp else gatk_1_3_docker_azure String picard_python_docker_gcp = "us.gcr.io/broad-gotc-prod/picard-python:1.0.0-2.26.10-1663951039" String picard_python_docker_azure = "dsppipelinedev.azurecr.io/picard-python:1.0.0-2.26.10-1663951039" From 45de2a335e82d3a45246d47b406c843ba19b7415 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Fri, 17 May 2024 16:07:38 -0400 Subject: [PATCH 151/186] add logic for base directory based on gcp vs azure --- pipelines/skylab/snm3C/snm3C.wdl | 52 +++++++++++++++++--------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 96cdb69e03..0f206de50c 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -1023,11 +1023,26 @@ task Summary { echo "lsing current dir" ls -lrt + WORKING_DIR=`pwd` + + if [ ~{cloud_provider} = "gcp" ]; then + base_directory=~{cromwell_root_dir} + matrix_files_dir="~{cromwell_root_dir}~{cromwell_root_dir}/output_bams" + allc_index_dir="~{cromwell_root_dir}~{cromwell_root_dir}/allc-*" + else + base_directory=$WORKING_DIR + matrix_files_dir="$WORKING_DIR~{cromwell_root_dir}/output_bams" + allc_index_dir="$WORKING_DIR~{cromwell_root_dir}/allc-*" + fi + echo "matrix files dir: $matrix_files_dir" + echo "allc_index_dir: $allc_index_dir" + echo "base directory is: $base_directory" + - mkdir ~{cromwell_root_dir}/fastq - mkdir ~{cromwell_root_dir}/bam - mkdir ~{cromwell_root_dir}/allc - mkdir ~{cromwell_root_dir}/hic + mkdir $base_directory/fastq + mkdir $base_directory/bam + mkdir 
$base_directory/allc + mkdir $base_directory/hic extract_and_remove() { if [ $# -eq 0 ]; @@ -1058,24 +1073,12 @@ task Summary { echo "lsing current directory again" ls -lRt - WORKING_DIR=`pwd` - - if [ ~{cloud_provider} = "gcp" ]; then - matrix_files_dir="~{cromwell_root_dir}~{cromwell_root_dir}/output_bams" - allc_index_dir="~{cromwell_root_dir}~{cromwell_root_dir}/allc-*" - else - matrix_files_dir="$WORKING_DIR~{cromwell_root_dir}/output_bams" - allc_index_dir="$WORKING_DIR~{cromwell_root_dir}/allc-*" - fi - echo "matrix files dir: $matrix_files_dir" - echo "allc_index_dir: $allc_index_dir" - - mv *.trimmed.stats.txt ~{cromwell_root_dir}/fastq - mv *.hisat3n_dna_summary.txt *.hisat3n_dna_split_reads_summary.R1.txt *.hisat3n_dna_split_reads_summary.R2.txt ~{cromwell_root_dir}/bam - mv $matrix_files_dir/*.hisat3n_dna.all_reads.deduped.matrix.txt ~{cromwell_root_dir}/bam - mv *.hisat3n_dna.all_reads.contact_stats.csv ~{cromwell_root_dir}/hic - mv *.allc.tsv.gz.count.csv ~{cromwell_root_dir}/allc - mv $allc_index_dir/*.allc.tsv.gz.tbi ~{cromwell_root_dir}/allc + mv *.trimmed.stats.txt $base_directory/fastq + mv *.hisat3n_dna_summary.txt *.hisat3n_dna_split_reads_summary.R1.txt *.hisat3n_dna_split_reads_summary.R2.txt $base_directory/bam + mv $matrix_files_dir/*.hisat3n_dna.all_reads.deduped.matrix.txt $base_directory/bam + mv *.hisat3n_dna.all_reads.contact_stats.csv $base_directory/hic + mv *.allc.tsv.gz.count.csv $base_directory/allc + mv $allc_index_dir/*.allc.tsv.gz.tbi $base_directory/allc cwd=`pwd` echo "current working dir is: $cwd" @@ -1090,12 +1093,13 @@ task Summary { print("Calling summary function") snm3c_summary() - print("Called summry function") + print("Called summary function") working_dir = os.getcwd() print(f"Current working direcetory is: {working_dir}") print("These are the files located here:") - os.listdir() + files = os.listdir() + print(files) CODE From 6cc717b0790d41ac5cfdded34c4f6845895b37d5 Mon Sep 17 00:00:00 2001 From: John Scira Date: 
Mon, 20 May 2024 14:08:08 -0400 Subject: [PATCH 152/186] added notes to affected changelogs --- .../exome/ExomeGermlineSingleSample.changelog.md | 5 +++++ .../ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md | 6 ++++++ .../wgs/WholeGenomeGermlineSingleSample.changelog.md | 3 +++ pipelines/skylab/optimus/Optimus.changelog.md | 5 +++++ pipelines/skylab/snm3C/snm3C.changelog.md | 4 ++++ 5 files changed, 23 insertions(+) diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md index acfffcef76..d30b3f70fe 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md @@ -1,3 +1,8 @@ +# 3.1.21 +2024-05-20 (Date of Last Commit) + +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. + # 3.1.20 2024-04-08 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md index 388d75b7fb..703ad6af50 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md @@ -1,3 +1,9 @@ +# 1.0.18 +2024-05-20 (Date of Last Commit) + +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. 
+ + # 1.0.17 2024-04-08 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md index 747a7030a1..eef60fd476 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md @@ -1,3 +1,6 @@ +# 3.1.22 +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. + # 3.1.21 2024-04-08 (Date of Last Commit) diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index eaac72c59b..ce591371fc 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,3 +1,8 @@ +# 6.6.4 +2024-05-20 (Date of Last Commit) + +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. + # 6.6.3 2024-05-08 (Date of Last Commit) diff --git a/pipelines/skylab/snm3C/snm3C.changelog.md b/pipelines/skylab/snm3C/snm3C.changelog.md index 0dbde2c845..5035568938 100644 --- a/pipelines/skylab/snm3C/snm3C.changelog.md +++ b/pipelines/skylab/snm3C/snm3C.changelog.md @@ -1,3 +1,7 @@ +# 4.0.2 +2024-05-20 (Date of Last Commit) +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to support ToA. This change does not affect the snM3C pipeline. + # 4.0.1 2024-04-18 (Date of Last Commit) * Updated the snM3C wdl to run on Azure. This change does not affect the snM3C pipeline. 
From 3d78f0b669a33d8f583968d5416a2c97d27fdd7b Mon Sep 17 00:00:00 2001 From: John Scira Date: Tue, 21 May 2024 09:17:13 -0400 Subject: [PATCH 153/186] updated changelog for WGS Single sample pipeline --- .../wgs/WholeGenomeGermlineSingleSample.changelog.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md index eef60fd476..e329070b0b 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md @@ -1,10 +1,8 @@ -# 3.1.22 -* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. - # 3.1.21 -2024-04-08 (Date of Last Commit) +2024-05-21 (Date of Last Commit) * Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. 
# 3.1.20 2024-03-26 (Date of Last Commit) From 4f61003631f1c67d04979c3d8edc291f09b40fe2 Mon Sep 17 00:00:00 2001 From: John Scira Date: Tue, 21 May 2024 09:18:24 -0400 Subject: [PATCH 154/186] updated pipeline version to match changelong --- .../germline/single_sample/exome/ExomeGermlineSingleSample.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl index f5efc80b60..00769b467c 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl @@ -45,7 +45,7 @@ import "../../../../../../tasks/broad/Utilities.wdl" as utils # WORKFLOW DEFINITION workflow ExomeGermlineSingleSample { - String pipeline_version = "3.1.20" + String pipeline_version = "3.1.21" input { From db223a848a86afbd6afe4bf19547ee3ee65a92ce Mon Sep 17 00:00:00 2001 From: John Scira Date: Tue, 21 May 2024 09:19:27 -0400 Subject: [PATCH 155/186] updated pipeline version to match changelong --- .../single_sample/exome/ExomeGermlineSingleSample.changelog.md | 2 +- .../single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md index d30b3f70fe..38e0008f64 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md @@ -1,5 +1,5 @@ # 3.1.21 -2024-05-20 (Date of Last Commit) +2024-05-21 (Date of Last Commit) * Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud 
dockers. This change does not affect this pipeline. diff --git a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl index a404f5d561..b9270c3b3c 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl @@ -50,7 +50,7 @@ workflow UltimaGenomicsWholeGenomeGermline { filtering_model_no_gt_name: "String describing the optional filtering model; default set to rf_model_ignore_gt_incl_hpol_runs" } - String pipeline_version = "1.0.17" + String pipeline_version = "1.0.18" References references = alignment_references.references From 45b40a5cde9f77544a10388ac552525f938576ac Mon Sep 17 00:00:00 2001 From: John Scira Date: Tue, 21 May 2024 09:20:44 -0400 Subject: [PATCH 156/186] updated pipeline version to match changelong --- .../broad/dna_seq/germline/variant_calling/VariantCalling.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 98b9fb77ee..34df120d96 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -9,7 +9,7 @@ import "../../../../../tasks/broad/DragenTasks.wdl" as DragenTasks workflow VariantCalling { - String pipeline_version = "2.1.20" + String pipeline_version = "2.1.19" input { From 6d0603fe804531f49d32f198192d05b4cddc8cbc Mon Sep 17 00:00:00 2001 From: John Scira Date: Tue, 21 May 2024 09:22:37 -0400 Subject: [PATCH 157/186] updated changelog and pipeline version --- pipelines/broad/arrays/imputation/Imputation.changelog.md | 6 ++++++ pipelines/broad/arrays/imputation/Imputation.wdl | 2 +- 2 files changed, 7 insertions(+), 1 
deletion(-) diff --git a/pipelines/broad/arrays/imputation/Imputation.changelog.md b/pipelines/broad/arrays/imputation/Imputation.changelog.md index e96dabb6a6..02b32dc771 100644 --- a/pipelines/broad/arrays/imputation/Imputation.changelog.md +++ b/pipelines/broad/arrays/imputation/Imputation.changelog.md @@ -1,3 +1,9 @@ +# 1.1.13 +2023-05-21 (Date of Last Commit) + +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. + + # 1.1.12 2023-12-18 (Date of Last Commit) diff --git a/pipelines/broad/arrays/imputation/Imputation.wdl b/pipelines/broad/arrays/imputation/Imputation.wdl index 44d5a93cd0..2780b64e62 100644 --- a/pipelines/broad/arrays/imputation/Imputation.wdl +++ b/pipelines/broad/arrays/imputation/Imputation.wdl @@ -6,7 +6,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow Imputation { - String pipeline_version = "1.1.12" + String pipeline_version = "1.1.13" input { Int chunkLength = 25000000 From f08c5c8ace02e2d8e6f3fde9ba6e4aa92a5dca61 Mon Sep 17 00:00:00 2001 From: John Scira Date: Tue, 21 May 2024 09:23:39 -0400 Subject: [PATCH 158/186] updated changelog and pipeline version --- .../arrays/imputation/BroadInternalImputation.changelog.md | 5 +++++ .../internal/arrays/imputation/BroadInternalImputation.wdl | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md index 0ac74c9794..6d45d66333 100644 --- a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md +++ b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md @@ -1,3 +1,8 @@ +# 1.1.11 +2024-05-21 (Date of Last Commit) + +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. 
This change does not affect this pipeline. + # 1.1.10 2023-12-18 (Date of Last Commit) diff --git a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl index 3021fe6a4c..7b5e2958ce 100644 --- a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl +++ b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.wdl @@ -9,7 +9,7 @@ workflow BroadInternalImputation { description: "Push outputs of Imputation.wdl to TDR dataset table ImputationOutputsTable and split out Imputation arrays into ImputationWideOutputsTable." allowNestedInputs: true } - String pipeline_version = "1.1.10" + String pipeline_version = "1.1.11" input { # inputs to wrapper task From ef487b86d800a11fd223d63022edf60772d00dc1 Mon Sep 17 00:00:00 2001 From: John Scira Date: Tue, 21 May 2024 09:24:59 -0400 Subject: [PATCH 159/186] updated changelog and pipeline version --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 076e8f1486..ab9eec2689 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -44,7 +44,7 @@ workflow snm3C { } # version of the pipeline - String pipeline_version = "4.0.1" + String pipeline_version = "4.0.2" call Demultiplexing { input: From d09e0bdb3f34afe7247a71744976fb3d47c6c82b Mon Sep 17 00:00:00 2001 From: John Scira Date: Tue, 21 May 2024 09:26:11 -0400 Subject: [PATCH 160/186] updated changelog and pipeline version --- pipelines/skylab/optimus/Optimus.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index b4b0196f89..91d843a839 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -68,7 +68,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = 
"6.6.3" + String pipeline_version = "6.6.4" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays From 10d934bd369bbd91edf5fc85aaf4a25d533af261 Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 23 May 2024 14:19:40 -0400 Subject: [PATCH 161/186] reverting snm3c changes --- pipelines/skylab/snm3C/snm3C.changelog.md | 4 ---- pipelines/skylab/snm3C/snm3C.wdl | 13 ++----------- 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.changelog.md b/pipelines/skylab/snm3C/snm3C.changelog.md index 5035568938..0dbde2c845 100644 --- a/pipelines/skylab/snm3C/snm3C.changelog.md +++ b/pipelines/skylab/snm3C/snm3C.changelog.md @@ -1,7 +1,3 @@ -# 4.0.2 -2024-05-20 (Date of Last Commit) -* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to support ToA. This change does not affect the snM3C pipeline. - # 4.0.1 2024-04-18 (Date of Last Commit) * Updated the snM3C wdl to run on Azure. This change does not affect the snM3C pipeline. 
diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 1a1a6f1aa0..0f206de50c 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -44,7 +44,7 @@ workflow snm3C { } # version of the pipeline - String pipeline_version = "4.0.2" + String pipeline_version = "4.0.1" call Demultiplexing { input: @@ -72,10 +72,8 @@ workflow snm3C { r2_right_cut = r2_right_cut, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir, cloud_provider = cloud_provider, - } call Hisat_single_end as Hisat_single_end { @@ -85,10 +83,8 @@ workflow snm3C { genome_fa = genome_fa, plate_id = plate_id, docker = docker_prefix + m3c_yap_hisat_docker, - cromwell_root_dir = cromwell_root_dir, cloud_provider = cloud_provider - } call Merge_sort_analyze as Merge_sort_analyze { @@ -259,7 +255,6 @@ task Hisat_paired_end { File chromosome_sizes String plate_id String docker - String cromwell_root_dir String cloud_provider @@ -276,10 +271,7 @@ task Hisat_paired_end { Int preemptible_tries = 2 String cpu_platform = "Intel Ice Lake" } - - cromwell_root_dir=$(pwd) - batch_dir=$cromwell_root_dir/batch* - + command <<< set -euo pipefail WORKING_DIR=`pwd` @@ -323,7 +315,6 @@ task Hisat_paired_end { echo "lsing cromwell root dir" ls -lR ~{cromwell_root_dir} - # define lists of r1 and r2 fq files if [ ~{cloud_provider} = "gcp" ]; then batch_dir="~{cromwell_root_dir}~{cromwell_root_dir}/batch*/" From 3484065049cf518ca67bf916614330255b90e826 Mon Sep 17 00:00:00 2001 From: John Scira Date: Thu, 23 May 2024 15:43:21 -0400 Subject: [PATCH 162/186] added notes for SlideSeq and changelog for updated tasks --- pipelines/skylab/slideseq/SlideSeq.changelog.md | 5 +++++ pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index b637f92696..74ad623070 100644 --- 
a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,3 +1,8 @@ +# 3.1.7 +2023-05-23 (Date of Last Commit) + +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. + # 3.1.6 2024-05-07 (Date of Last Commit) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 0502a32fcd..0998c8eb9b 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.1.6" + String pipeline_version = "3.1.7" input { Array[File] r1_fastq From 4e8b5e51e760b544d67fea376249428870b02c39 Mon Sep 17 00:00:00 2001 From: Nareh Sahakian Date: Wed, 29 May 2024 09:27:39 -0400 Subject: [PATCH 163/186] remove logging and debugging statements --- pipelines/skylab/snm3C/snm3C.wdl | 67 +------------------------------- 1 file changed, 2 insertions(+), 65 deletions(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 0f206de50c..2e07e931c9 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -312,10 +312,6 @@ task Hisat_paired_end { elapsed=$((end - start)) echo "Elapsed time to untar: $elapsed seconds" - echo "lsing cromwell root dir" - ls -lR ~{cromwell_root_dir} - - # define lists of r1 and r2 fq files if [ ~{cloud_provider} = "gcp" ]; then batch_dir="~{cromwell_root_dir}~{cromwell_root_dir}/batch*/" else @@ -426,7 +422,6 @@ task Hisat_paired_end { for file in "${R1_files[@]}"; do ( echo "starting task $file.." 
- du -h $batch_dir/$file task "$file" sleep $(( (RANDOM % 3) + 1)) ) & @@ -568,10 +563,6 @@ task Hisat_single_end { R1_files=($(ls | grep "\.hisat3n_dna.split_reads.R1.fastq")) R2_files=($(ls | grep "\.hisat3n_dna.split_reads.R2.fastq")) - echo "Found R1 files: $R1_files" - echo "Found R2 files: $R2_files" - - task() { BASE=$(basename "$file" ".hisat3n_dna.split_reads.R1.fastq") echo $BASE @@ -650,18 +641,13 @@ task Hisat_single_end { elapsed=$((end - start)) echo "Elapsed time to run samtools -q 10 $elapsed seconds" - # remove_overlap_read_parts - echo "recusively ls cromwell root" - ls -lR ~{cromwell_root_dir} - if [ ~{cloud_provider} = "gcp" ]; then bam_path_prefix="~{cromwell_root_dir}" else bam_path_prefix=$WORKING_DIR fi - echo "bam_path_prefix $bam_path_prefix" - + # remove_overlap_read_parts echo "call remove_overlap_read_parts" start=$(date +%s) python3 -c 'from cemba_data.hisat3n import *;import os;remove_overlap_read_parts(in_bam_path="'"$BASE"'.name_sorted.filtered.bam",out_bam_path="'"$BASE"'.hisat3n_dna.split_reads.read_overlap.bam")' @@ -869,17 +855,12 @@ task Merge_sort_analyze { elapsed=$((end - start)) echo "Elapsed time to chromatin contacts $elapsed seconds" - echo "recursively ls cromwell root" - ls -lR ~{cromwell_root_dir} - if [ ~{cloud_provider} = "gcp" ]; then reference_fasta="~{cromwell_root_dir}/reference/~{genome_base}" else reference_fasta="$WORKING_DIR/reference/~{genome_base}" fi - echo "reference fast location: $reference_fasta" - start=$(date +%s) echo "Call allcools bam-to-allc from deduped.bams" /opt/conda/bin/allcools bam-to-allc \ @@ -1018,11 +999,6 @@ task Summary { command <<< set -euo pipefail - echo "recursively lsing cromwell root in summary task" - ls -lR ~{cromwell_root_dir} - echo "lsing current dir" - ls -lrt - WORKING_DIR=`pwd` if [ ~{cloud_provider} = "gcp" ]; then @@ -1034,10 +1010,6 @@ task Summary { matrix_files_dir="$WORKING_DIR~{cromwell_root_dir}/output_bams" 
allc_index_dir="$WORKING_DIR~{cromwell_root_dir}/allc-*" fi - echo "matrix files dir: $matrix_files_dir" - echo "allc_index_dir: $allc_index_dir" - echo "base directory is: $base_directory" - mkdir $base_directory/fastq mkdir $base_directory/bam @@ -1051,9 +1023,7 @@ task Summary { return fi for tar in "${@}"; do - echo "unstarring this file now: $tar" tar -xvf "$tar" - echo "removing this tar file now: $tar" rm "$tar" done } @@ -1067,12 +1037,6 @@ task Summary { extract_and_remove ~{sep=' ' allc_uniq_reads_stats} extract_and_remove ~{sep=' ' unique_reads_cgn_extraction_tbi} - echo "lsing cromwell root again" - ls -lRt ~{cromwell_root_dir} - - echo "lsing current directory again" - ls -lRt - mv *.trimmed.stats.txt $base_directory/fastq mv *.hisat3n_dna_summary.txt *.hisat3n_dna_split_reads_summary.R1.txt *.hisat3n_dna_split_reads_summary.R2.txt $base_directory/bam mv $matrix_files_dir/*.hisat3n_dna.all_reads.deduped.matrix.txt $base_directory/bam @@ -1080,34 +1044,7 @@ task Summary { mv *.allc.tsv.gz.count.csv $base_directory/allc mv $allc_index_dir/*.allc.tsv.gz.tbi $base_directory/allc - cwd=`pwd` - echo "current working dir is: $cwd" - - - python3 <>> From c0672fdb5fcaa5db15ca682c376ee6043a347cb2 Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 7 Jun 2024 09:48:55 -0400 Subject: [PATCH 164/186] just testing --- pipelines/skylab/paired_tag/PairedTag.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index 22667f27d9..64ab0c9d1b 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -51,7 +51,7 @@ workflow PairedTag { } # All docker images that are needed for tasks in this workflow - String upstools_docker = "upstools:1.2.0-2023.03.03-1704723060" + String upstools_docker = "upstools:2.0.0" String snapatac_docker = "snapatac2:1.0.4-2.3.1-1700590229" # Prefixes based on cloud env @@ -100,7 +100,8 @@ 
workflow PairedTag { barcodes_fastq = atac_r2_fastq[idx], input_id = input_id, whitelist = atac_whitelist, - preindex = preindex + preindex = preindex, + docker_path = docker_prefix + upstools_docker } } From 774b129d80d1ab224af4ad4b9dd53cfb74719ab2 Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 7 Jun 2024 11:42:19 -0400 Subject: [PATCH 165/186] add input to test --- verification/test-wdls/TestPairedTag.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/verification/test-wdls/TestPairedTag.wdl b/verification/test-wdls/TestPairedTag.wdl index 9695fb98e6..5294e258a5 100644 --- a/verification/test-wdls/TestPairedTag.wdl +++ b/verification/test-wdls/TestPairedTag.wdl @@ -53,6 +53,7 @@ workflow TestPairedTag { String vault_token_path String google_account_vault_path Boolean run_cellbender = false + String cloud_provider } @@ -86,7 +87,8 @@ workflow TestPairedTag { adapter_seq_read3 = adapter_seq_read3, chrom_sizes = chrom_sizes, atac_whitelist = atac_whitelist, - soloMultiMappers = soloMultiMappers + soloMultiMappers = soloMultiMappers, + cloud_provider = cloud_provider } From 498b7438359736174136d5765cae617b7743b96b Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 7 Jun 2024 14:31:37 -0400 Subject: [PATCH 166/186] changelogs --- pipelines/skylab/multiome/Multiome.changelog.md | 5 +++++ pipelines/skylab/multiome/Multiome.wdl | 2 +- pipelines/skylab/multiome/atac.changelog.md | 5 +++++ pipelines/skylab/multiome/atac.wdl | 4 ++-- pipelines/skylab/optimus/Optimus.wdl | 2 +- pipelines/skylab/paired_tag/PairedTag.changelog.md | 5 +++++ pipelines/skylab/paired_tag/PairedTag.wdl | 2 +- 7 files changed, 20 insertions(+), 5 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index a2737a1c95..ea1e04d94d 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,3 +1,8 @@ +# 5.0.1 +2024-05-20 (Date of Last Commit) + +* 
Updated the Multiome.wdl to run on Azure + # 5.0.0 2024-05-20 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index b7f21a8b39..3c3b7d222b 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "5.0.0" + String pipeline_version = "5.0.1" input { String cloud_provider diff --git a/pipelines/skylab/multiome/atac.changelog.md b/pipelines/skylab/multiome/atac.changelog.md index 7478a49e0d..596b401a07 100644 --- a/pipelines/skylab/multiome/atac.changelog.md +++ b/pipelines/skylab/multiome/atac.changelog.md @@ -1,3 +1,8 @@ +# 5.0.1 +2024-06-07 (Date of Last Commit) + +* Updated the atac.wdl to run on Azure + # 2.0.0 2024-05-20 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 644799da11..2ff8512111 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -43,11 +43,11 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "2.0.0" + String pipeline_version = "2.0.1" # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" - String acr_docker_prefix = "dsppipelinedev.azurecr.io/" + String acr_docker_prefix = "dsppipeli nedev.azurecr.io/" String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix # Docker image names diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 6f56e87060..b2a05cf727 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -68,7 +68,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = "7.1.0" + String pipeline_version = "7.1.2" # this is used to scatter matched [r1_fastq, r2_fastq, 
i1_fastq] arrays diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index 22491b540b..aa3efd879a 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,3 +1,8 @@ +# 0.7.1 +2024-06-07 (Date of Last Commit) + +* Updated the PairedTag.wdl to run on Azure + # 0.7.0 2024-05-20 diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index 64ab0c9d1b..bc19f65160 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -7,7 +7,7 @@ import "../../../tasks/skylab/PairedTagUtils.wdl" as Demultiplexing import "../../../tasks/broad/Utilities.wdl" as utils workflow PairedTag { - String pipeline_version = "0.7.0" + String pipeline_version = "0.7.1" input { String input_id From 3d942b9f6181642360fa2857e4f272a5b72d0586 Mon Sep 17 00:00:00 2001 From: npetrill Date: Fri, 7 Jun 2024 14:32:55 -0400 Subject: [PATCH 167/186] changelogs --- pipelines/skylab/multiome/atac.changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/multiome/atac.changelog.md b/pipelines/skylab/multiome/atac.changelog.md index 596b401a07..86c284ef92 100644 --- a/pipelines/skylab/multiome/atac.changelog.md +++ b/pipelines/skylab/multiome/atac.changelog.md @@ -1,4 +1,4 @@ -# 5.0.1 +# 2.0.1 2024-06-07 (Date of Last Commit) * Updated the atac.wdl to run on Azure From a27adc23935088f73b37742fab7d381949c0951c Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 11 Jun 2024 10:28:43 -0400 Subject: [PATCH 168/186] changelogs --- pipelines/skylab/multiome/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 2ff8512111..3c2f420c20 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -47,7 +47,7 @@ workflow ATAC { # 
Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" - String acr_docker_prefix = "dsppipeli nedev.azurecr.io/" + String acr_docker_prefix = "dsppipelinedev.azurecr.io/" String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix # Docker image names From 08d6bbce9f4d46d679851498b4ed1b249d4d0c00 Mon Sep 17 00:00:00 2001 From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:22:12 -0400 Subject: [PATCH 169/186] Np make vm size an input to multiome (#1289) * add vm size as input to Multiome.wdl * add vm size as input to Multiome.wdl * add new input to overviews * add new input to overviews * add new input to overviews * add new input to overviews --- pipelines/skylab/multiome/Multiome.wdl | 7 ++- pipelines/skylab/multiome/atac.wdl | 17 ++++-- pipelines/skylab/paired_tag/PairedTag.wdl | 6 ++- .../Plumbing/BC011_BC015_downsampled.json | 3 +- .../Plumbing/BI015_downsampled.json | 3 +- website/docs/Pipelines/ATAC/README.md | 33 ++++++------ .../Pipelines/Multiome_Pipeline/README.md | 52 ++++++++++--------- 7 files changed, 70 insertions(+), 51 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 3c3b7d222b..1a150ea9ea 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -35,6 +35,8 @@ workflow Multiome { Array[File] atac_r1_fastq Array[File] atac_r2_fastq Array[File] atac_r3_fastq + # VM size used for several ATAC tasks + String vm_size = "Standard_M128s" # BWA tar reference File tar_bwa_reference # Chromosone sizes @@ -109,8 +111,9 @@ workflow Multiome { chrom_sizes = chrom_sizes, whitelist = atac_whitelist, adapter_seq_read1 = adapter_seq_read1, - annotations_gtf = annotations_gtf, - adapter_seq_read3 = adapter_seq_read3 + adapter_seq_read3 = adapter_seq_read3, + vm_size = vm_size, + annotations_gtf = annotations_gtf } call 
H5adUtils.JoinMultiomeBarcodes as JoinBarcodes { input: diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 3c2f420c20..061e9c892b 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -30,6 +30,7 @@ workflow ATAC { Int num_threads_bwa = 128 Int mem_size_bwa = 512 String cpu_platform_bwa = "Intel Ice Lake" + String vm_size # Text file containing chrom_sizes for genome build (i.e. hg38) File chrom_sizes @@ -80,7 +81,8 @@ workflow ATAC { input: nthreads = num_threads_bwa, mem_size = mem_size_bwa, - cpu_platform = cpu_platform_bwa + cpu_platform = cpu_platform_bwa, + vm_size = vm_size } call FastqProcessing.FastqProcessATAC as SplitFastq { @@ -116,7 +118,8 @@ workflow ATAC { mem_size = mem_size_bwa, cpu_platform = cpu_platform_bwa, docker_path = docker_prefix + samtools_docker, - cloud_provider = cloud_provider + cloud_provider = cloud_provider, + vm_size = vm_size } if (preindex) { @@ -166,12 +169,14 @@ task GetNumSplits { Int mem_size String cpu_platform String docker_image = "ubuntu:latest" + String vm_size } parameter_meta { docker_image: "the ubuntu docker image (default: ubuntu:latest)" nthreads: "Number of threads per node (default: 128)" mem_size: "the size of memory used during alignment" + vm_size: "the virtual machine used for the task" } command <<< @@ -236,7 +241,7 @@ task GetNumSplits { cpu: nthreads cpuPlatform: cpu_platform memory: "${mem_size} GiB" - vm_size: "Standard_M128s" + vm_size: vm_size } output { @@ -327,7 +332,8 @@ task BWAPairedEndAlignment { Int disk_size = 2000 Int nthreads Int mem_size - String cpu_platform + String cpu_platform + String vm_size } parameter_meta { @@ -342,6 +348,7 @@ task BWAPairedEndAlignment { output_base_name: "basename to be used for the output of the task" docker_path: "The docker image path containing the runtime environment for this task" cloud_provider: "The cloud provider for the pipeline." 
+ vm_size: "the virtual machine used for the task" } String bam_aligned_output_name = output_base_name + ".bam" @@ -471,7 +478,7 @@ task BWAPairedEndAlignment { cpu: nthreads cpuPlatform: cpu_platform memory: "${mem_size} GiB" - vm_size: "Standard_M128s" + vm_size: vm_size } output { diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index bc19f65160..ce1eb08599 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -34,6 +34,9 @@ workflow PairedTag { Array[File] atac_r1_fastq Array[File] atac_r2_fastq Array[File] atac_r3_fastq + + String vm_size = "Standard_M128s" + # BWA input File tar_bwa_reference File chrom_sizes @@ -119,7 +122,8 @@ workflow PairedTag { adapter_seq_read3 = adapter_seq_read3, annotations_gtf = annotations_gtf, preindex = preindex, - cloud_provider = cloud_provider + cloud_provider = cloud_provider, + vm_size = vm_size } if (preindex) { diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json index 9e7b18b679..470b1ce33c 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json @@ -23,5 +23,6 @@ "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake", "PairedTag.Atac_preindex.num_threads_bwa":"16", "PairedTag.Atac_preindex.mem_size_bwa":"64", - "PairedTag.soloMultiMappers":"Uniform" + "PairedTag.soloMultiMappers":"Uniform", + "PairedTag.cloud_provider": "gcp" } diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json index 2bdd7a8fe2..67560d3aee 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json @@ -23,5 
+23,6 @@ "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake", "PairedTag.Atac_preindex.num_threads_bwa":"16", "PairedTag.Atac_preindex.mem_size_bwa":"64", - "PairedTag.soloMultiMappers":"Uniform" + "PairedTag.soloMultiMappers":"Uniform", + "PairedTag.cloud_provider": "gcp" } diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 76033520f8..d95d82a440 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -44,23 +44,24 @@ ATAC can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/stable/ ## Input Variables The following describes the inputs of the ATAC workflow. For more details on how default inputs are set for the Multiome workflow, see the [Multiome overview](../Multiome_Pipeline/README). -| Variable name | Description | -| --- | --- | -| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). | -| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | -| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | -| input_id | Output prefix/base name for all intermediate files and pipeline outputs. | +| Variable name | Description | +| --- |-----------------------------------------------------------------------------------------------------------------| +| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). | +| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | +| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | +| input_id | Output prefix/base name for all intermediate files and pipeline outputs. | | cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | -| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. 
| -| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). | -| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). | -| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). | -| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). | -| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file.| -| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) | -| whitelist | Whitelist file for ATAC cellular barcodes. | -| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. | -| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. | +| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | +| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). | +| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). | +| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). | +| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). | +| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file. | +| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) | +| whitelist | Whitelist file for ATAC cellular barcodes. | +| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. | +| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. 
| +| vm_size | String defining the Azure virtual machine family for the workflow (default: "Standard_M128s"). | String | ## ATAC tasks and tools diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index ecdbea40b1..4d77ad4dfe 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -52,32 +52,34 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta ## Inputs -| Input name | Description | Type | -| --- | --- | --- | -| input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String | -| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | -| annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File | -| gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] | -| gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] | -| gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline. | Array[File] | -| tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline. | File | -| mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive). | File | -| counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna". 
| String | -| tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3". | Integer | -| emptydrops_lower | Optional threshold for UMIs for the Optimus (GEX) pipeline that empty drops tool should consider for determining cell; data below threshold is not removed; default is "100". | Integer | -| force_no_check | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should perform checks; default is "false". | Boolean | +| Input name | Description | Type | +| --- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| --- | +| input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String | +| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | +| annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File | +| gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] | +| gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library. | Array[File] | +| gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline. | Array[File] | +| tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline. 
| File | +| mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive). | File | +| counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna". | String | +| tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3". | Integer | +| emptydrops_lower | Optional threshold for UMIs for the Optimus (GEX) pipeline that empty drops tool should consider for determining cell; data below threshold is not removed; default is "100". | Integer | +| force_no_check | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should perform checks; default is "false". | Boolean | | ignore_r1_read_length | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should ignore barcode chemistry check; if "true", the workflow will not ensure the `10x_chemistry_version` input matches the chemistry in the read 1 FASTQ; default is "false". | Boolean | -| star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". | String | -| count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". | Boolean | -| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag. | String | -| atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library. 
| Array[File] | -| atac_r2_fastq | Array of barcodes FASTQ files representing a single 10x multiome ATAC library. | Array[File] | -| atac_r3_fastq | Array of read 2 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] | -| tar_bwa_reference | TAR file containing the reference index files for BWA-mem alignment for the ATAC pipeline. | File | -| chrom_sizes | File containing the genome chromosome sizes; used to calculate ATAC fragment file metrics. | File | -| adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG". | String | -| adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG". | String | -| run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". | Boolean | +| star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". | String | +| count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". | Boolean | +| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag. | String | +| atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] | +| atac_r2_fastq | Array of barcodes FASTQ files representing a single 10x multiome ATAC library. 
| Array[File] | +| atac_r3_fastq | Array of read 2 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] | +| tar_bwa_reference | TAR file containing the reference index files for BWA-mem alignment for the ATAC pipeline. | File | +| chrom_sizes | File containing the genome chromosome sizes; used to calculate ATAC fragment file metrics. | File | +| adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG". | String | +| adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG". | String | +| run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". | Boolean | +| vm_size | String defining the Azure virtual machine family for the workflow (default: "Standard_M128s"). 
| String | + #### Sample inputs for analyses in a Terra Workspace From 988c970eecd965ce283e01b0ef461df04b71c639 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 25 Jun 2024 13:03:42 -0400 Subject: [PATCH 170/186] more disk and mem --- tasks/broad/GermlineVariantDiscovery.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/broad/GermlineVariantDiscovery.wdl b/tasks/broad/GermlineVariantDiscovery.wdl index d6b6e55cc3..bdfa826dfc 100644 --- a/tasks/broad/GermlineVariantDiscovery.wdl +++ b/tasks/broad/GermlineVariantDiscovery.wdl @@ -103,13 +103,13 @@ task HaplotypeCaller_GATK4_VCF { Int memory_multiplier = 1 } - Int memory_size_mb = ceil(8000 * memory_multiplier) + Int memory_size_mb = ceil(8000 * memory_multiplier) + 2000 String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" String output_file_name = vcf_basename + output_suffix Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") - Int disk_size = ceil(((size(input_bam, "GiB") + 30) / hc_scatter) + ref_size) + 20 + Int disk_size = ceil(((size(input_bam, "GiB") + 30) / hc_scatter) + ref_size) + 50 String bamout_arg = if make_bamout then "-bamout ~{vcf_basename}.bamout.bam" else "" From 1da298a607aecbbe90e352a1e15a7b33adb7286c Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 3 Jul 2024 14:21:47 -0400 Subject: [PATCH 171/186] Azurize Cell Bender in Multiome (#1299) --- pipelines/skylab/multiome/Multiome.wdl | 45 ++++++++++++++++++-------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 1a150ea9ea..6257e744d5 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -3,6 +3,7 @@ version 1.0 import "../../../pipelines/skylab/multiome/atac.wdl" as atac import "../../../pipelines/skylab/optimus/Optimus.wdl" as optimus import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils +import 
"https://raw.githubusercontent.com/aawdeh/CellBender/aa-cbwithoutcuda/wdl/cellbender_remove_background_azure.wdl" as CellBender_no_cuda import "https://raw.githubusercontent.com/broadinstitute/CellBender/v0.3.0/wdl/cellbender_remove_background.wdl" as CellBender import "../../../tasks/broad/Utilities.wdl" as utils @@ -127,25 +128,43 @@ workflow Multiome { # Call CellBender if (run_cellbender) { - call CellBender.run_cellbender_remove_background_gpu as CellBender { - input: - sample_name = input_id, - input_file_unfiltered = Optimus.h5ad_output_file, - hardware_boot_disk_size_GB = 20, - hardware_cpu_count = 4, - hardware_disk_size_GB = 50, - hardware_gpu_type = "nvidia-tesla-t4", - hardware_memory_GB = 32, - hardware_preemptible_tries = 2, - hardware_zones = "us-central1-a us-central1-c", - nvidia_driver_version = "470.82.01" - } + if (cloud_provider == "gcp") { + call CellBender.run_cellbender_remove_background_gpu as CellBender { + input: + sample_name = input_id, + input_file_unfiltered = Optimus.h5ad_output_file, + hardware_boot_disk_size_GB = 20, + hardware_cpu_count = 4, + hardware_disk_size_GB = 50, + hardware_gpu_type = "nvidia-tesla-t4", + hardware_memory_GB = 32, + hardware_preemptible_tries = 2, + hardware_zones = "us-central1-a us-central1-c", + nvidia_driver_version = "470.82.01" + } + } + if (cloud_provider == "azure") { + call CellBender_no_cuda.run_cellbender_remove_background_gpu as CellBender_no_cuda { + input: + sample_name = input_id, + input_file_unfiltered = Optimus.h5ad_output_file, + hardware_boot_disk_size_GB = 20, + hardware_cpu_count = 4, + hardware_disk_size_GB = 50, + hardware_gpu_type = "nvidia-tesla-t4", + hardware_memory_GB = 32, + hardware_preemptible_tries = 2, + hardware_zones = "us-central1-a us-central1-c", + nvidia_driver_version = "470.82.01" + } + } } meta { allowNestedInputs: true } + output { String multiome_pipeline_version_out = pipeline_version From b3293c57dbe7cec9379910f8f9926d223a9d0a18 Mon Sep 17 00:00:00 2001 From: 
npetrill Date: Tue, 9 Jul 2024 12:04:27 -0400 Subject: [PATCH 172/186] update docker for Summary_PerCellOutput --- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index b1850b8af1..5975a2a46e 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -112,7 +112,7 @@ workflow snm3C { unique_reads_cgn_extraction_allc_extract = Merge_sort_analyze.extract_allc_output_allc_tar, unique_reads_cgn_extraction_tbi_extract = Merge_sort_analyze.extract_allc_output_tbi_tar, plate_id = plate_id, - docker = docker + docker = docker_prefix + m3c_yap_hisat_docker } call Summary { From 82cccda91e19a526682f849a3f575c31f32b74d1 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 9 Jul 2024 13:17:22 -0400 Subject: [PATCH 173/186] changelogs --- pipelines/broad/arrays/single_sample/Arrays.wdl | 2 +- .../single_sample/wgs/WholeGenomeGermlineSingleSample.wdl | 2 +- .../broad/dna_seq/germline/variant_calling/VariantCalling.wdl | 2 +- .../single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl | 2 +- pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl | 2 +- .../broad/internal/arrays/single_sample/BroadInternalArrays.wdl | 2 +- .../UltimaGenomics/BroadInternalUltimaGenomics.wdl | 2 +- pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl | 2 +- pipelines/broad/qc/CheckFingerprint.wdl | 2 +- pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl | 2 +- .../reprocessing/external/exome/ExternalExomeReprocessing.wdl | 2 +- .../external/wgs/ExternalWholeGenomeReprocessing.wdl | 2 +- pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl | 2 +- pipelines/skylab/snm3C/snm3C.wdl | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pipelines/broad/arrays/single_sample/Arrays.wdl b/pipelines/broad/arrays/single_sample/Arrays.wdl index 75e52e5c90..2455e4ab20 100644 --- 
a/pipelines/broad/arrays/single_sample/Arrays.wdl +++ b/pipelines/broad/arrays/single_sample/Arrays.wdl @@ -23,7 +23,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow Arrays { - String pipeline_version = "2.6.24" + String pipeline_version = "2.6.25" input { String chip_well_barcode diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl index 48af86c619..72183e30db 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl @@ -40,7 +40,7 @@ import "../../../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeGermlineSingleSample { - String pipeline_version = "3.1.21" + String pipeline_version = "3.1.22" input { diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 34df120d96..98b9fb77ee 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -9,7 +9,7 @@ import "../../../../../tasks/broad/DragenTasks.wdl" as DragenTasks workflow VariantCalling { - String pipeline_version = "2.1.19" + String pipeline_version = "2.1.20" input { diff --git a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl index 17d4fecfb8..6cc165522f 100644 --- a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl +++ b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl @@ -43,7 +43,7 @@ workflow UltimaGenomicsWholeGenomeCramOnly { save_bam_file: "If true, then save intermeidate ouputs used by 
germline pipeline (such as the output BAM) otherwise they won't be kept as outputs." } - String pipeline_version = "1.0.17" + String pipeline_version = "1.0.18" References references = alignment_references.references diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl index 2443bc8bcb..314995c5db 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl @@ -21,7 +21,7 @@ import "../../../../tasks/broad/Qc.wdl" as Qc workflow IlluminaGenotypingArray { - String pipeline_version = "1.12.18" + String pipeline_version = "1.12.19" input { String sample_alias diff --git a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl index b7bf1c183e..6a9b1b195d 100644 --- a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl +++ b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl @@ -9,7 +9,7 @@ workflow BroadInternalArrays { description: "Push outputs of Arrays.wdl to TDR dataset table ArraysOutputsTable." 
} - String pipeline_version = "1.1.8" + String pipeline_version = "1.1.9" input { # inputs to wrapper task diff --git a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl index 946c9196dd..df1b6e664b 100644 --- a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl +++ b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl @@ -6,7 +6,7 @@ import "../../../../../../../pipelines/broad/qc/CheckFingerprint.wdl" as FP workflow BroadInternalUltimaGenomics { - String pipeline_version = "1.0.18" + String pipeline_version = "1.0.19" input { diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl index d4f5316e89..766f087263 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl @@ -7,7 +7,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow BroadInternalRNAWithUMIs { - String pipeline_version = "1.0.30" + String pipeline_version = "1.0.31" input { # input needs to be either "hg19" or "hg38" diff --git a/pipelines/broad/qc/CheckFingerprint.wdl b/pipelines/broad/qc/CheckFingerprint.wdl index 0338466c3b..dcc7ee057f 100644 --- a/pipelines/broad/qc/CheckFingerprint.wdl +++ b/pipelines/broad/qc/CheckFingerprint.wdl @@ -24,7 +24,7 @@ import "../../../tasks/broad/Qc.wdl" as Qc workflow CheckFingerprint { - String pipeline_version = "1.0.17" + String pipeline_version = "1.0.18" input { File? 
input_vcf diff --git a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl index 0f4fadb666..49c768ba3c 100644 --- a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl @@ -7,7 +7,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow ExomeReprocessing { - String pipeline_version = "3.1.20" + String pipeline_version = "3.1.21" input { File? input_cram diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl index 3ff6daaa8b..84ff050377 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl @@ -5,7 +5,7 @@ import "../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalExomeReprocessing { - String pipeline_version = "3.1.22" + String pipeline_version = "3.1.23" input { diff --git a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl index 9776ce06d5..35b77bc3ca 100644 --- a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalWholeGenomeReprocessing { - String pipeline_version = "2.1.22" + String pipeline_version = "2.1.23" input { File? 
input_cram diff --git a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl index cd4afd70b5..e7f6d51614 100644 --- a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeReprocessing { - String pipeline_version = "3.1.21" + String pipeline_version = "3.1.22" input { File? input_cram diff --git a/pipelines/skylab/snm3C/snm3C.wdl b/pipelines/skylab/snm3C/snm3C.wdl index 5975a2a46e..dc1f47b00e 100644 --- a/pipelines/skylab/snm3C/snm3C.wdl +++ b/pipelines/skylab/snm3C/snm3C.wdl @@ -44,7 +44,7 @@ workflow snm3C { } # version of the pipeline - String pipeline_version = "4.0.1" + String pipeline_version = "4.0.2" call Demultiplexing { input: From deb14ab9c134a14e08405e0136eddabbc6e7aad4 Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 9 Jul 2024 13:18:53 -0400 Subject: [PATCH 174/186] changelogs --- pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 0998c8eb9b..1f8b7f7b00 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.1.7" + String pipeline_version = "3.1.8" input { Array[File] r1_fastq From 55385c69fc399548aa5cb2e185bb01356af11ebb Mon Sep 17 00:00:00 2001 From: npetrill Date: Tue, 9 Jul 2024 13:45:31 -0400 Subject: [PATCH 175/186] update dockers --- pipelines/skylab/optimus/Optimus.wdl | 2 +- pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 7e8d089a6e..239ff28423 100644 --- 
a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -91,7 +91,7 @@ workflow Optimus { String star_docker = "star:1.0.1-2.7.11a-1692706072" String warp_tools_docker_2_0_1 = "warp-tools:2.0.1" String warp_tools_docker_2_0_2 = "warp-tools:2.0.2-1709308985" - String star_merge_docker = "star-merge-npz:1.1" + String star_merge_docker = "star-merge-npz:1.2" #TODO how do we handle these? String alpine_docker = "alpine-bash:latest" diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 1f8b7f7b00..7d7da7fd50 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -50,7 +50,7 @@ workflow SlideSeq { String picard_cloud_docker = "picard-cloud:2.26.10" String warp_tools_docker_2_0_1 = "warp-tools:2.0.1" String warp_tools_docker_2_0_2 = "warp-tools:2.0.2-1709308985" - String star_merge_docker = "star-merge-npz:1.1" + String star_merge_docker = "star-merge-npz:1.2" String ubuntu_docker = "ubuntu_16_0_4:latest" String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/" From 1722781692cb2589ba3cfc839f1713ff07928761 Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 11 Jul 2024 14:08:50 -0400 Subject: [PATCH 176/186] more changelogs ugh --- pipelines/skylab/multiome/atac.wdl | 2 +- pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index f94002abf9..4822ade914 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -46,7 +46,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "2.0.1" + String pipeline_version = "2.1.1" # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 7d7da7fd50..a5aba56884 
100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.1.8" + String pipeline_version = "3.1.9" input { Array[File] r1_fastq From 6228acaaa2fd9714f1342f0a867a818d0142482d Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 11 Jul 2024 14:10:44 -0400 Subject: [PATCH 177/186] more changelogs ugh --- pipelines/skylab/multiome/atac.changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/multiome/atac.changelog.md b/pipelines/skylab/multiome/atac.changelog.md index f0dd220a3c..ae2a9705bc 100644 --- a/pipelines/skylab/multiome/atac.changelog.md +++ b/pipelines/skylab/multiome/atac.changelog.md @@ -1,7 +1,7 @@ # 2.1.1 2024-07-11 (Date of Last Commit) -* Updated the atac.wdl to run on Azure +* Updated the atac.wdl to run on Azure # 2.1.0 2024-07-09 (Date of Last Commit) From 7f38addfe8c4acdc6f15c5410561c12237adfa6b Mon Sep 17 00:00:00 2001 From: npetrill Date: Thu, 11 Jul 2024 14:13:10 -0400 Subject: [PATCH 178/186] more changelogs ugh --- pipelines/skylab/multiome/atac.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 4822ade914..baa608552e 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -68,7 +68,6 @@ workflow ATAC { } } - String pipeline_version = "2.1.1" parameter_meta { read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads" From b3854d2be7bf57930018a137d86c8c4c1aeb2729 Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 15 Jul 2024 10:18:40 -0400 Subject: [PATCH 179/186] made minor updates, not patches --- .../joint_genotyping/reblocking/ReblockGVCF.changelog.md | 4 ++-- .../germline/joint_genotyping/reblocking/ReblockGVCF.wdl | 2 +- .../wgs/WholeGenomeGermlineSingleSample.changelog.md | 4 ++-- 
.../single_sample/wgs/WholeGenomeGermlineSingleSample.wdl | 2 +- .../germline/variant_calling/VariantCalling.changelog.md | 4 ++-- .../dna_seq/germline/variant_calling/VariantCalling.wdl | 2 +- .../broad/reprocessing/exome/ExomeReprocessing.changelog.md | 4 ++-- pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl | 2 +- .../external/exome/ExternalExomeReprocessing.changelog.md | 2 +- .../external/exome/ExternalExomeReprocessing.wdl | 2 +- .../wgs/ExternalWholeGenomeReprocessing.changelog.md | 4 ++-- .../external/wgs/ExternalWholeGenomeReprocessing.wdl | 2 +- .../reprocessing/wgs/WholeGenomeReprocessing.changelog.md | 4 ++-- pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl | 2 +- pipelines/skylab/multiome/Multiome.changelog.md | 5 +++-- pipelines/skylab/multiome/Multiome.wdl | 2 +- pipelines/skylab/multiome/atac.changelog.md | 4 ++-- pipelines/skylab/multiome/atac.wdl | 2 +- pipelines/skylab/optimus/Optimus.changelog.md | 4 ++-- pipelines/skylab/optimus/Optimus.wdl | 2 +- pipelines/skylab/paired_tag/PairedTag.changelog.md | 4 ++-- pipelines/skylab/paired_tag/PairedTag.wdl | 2 +- pipelines/skylab/slideseq/SlideSeq.changelog.md | 4 ++-- pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- .../MultiSampleSmartSeq2SingleNucleus.changelog.md | 4 ++-- .../MultiSampleSmartSeq2SingleNucleus.wdl | 2 +- 26 files changed, 39 insertions(+), 38 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md index 2ca2b89d97..a7e79abe57 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md @@ -1,7 +1,7 @@ -# 2.1.14 +# 2.2.0 2024-07-09 (Date of Last Commit) -* Updated ReblockGVCF.wdl to run in Azure. +* Updated ReblockGVCF.wdl to run in Azure. cloud_provider is a new, required input. 
# 2.1.13 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl index 68408f30b9..f9a14011dc 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl @@ -6,7 +6,7 @@ import "../../../../../../tasks/broad/Utilities.wdl" as utils workflow ReblockGVCF { - String pipeline_version = "2.1.14" + String pipeline_version = "2.2.0" input { diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md index b0af35698e..7d4242d6fb 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md @@ -1,7 +1,7 @@ -# 3.1.22 +# 3.2.0 2024-07-09 (Date of Last Commit) -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. cloud_provider is a new, required input. * Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. 
# 3.1.21 diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl index 72183e30db..bc87cfd0cb 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl @@ -40,7 +40,7 @@ import "../../../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeGermlineSingleSample { - String pipeline_version = "3.1.22" + String pipeline_version = "3.2.0" input { diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md index 89e0aec3e3..b4eb529e7d 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md @@ -1,7 +1,7 @@ -# 2.1.20 +# 2.2.0 2024-07-09 (Date of Last Commit) -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. cloud_provider is a new, required input. 
# 2.1.19 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 98b9fb77ee..d352b628bc 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -9,7 +9,7 @@ import "../../../../../tasks/broad/DragenTasks.wdl" as DragenTasks workflow VariantCalling { - String pipeline_version = "2.1.20" + String pipeline_version = "2.2.0" input { diff --git a/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md b/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md index c368809b4e..f42b61ac28 100644 --- a/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md @@ -1,7 +1,7 @@ -# 3.1.21 +# 3.2.0 2024-07-09 (Date of Last Commit) -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. cloud_provider is a new, required input. # 3.1.20 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl index 49c768ba3c..b63e0501f5 100644 --- a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl @@ -7,7 +7,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow ExomeReprocessing { - String pipeline_version = "3.1.21" + String pipeline_version = "3.2.0" input { File? 
input_cram diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md index 2cf6161455..37e685e086 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md @@ -1,7 +1,7 @@ # 3.1.23 2024-07-09 (Date of Last Commit) -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. cloud_provider is a new, required input. # 3.1.22 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl index 84ff050377..49db5591dc 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl @@ -5,7 +5,7 @@ import "../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalExomeReprocessing { - String pipeline_version = "3.1.23" + String pipeline_version = "3.2.0" input { diff --git a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md index a276942d1d..6ad12f66c0 100644 --- a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md @@ -1,7 +1,7 @@ -# 2.1.23 +# 2.2.0 2024-07-09 (Date of Last Commit) -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. cloud_provider is a new, required input. 
# 2.1.22 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl index 35b77bc3ca..341be24f78 100644 --- a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalWholeGenomeReprocessing { - String pipeline_version = "2.1.23" + String pipeline_version = "2.2.0" input { File? input_cram diff --git a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md index 42da79d053..856a1a2f1c 100644 --- a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md @@ -1,7 +1,7 @@ -# 3.1.22 +# 3.2.0 2024-07-09 (Date of Last Commit) -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. cloud_provider is a new, required input. # 3.1.21 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl index e7f6d51614..a65e723ad3 100644 --- a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeReprocessing { - String pipeline_version = "3.1.22" + String pipeline_version = "3.2.0" input { File? 
input_cram diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index 8b26ecfc7f..2cac5dc595 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,5 +1,6 @@ -# 5.2.1 -* Updated the Multiome.wdl to run on Azure +# 5.3.0 + +* Updated the Multiome.wdl to run on Azure. cloud_provider is a new, required input. # 5.2.0 2024-07-09 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 274141a76a..9da6addf65 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "5.2.1" + String pipeline_version = "5.3.0" input { diff --git a/pipelines/skylab/multiome/atac.changelog.md b/pipelines/skylab/multiome/atac.changelog.md index ae2a9705bc..f44b5b2328 100644 --- a/pipelines/skylab/multiome/atac.changelog.md +++ b/pipelines/skylab/multiome/atac.changelog.md @@ -1,7 +1,7 @@ -# 2.1.1 +# 2.2.0 2024-07-11 (Date of Last Commit) -* Updated the atac.wdl to run on Azure +* Updated the atac.wdl to run on Azure. cloud_provider is a new, required input. 
# 2.1.0 2024-07-09 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index baa608552e..b54f91043b 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -46,7 +46,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "2.1.1" + String pipeline_version = "2.2.0" # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index 3d353e09c7..9dcb41f6bc 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,7 +1,7 @@ -# 7.3.1 +# 7.4.0 2024-07-11 (Date of Last Commit) -* Updated the Optimus.wdl to run on Azure +* Updated the Optimus.wdl to run on Azure. cloud_provider is a new, required input. * Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. # 7.3.0 diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 30a57be5db..43986ffb79 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -71,7 +71,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = "7.3.1" + String pipeline_version = "7.4.0" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index b75d12836b..d6eefd1ae3 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,7 +1,7 @@ -# 1.2.1 +# 1.3.0 2024-07-11 (Date of Last Commit) -* Updated the PairedTag.wdl to run on Azure +* Updated the PairedTag.wdl to run on Azure. cloud_provider is a new, required input. 
# 1.2.0 2024-07-09 (Date of Last Commit) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index cbf801e0ee..0ac9aeb8db 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow PairedTag { - String pipeline_version = "1.2.1" + String pipeline_version = "1.3.0" input { diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index 1ea407cea6..aed4dcd7a7 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,7 +1,7 @@ -# 3.1.9 +# 3.2.0 2024-07-11 (Date of Last Commit) -* Updated the Optimus.wdl to run on Azure. This change does not affect the SlideSeq pipeline. +* Updated the Optimus.wdl to run on Azure. cloud_provider is a new, required input for the SlideSeq pipeline. # 3.1.8 2024-07-09 (Date of Last Commit) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index a5aba56884..1de6ce6fdf 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.1.9" + String pipeline_version = "3.2.0" input { Array[File] r1_fastq diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index 3850f9db8b..4d00d91015 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -1,7 +1,7 @@ -# 1.3.6 +# 1.4.0 2024-07-11 (Date of Last Commit) -* Updated the PairedTag.wdl to run on Azure.
This change does not affect the MultiSampleSmartSeq2SingleNucleus pipeline. +* Updated the PairedTag.wdl to run on Azure. cloud_provider is a new, required input for the MultiSampleSmartSeq2SingleNucleus pipeline. * Added new optional input parameter of gex_nhash_id to the STARAlign task; this does not impact the MultiSampleSmartSeq2SingleNucleus workflow # 1.3.5 diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index 9007815574..61673ffcb5 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -57,7 +57,7 @@ workflow MultiSampleSmartSeq2SingleNucleus { } # Version of this pipeline - String pipeline_version = "1.3.6" + String pipeline_version = "1.4.0" if (false) { String? none = "None" From 2543a79702d0aabe05ba9ff77d7cffb9e4146dab Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 15 Jul 2024 10:45:40 -0400 Subject: [PATCH 180/186] pointing to pinned version of dockers or the sha --- .../germline/joint_genotyping/reblocking/ReblockGVCF.wdl | 2 +- .../single_sample/exome/ExomeGermlineSingleSample.wdl | 2 +- .../broad/dna_seq/germline/variant_calling/VariantCalling.wdl | 2 +- pipelines/skylab/optimus/Optimus.wdl | 4 ++-- pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- .../MultiSampleSmartSeq2SingleNucleus.wdl | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl index f9a14011dc..e64854bf30 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl @@ -27,7 +27,7 @@ workflow ReblockGVCF { String gvcf_basename = basename(gvcf,
gvcf_file_extension) # docker images String gatk_docker_gcp = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" + String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:1.0.0" String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure # make sure either gcp or azr is supplied as cloud_provider input diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl index 00769b467c..1197dcbdd2 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl @@ -69,7 +69,7 @@ workflow ExomeGermlineSingleSample { # docker images String gatk_docker_gcp = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" + String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:1.0.0" String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure # make sure either gcp or azr is supplied as cloud_provider input diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index d352b628bc..ba265a80ef 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -41,7 +41,7 @@ workflow VariantCalling { # docker images String gatk_docker_gcp = "us.gcr.io/broad-gatk/gatk:4.5.0.0" - String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:latest" + String gatk_docker_azure = "dsppipelinedev.azurecr.io/gatk_reduced_layers:1.0.0" String gatk_docker = if cloud_provider == "gcp" then gatk_docker_gcp else gatk_docker_azure String 
gatk_1_3_docker_gcp = "us.gcr.io/broad-gotc-prod/gatk:1.3.0-4.2.6.1-1649964384" diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 43986ffb79..0312b41b02 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -96,12 +96,12 @@ workflow Optimus { String star_merge_docker = "star-merge-npz:1.2" #TODO how do we handle these? - String alpine_docker = "alpine-bash:latest" + String alpine_docker = "alpine-bash@sha256:965a718a07c700a5204c77e391961edee37477634ce2f9cf652a8e4c2db858ff" String gcp_alpine_docker_prefix = "bashell/" String acr_alpine_docker_prefix = "dsppipelinedev.azurecr.io/" String alpine_docker_prefix = if cloud_provider == "gcp" then gcp_alpine_docker_prefix else acr_alpine_docker_prefix - String ubuntu_docker = "ubuntu_16_0_4:latest" + String ubuntu_docker = "ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/" String acr_ubuntu_docker_prefix = "dsppipelinedev.azurecr.io/" String ubuntu_docker_prefix = if cloud_provider == "gcp" then gcp_ubuntu_docker_prefix else acr_ubuntu_docker_prefix diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 1de6ce6fdf..409e3123b6 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -52,7 +52,7 @@ workflow SlideSeq { String warp_tools_docker_2_0_2 = "warp-tools:2.0.2-1709308985" String star_merge_docker = "star-merge-npz:1.2" - String ubuntu_docker = "ubuntu_16_0_4:latest" + String ubuntu_docker = "ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/" String acr_ubuntu_docker_prefix = "dsppipelinedev.azurecr.io/" String ubuntu_docker_prefix = if cloud_provider == "gcp" then gcp_ubuntu_docker_prefix else acr_ubuntu_docker_prefix diff --git 
a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index 61673ffcb5..068b35003d 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -43,7 +43,7 @@ workflow MultiSampleSmartSeq2SingleNucleus { String cloud_provider } - String ubuntu_docker = "ubuntu_16_0_4:latest" + String ubuntu_docker = "ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/" String acr_ubuntu_docker_prefix = "dsppipelinedev.azurecr.io/" String ubuntu_docker_prefix = if cloud_provider == "gcp" then gcp_ubuntu_docker_prefix else acr_ubuntu_docker_prefix From a002052f10c61b1f94d73edd0744da2c9980b788 Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 15 Jul 2024 10:49:05 -0400 Subject: [PATCH 181/186] extra space --- ...omeGermlineSingleSample.inputs.plumbing.masked_reference.json | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/input_files/WholeGenomeGermlineSingleSample.inputs.plumbing.masked_reference.json b/pipelines/broad/dna_seq/germline/single_sample/wgs/input_files/WholeGenomeGermlineSingleSample.inputs.plumbing.masked_reference.json index 309e93f9bd..871c5589de 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/input_files/WholeGenomeGermlineSingleSample.inputs.plumbing.masked_reference.json +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/input_files/WholeGenomeGermlineSingleSample.inputs.plumbing.masked_reference.json @@ -51,7 +51,6 @@ "WholeGenomeGermlineSingleSample.fingerprint_genotypes_index": "gs://broad-gotc-test-storage/single_sample/plumbing/bams/G96830.NA12878/G96830.NA12878.hg38.reference.fingerprint.vcf.gz.tbi", 
"WholeGenomeGermlineSingleSample.wgs_coverage_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_coverage_regions.hg38.interval_list", "WholeGenomeGermlineSingleSample.cloud_provider": "gcp", - "WholeGenomeGermlineSingleSample.papi_settings": { "preemptible_tries": 3, "agg_preemptible_tries": 3 From e3724ebd3010be4286a58cb26630452afd57fe27 Mon Sep 17 00:00:00 2001 From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Date: Mon, 15 Jul 2024 10:50:40 -0400 Subject: [PATCH 182/186] Apply suggestions from code review Co-authored-by: ekiernan <55763654+ekiernan@users.noreply.github.com> --- .../single_sample/exome/ExomeGermlineSingleSample.changelog.md | 2 +- .../ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md | 2 +- .../wgs/WholeGenomeGermlineSingleSample.changelog.md | 2 +- .../ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md | 2 +- .../genotyping/illumina/IlluminaGenotypingArray.changelog.md | 2 +- .../arrays/imputation/BroadInternalImputation.changelog.md | 2 +- .../internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md | 2 +- pipelines/broad/qc/CheckFingerprint.changelog.md | 2 +- pipelines/skylab/optimus/Optimus.changelog.md | 2 +- pipelines/skylab/snm3C/snm3C.changelog.md | 2 +- website/docs/Pipelines/Multiome_Pipeline/README.md | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md index daaa5acd40..45a45b81eb 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md @@ -1,7 +1,7 @@ # 3.1.21 2024-07-09 (Date of Last Commit) -* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. 
+* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers # 3.1.20 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md index d355ea7e04..88c0a43cbf 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md @@ -1,7 +1,7 @@ # 1.0.18 2024-07-09 (Date of Last Commit) -* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline # 1.0.17 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md index 7d4242d6fb..f8be38c09f 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md @@ -2,7 +2,7 @@ 2024-07-09 (Date of Last Commit) * Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. cloud_provider is a new, required input. -* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. 
+* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers; this change does not affect this pipeline # 3.1.21 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md index af72457ba8..21120b9c7e 100644 --- a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md +++ b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md @@ -1,7 +1,7 @@ # 1.0.18 2024-07-09 (Date of Last Commit) -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline # 1.0.17 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md index 0a006dc85a..7e775a6553 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md @@ -1,7 +1,7 @@ # 1.12.19 2024-07-09 (Date of Last Commit) -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. 
+* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers # 1.12.18 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md index 6d45d66333..4ff223caf2 100644 --- a/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md +++ b/pipelines/broad/internal/arrays/imputation/BroadInternalImputation.changelog.md @@ -1,7 +1,7 @@ # 1.1.11 2024-05-21 (Date of Last Commit) -* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers; this change does not affect this pipeline # 1.1.10 2023-12-18 (Date of Last Commit) diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md index 652e81bb01..407ff0c43d 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md @@ -1,6 +1,6 @@ # 1.0.31 2024-07-09 -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. 
+* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline # 1.0.30 2024-07-01 (Date of Last Commit) diff --git a/pipelines/broad/qc/CheckFingerprint.changelog.md b/pipelines/broad/qc/CheckFingerprint.changelog.md index f139f145ad..9f11431c50 100644 --- a/pipelines/broad/qc/CheckFingerprint.changelog.md +++ b/pipelines/broad/qc/CheckFingerprint.changelog.md @@ -1,7 +1,7 @@ # 1.0.18 2024-07-00 (Date of Last Commit) -* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline. +* Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers; this does not affect this pipeline # 1.0.17 2024-07-01 (Date of Last Commit) diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index 9dcb41f6bc..cdbfca2c42 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -2,7 +2,7 @@ 2024-07-11 (Date of Last Commit) * Updated the Optimus.wdl to run on Azure. cloud_provider is a new, required input. -* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. +* Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers # 7.3.0 2024-07-09 (Date of Last Commit) diff --git a/pipelines/skylab/snm3C/snm3C.changelog.md b/pipelines/skylab/snm3C/snm3C.changelog.md index 327b404dfb..8cf6455276 100644 --- a/pipelines/skylab/snm3C/snm3C.changelog.md +++ b/pipelines/skylab/snm3C/snm3C.changelog.md @@ -1,7 +1,7 @@ # 4.0.2 2024-07-09 (Date of Last Commit) -* Updated the snM3C wdl to run on Azure. This change does not affect the snM3C pipeline. 
+* Updated the snM3C wdl to run on Azure; this change does not affect the snM3C pipeline # 4.0.1 2024-06-26 (Date of Last Commit) diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index 51a820034d..bfe793457a 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -121,7 +121,7 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | | multimappers_PropUnique_matrix | `UniqueAndMult-PropUnique.mtx` | Optional output produced when `soloMultiMappers` is "PropUnique"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | gex_aligner_metrics | `.star_metrics.tar` | Text file containing per barcode metrics (`CellReads.stats`) produced by the GEX pipeline STARsolo aligner. | -| library_metrics | `_library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | +| library_metrics | `__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. 
| | mtx_files | `.mtx_files.tar` | TAR file with STARsolo matrix market files (barcodes.tsv, features.tsv, and matrix.mtx) | TAR | | cell_barcodes_csv | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information.| | checkpoint_file | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | From 7807103ab81d8be91dfdd95b306c0b28393c7ef8 Mon Sep 17 00:00:00 2001 From: npetrill Date: Mon, 15 Jul 2024 10:54:21 -0400 Subject: [PATCH 183/186] changelog --- .../external/exome/ExternalExomeReprocessing.changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md index 37e685e086..ea7abd045b 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md @@ -1,4 +1,4 @@ -# 3.1.23 +# 3.2.0 2024-07-09 (Date of Last Commit) * Updated tasks GermlineVariantDiscovery.wdl and QC.wdl to allow multi-cloud dockers. cloud_provider is a new, required input. 
From ad7d2a88fc21c6a3318c6ec90a804887317a80e6 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Mon, 15 Jul 2024 12:53:19 -0400 Subject: [PATCH 184/186] doc reformatting --- website/docs/Pipelines/ATAC/README.md | 4 ++-- website/docs/Pipelines/PairedTag_Pipeline/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 286ea6898a..06989e960f 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -44,8 +44,8 @@ ATAC can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/stable/ ## Input Variables The following describes the inputs of the ATAC workflow. For more details on how default inputs are set for the Multiome workflow, see the [Multiome overview](../Multiome_Pipeline/README). -| Variable name | Description | -| --- |-----------------------------------------------------------------------------------------------------------------| +| Variable name | Description | +| --- |--- | | read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). | | read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | | read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). 
| diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index 679bf97318..a203d53447 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -6,7 +6,7 @@ slug: /Pipelines/PairedTag_Pipeline/README # Paired-Tag Overview | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | -|:-------------------------------------------------------------------:| :---: | :----: | :--------------: | +|:---:| :---: | :---: | :---: | | [PairedTag_v1.0.1](https://github.com/broadinstitute/warp/releases) | June, 2024 | Kaylee Mathews | Please file GitHub issues in warp or contact [documentation authors](mailto:warp-pipelines-help@broadinstitute.org) | @@ -91,7 +91,7 @@ The Paired-Tag workflow inputs are specified in JSON configuration files. Exampl The Paired-Tag workflow calls two WARP subworkflows and an additional task which are described briefly in the table below. For more details on each subworkflow and task, see the documentation and WDL scripts linked in the table. | Subworkflow/Task | Software | Description | -| ----------- | -------- | ----------- | +| --- | --- | --- | | Optimus ([WDL](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/optimus/Optimus.wdl) and [documentation](../Optimus_Pipeline/README)) | fastqprocess, STARsolo, Emptydrops | Workflow used to analyze 10x single-cell GEX data. | | PairedTagDemultiplex as demultiplex ([WDL](https://github.com/broadinstitute/warp/blob/develop/tasks/skylab/PairedTagUtils.wdl)) | UPStools | Task used to check the length of the read2 FASTQ (should be either 27 or 24 bp). If `preindex` is set to true, the task will perform demultiplexing of the 3-bp sample barcode from the read2 ATAC fastq files and stores it in the readname. It will then perform barcode orientation checking. 
The ATAC workflow will then add a combined 3 bp sample barcode and cellular barcode to the BB tag of the BAM. If `preindex` is false and then length is 27 bp, the task will perform trimming and subsequent barcode orientation checking. | | ATAC ([WDL](https://github.com/broadinstitute/warp/blob/develop/pipelines/skylab/multiome/atac.wdl) and [documentation](../ATAC/README)) | fastqprocess, bwa-mem, SnapATAC2 | Workflow used to analyze single-nucleus paired-tag DNA (histone modifications) data. | From 3c4d9115bb937220f614d22ebdd8f3b37b466c6d Mon Sep 17 00:00:00 2001 From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Date: Mon, 15 Jul 2024 13:00:39 -0400 Subject: [PATCH 185/186] Update pipelines/broad/arrays/imputation/Imputation.changelog.md --- pipelines/broad/arrays/imputation/Imputation.changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/broad/arrays/imputation/Imputation.changelog.md b/pipelines/broad/arrays/imputation/Imputation.changelog.md index 02b32dc771..e798bc7671 100644 --- a/pipelines/broad/arrays/imputation/Imputation.changelog.md +++ b/pipelines/broad/arrays/imputation/Imputation.changelog.md @@ -1,5 +1,5 @@ # 1.1.13 -2023-05-21 (Date of Last Commit) +2024-05-21 (Date of Last Commit) * Updated GermlineVariantDiscovery, BamProcessing, DragenTasks, Qc, and Utilities tasks to allow multi-cloud dockers. This change does not affect this pipeline. 
From 0549763e85dea41aba941bdd44f9b2b63bc82b66 Mon Sep 17 00:00:00 2001 From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Date: Mon, 15 Jul 2024 13:07:05 -0400 Subject: [PATCH 186/186] Update pipelines/skylab/slideseq/SlideSeq.changelog.md Co-authored-by: ekiernan <55763654+ekiernan@users.noreply.github.com> --- pipelines/skylab/slideseq/SlideSeq.changelog.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index aed4dcd7a7..bdef191cc9 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,7 +1,7 @@ # 3.2.0 2024-07-11 (Date of Last Commit) -* Updated the Optimus.wdl to run on Azure. cloud_provider is a new, required input. +* Updated the Optimus.wdl to run on Azure; cloud_provider is a new, required input # 3.1.8 2024-07-09 (Date of Last Commit)