From 08d6bbce9f4d46d679851498b4ed1b249d4d0c00 Mon Sep 17 00:00:00 2001 From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Date: Tue, 18 Jun 2024 09:22:12 -0400 Subject: [PATCH] Np make vm size an input to multiome (#1289) * add vm size as input to Multiome.wdl * add vm size as input to Multiome.wdl * add new input to overviews * add new input to overviews * add new input to overviews * add new input to overviews --- pipelines/skylab/multiome/Multiome.wdl | 7 ++- pipelines/skylab/multiome/atac.wdl | 17 ++++-- pipelines/skylab/paired_tag/PairedTag.wdl | 6 ++- .../Plumbing/BC011_BC015_downsampled.json | 3 +- .../Plumbing/BI015_downsampled.json | 3 +- website/docs/Pipelines/ATAC/README.md | 33 ++++++------ .../Pipelines/Multiome_Pipeline/README.md | 52 ++++++++++--------- 7 files changed, 70 insertions(+), 51 deletions(-) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 3c3b7d222b..1a150ea9ea 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -35,6 +35,8 @@ workflow Multiome { Array[File] atac_r1_fastq Array[File] atac_r2_fastq Array[File] atac_r3_fastq + # VM size used for several ATAC tasks + String vm_size = "Standard_M128s" # BWA tar reference File tar_bwa_reference # Chromosone sizes @@ -109,8 +111,9 @@ workflow Multiome { chrom_sizes = chrom_sizes, whitelist = atac_whitelist, adapter_seq_read1 = adapter_seq_read1, - annotations_gtf = annotations_gtf, - adapter_seq_read3 = adapter_seq_read3 + adapter_seq_read3 = adapter_seq_read3, + vm_size = vm_size, + annotations_gtf = annotations_gtf } call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes { input: diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl index 3c2f420c20..061e9c892b 100644 --- a/pipelines/skylab/multiome/atac.wdl +++ b/pipelines/skylab/multiome/atac.wdl @@ -30,6 +30,7 @@ workflow ATAC { Int num_threads_bwa = 128 Int mem_size_bwa = 512 String 
cpu_platform_bwa = "Intel Ice Lake" + String vm_size # Text file containing chrom_sizes for genome build (i.e. hg38) File chrom_sizes @@ -80,7 +81,8 @@ workflow ATAC { input: nthreads = num_threads_bwa, mem_size = mem_size_bwa, - cpu_platform = cpu_platform_bwa + cpu_platform = cpu_platform_bwa, + vm_size = vm_size } call FastqProcessing.FastqProcessATAC as SplitFastq { @@ -116,7 +118,8 @@ workflow ATAC { mem_size = mem_size_bwa, cpu_platform = cpu_platform_bwa, docker_path = docker_prefix + samtools_docker, - cloud_provider = cloud_provider + cloud_provider = cloud_provider, + vm_size = vm_size } if (preindex) { @@ -166,12 +169,14 @@ task GetNumSplits { Int mem_size String cpu_platform String docker_image = "ubuntu:latest" + String vm_size } parameter_meta { docker_image: "the ubuntu docker image (default: ubuntu:latest)" nthreads: "Number of threads per node (default: 128)" mem_size: "the size of memory used during alignment" + vm_size: "the virtual machine used for the task" } command <<< @@ -236,7 +241,7 @@ task GetNumSplits { cpu: nthreads cpuPlatform: cpu_platform memory: "${mem_size} GiB" - vm_size: "Standard_M128s" + vm_size: vm_size } output { @@ -327,7 +332,8 @@ task BWAPairedEndAlignment { Int disk_size = 2000 Int nthreads Int mem_size - String cpu_platform + String cpu_platform + String vm_size } parameter_meta { @@ -342,6 +348,7 @@ task BWAPairedEndAlignment { output_base_name: "basename to be used for the output of the task" docker_path: "The docker image path containing the runtime environment for this task" cloud_provider: "The cloud provider for the pipeline." 
+ vm_size: "the virtual machine used for the task" } String bam_aligned_output_name = output_base_name + ".bam" @@ -471,7 +478,7 @@ task BWAPairedEndAlignment { cpu: nthreads cpuPlatform: cpu_platform memory: "${mem_size} GiB" - vm_size: "Standard_M128s" + vm_size: vm_size } output { diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index bc19f65160..ce1eb08599 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -34,6 +34,9 @@ workflow PairedTag { Array[File] atac_r1_fastq Array[File] atac_r2_fastq Array[File] atac_r3_fastq + + String vm_size = "Standard_M128s" + # BWA input File tar_bwa_reference File chrom_sizes @@ -119,7 +122,8 @@ workflow PairedTag { adapter_seq_read3 = adapter_seq_read3, annotations_gtf = annotations_gtf, preindex = preindex, - cloud_provider = cloud_provider + cloud_provider = cloud_provider, + vm_size = vm_size } if (preindex) { diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json index 9e7b18b679..470b1ce33c 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json @@ -23,5 +23,6 @@ "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake", "PairedTag.Atac_preindex.num_threads_bwa":"16", "PairedTag.Atac_preindex.mem_size_bwa":"64", - "PairedTag.soloMultiMappers":"Uniform" + "PairedTag.soloMultiMappers":"Uniform", + "PairedTag.cloud_provider": "gcp" } diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json index 2bdd7a8fe2..67560d3aee 100644 --- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json +++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json @@ -23,5 
+23,6 @@ "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake", "PairedTag.Atac_preindex.num_threads_bwa":"16", "PairedTag.Atac_preindex.mem_size_bwa":"64", - "PairedTag.soloMultiMappers":"Uniform" + "PairedTag.soloMultiMappers":"Uniform", + "PairedTag.cloud_provider": "gcp" } diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 76033520f8..d95d82a440 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -44,23 +44,24 @@ ATAC can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/stable/ ## Input Variables The following describes the inputs of the ATAC workflow. For more details on how default inputs are set for the Multiome workflow, see the [Multiome overview](../Multiome_Pipeline/README). -| Variable name | Description | -| --- | --- | -| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). | -| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | -| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | -| input_id | Output prefix/base name for all intermediate files and pipeline outputs. | +| Variable name | Description | +| --- |-----------------------------------------------------------------------------------------------------------------| +| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). | +| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | +| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | +| input_id | Output prefix/base name for all intermediate files and pipeline outputs. | | cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | -| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. 
| -| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). | -| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). | -| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). | -| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). | -| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file.| -| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) | -| whitelist | Whitelist file for ATAC cellular barcodes. | -| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. | -| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. | +| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | +| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). | +| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). | +| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). | +| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). | +| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file. | +| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) | +| whitelist | Whitelist file for ATAC cellular barcodes. | +| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. | +| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. 
| +| vm_size | String defining the Azure virtual machine size used by the GetNumSplits and BWAPairedEndAlignment tasks; required by ATAC (the Multiome and Paired-Tag workflows pass "Standard_M128s" by default). | ## ATAC tasks and tools diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index ecdbea40b1..4d77ad4dfe 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -52,32 +52,34 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta ## Inputs -| Input name | Description | Type | -| --- | --- | --- | -| input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String | -| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | -| annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File | -| gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] | -| gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] | -| gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline. | Array[File] | -| tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline. | File | -| mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive). | File | -| counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna".
| String | -| tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3". | Integer | -| emptydrops_lower | Optional threshold for UMIs for the Optimus (GEX) pipeline that empty drops tool should consider for determining cell; data below threshold is not removed; default is "100". | Integer | -| force_no_check | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should perform checks; default is "false". | Boolean | +| Input name | Description | Type | +| --- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| --- | +| input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String | +| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | +| annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File | +| gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] | +| gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library. | Array[File] | +| gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline. | Array[File] | +| tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline. 
| File | +| mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive). | File | +| counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna". | String | +| tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3". | Integer | +| emptydrops_lower | Optional threshold for UMIs for the Optimus (GEX) pipeline that empty drops tool should consider for determining cell; data below threshold is not removed; default is "100". | Integer | +| force_no_check | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should perform checks; default is "false". | Boolean | | ignore_r1_read_length | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should ignore barcode chemistry check; if "true", the workflow will not ensure the `10x_chemistry_version` input matches the chemistry in the read 1 FASTQ; default is "false". | Boolean | -| star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". | String | -| count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". | Boolean | -| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag. | String | -| atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library. 
| Array[File] | -| atac_r2_fastq | Array of barcodes FASTQ files representing a single 10x multiome ATAC library. | Array[File] | -| atac_r3_fastq | Array of read 2 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] | -| tar_bwa_reference | TAR file containing the reference index files for BWA-mem alignment for the ATAC pipeline. | File | -| chrom_sizes | File containing the genome chromosome sizes; used to calculate ATAC fragment file metrics. | File | -| adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG". | String | -| adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG". | String | -| run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". | Boolean | +| star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". | String | +| count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". | Boolean | +| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag. | String | +| atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] | +| atac_r2_fastq | Array of barcodes FASTQ files representing a single 10x multiome ATAC library. 
| Array[File] | +| atac_r3_fastq | Array of read 2 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] | +| tar_bwa_reference | TAR file containing the reference index files for BWA-mem alignment for the ATAC pipeline. | File | +| chrom_sizes | File containing the genome chromosome sizes; used to calculate ATAC fragment file metrics. | File | +| adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG". | String | +| adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG". | String | +| run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". | Boolean | +| vm_size | Optional string defining the Azure virtual machine size used for several ATAC tasks; default is "Standard_M128s". | String | + #### Sample inputs for analyses in a Terra Workspace