From 08d6bbce9f4d46d679851498b4ed1b249d4d0c00 Mon Sep 17 00:00:00 2001
From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com>
Date: Tue, 18 Jun 2024 09:22:12 -0400
Subject: [PATCH] Np make vm size an input to multiome (#1289)

* add vm size as input to Multiome.wdl

* add vm size as input to Multiome.wdl

* add new input to overviews

* add new input to overviews

* add new input to overviews

* add new input to overviews
---
 pipelines/skylab/multiome/Multiome.wdl        |  7 ++-
 pipelines/skylab/multiome/atac.wdl            | 17 ++++--
 pipelines/skylab/paired_tag/PairedTag.wdl     |  6 ++-
 .../Plumbing/BC011_BC015_downsampled.json     |  3 +-
 .../Plumbing/BI015_downsampled.json           |  3 +-
 website/docs/Pipelines/ATAC/README.md         | 33 ++++++------
 .../Pipelines/Multiome_Pipeline/README.md     | 52 ++++++++++---------
 7 files changed, 70 insertions(+), 51 deletions(-)

diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl
index 3c3b7d222b..1a150ea9ea 100644
--- a/pipelines/skylab/multiome/Multiome.wdl
+++ b/pipelines/skylab/multiome/Multiome.wdl
@@ -35,6 +35,8 @@ workflow Multiome {
         Array[File] atac_r1_fastq
         Array[File] atac_r2_fastq
         Array[File] atac_r3_fastq
+        # VM size used for several ATAC tasks
+        String vm_size = "Standard_M128s"
         # BWA tar reference
         File tar_bwa_reference
         # Chromosone sizes 
@@ -109,8 +111,9 @@ workflow Multiome {
             chrom_sizes = chrom_sizes,
             whitelist = atac_whitelist,
             adapter_seq_read1 = adapter_seq_read1,
-            annotations_gtf = annotations_gtf,
-            adapter_seq_read3 = adapter_seq_read3
+            adapter_seq_read3 = adapter_seq_read3,
+            vm_size = vm_size,
+            annotations_gtf = annotations_gtf
     }
     call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes {
         input:
diff --git a/pipelines/skylab/multiome/atac.wdl b/pipelines/skylab/multiome/atac.wdl
index 3c2f420c20..061e9c892b 100644
--- a/pipelines/skylab/multiome/atac.wdl
+++ b/pipelines/skylab/multiome/atac.wdl
@@ -30,6 +30,7 @@ workflow ATAC {
     Int num_threads_bwa = 128
     Int mem_size_bwa = 512
     String cpu_platform_bwa = "Intel Ice Lake"
+    String vm_size
 
     # Text file containing chrom_sizes for genome build (i.e. hg38)
     File chrom_sizes
@@ -80,7 +81,8 @@ workflow ATAC {
     input:
        nthreads = num_threads_bwa, 
        mem_size = mem_size_bwa,
-       cpu_platform = cpu_platform_bwa
+       cpu_platform = cpu_platform_bwa,
+       vm_size = vm_size
   }
 
   call FastqProcessing.FastqProcessATAC as SplitFastq {
@@ -116,7 +118,8 @@ workflow ATAC {
         mem_size = mem_size_bwa,
         cpu_platform = cpu_platform_bwa,
         docker_path = docker_prefix + samtools_docker,
-        cloud_provider = cloud_provider
+        cloud_provider = cloud_provider,
+        vm_size = vm_size
   }
 
   if (preindex) {
@@ -166,12 +169,14 @@ task GetNumSplits {
     Int mem_size
     String cpu_platform 
     String docker_image = "ubuntu:latest"
+    String vm_size
   }
 
   parameter_meta {
     docker_image: "the ubuntu docker image (default: ubuntu:latest)"
     nthreads: "Number of threads per node (default: 128)"
     mem_size: "the size of memory used during alignment"
+    vm_size: "the virtual machine used for the task"
   }
 
   command <<<
@@ -236,7 +241,7 @@ task GetNumSplits {
     cpu: nthreads
     cpuPlatform: cpu_platform
     memory: "${mem_size} GiB"
-    vm_size: "Standard_M128s"
+    vm_size: vm_size
   }
 
   output {
@@ -327,7 +332,8 @@ task BWAPairedEndAlignment {
     Int disk_size = 2000
     Int nthreads
     Int mem_size
-    String cpu_platform 
+    String cpu_platform
+    String vm_size
   }
 
   parameter_meta {
@@ -342,6 +348,7 @@ task BWAPairedEndAlignment {
     output_base_name: "basename to be used for the output of the task"
     docker_path: "The docker image path containing the runtime environment for this task"
     cloud_provider: "The cloud provider for the pipeline."
+    vm_size: "the virtual machine used for the task"
   }
 
   String bam_aligned_output_name = output_base_name + ".bam"
@@ -471,7 +478,7 @@ task BWAPairedEndAlignment {
     cpu: nthreads
     cpuPlatform: cpu_platform
     memory: "${mem_size} GiB"
-    vm_size: "Standard_M128s"
+    vm_size: vm_size
   }
 
   output {
diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl
index bc19f65160..ce1eb08599 100644
--- a/pipelines/skylab/paired_tag/PairedTag.wdl
+++ b/pipelines/skylab/paired_tag/PairedTag.wdl
@@ -34,6 +34,9 @@ workflow PairedTag {
         Array[File] atac_r1_fastq
         Array[File] atac_r2_fastq
         Array[File] atac_r3_fastq
+
+        String vm_size = "Standard_M128s"
+
         # BWA input
         File tar_bwa_reference
         File chrom_sizes
@@ -119,7 +122,8 @@ workflow PairedTag {
             adapter_seq_read3 = adapter_seq_read3,
             annotations_gtf = annotations_gtf,
             preindex = preindex,
-            cloud_provider = cloud_provider
+            cloud_provider = cloud_provider,
+            vm_size = vm_size
     }
 
     if (preindex) {
diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json
index 9e7b18b679..470b1ce33c 100644
--- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json
+++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json
@@ -23,5 +23,6 @@
   "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake",
   "PairedTag.Atac_preindex.num_threads_bwa":"16",
   "PairedTag.Atac_preindex.mem_size_bwa":"64", 
-  "PairedTag.soloMultiMappers":"Uniform"
+  "PairedTag.soloMultiMappers":"Uniform",
+  "PairedTag.cloud_provider": "gcp"
 }
diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json
index 2bdd7a8fe2..67560d3aee 100644
--- a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json
+++ b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json
@@ -23,5 +23,6 @@
   "PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake",
   "PairedTag.Atac_preindex.num_threads_bwa":"16",
   "PairedTag.Atac_preindex.mem_size_bwa":"64", 
-  "PairedTag.soloMultiMappers":"Uniform"
+  "PairedTag.soloMultiMappers":"Uniform",
+  "PairedTag.cloud_provider": "gcp"
 }
diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md
index 76033520f8..d95d82a440 100644
--- a/website/docs/Pipelines/ATAC/README.md
+++ b/website/docs/Pipelines/ATAC/README.md
@@ -44,23 +44,24 @@ ATAC can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/stable/
 ## Input Variables
 The following describes the inputs of the ATAC workflow. For more details on how default inputs are set for the Multiome workflow, see the [Multiome overview](../Multiome_Pipeline/README).
 
-| Variable name | Description |
-| --- | --- |
-| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). |
-| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). |
-| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). |
-| input_id | Output prefix/base name for all intermediate files and pipeline outputs. |
+| Variable name | Description                                                                                                     |
+| --- |-----------------------------------------------------------------------------------------------------------------|
+| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files).                                                          |
+| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes).                             |
+| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files).                                                          |
+| input_id | Output prefix/base name for all intermediate files and pipeline outputs.                                        |
 | cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String |
-| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | 
-| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). |
-| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). |
-| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). |
-| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). |
-| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file.|
-| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) |
-| whitelist | Whitelist file for ATAC cellular barcodes. |
-| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. |
-| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. |
+| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false.                | 
+| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files).                                    |
+| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128).            |
+| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512).                  |
+| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake").           |
+| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file.              |
+| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38)               |
+| whitelist | Whitelist file for ATAC cellular barcodes.                                                                      |
+| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq.                                                          |
+| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq.                                                          |
+| vm_size | String defining the Azure virtual machine size for the GetNumSplits and BWAPairedEndAlignment tasks; the Multiome and PairedTag workflows pass "Standard_M128s" by default. |
 
 ## ATAC tasks and tools
 
diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md
index ecdbea40b1..4d77ad4dfe 100644
--- a/website/docs/Pipelines/Multiome_Pipeline/README.md
+++ b/website/docs/Pipelines/Multiome_Pipeline/README.md
@@ -52,32 +52,34 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta
 
 ## Inputs
 
-| Input name | Description | Type |
-| --- | --- | --- |
-| input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String |
-| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String |
-| annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File |
-| gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] |
-| gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] |
-| gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline. | Array[File] |
-| tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline. | File | 
-| mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive). | File |
-| counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna". | String |
-| tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3". | Integer |
-| emptydrops_lower | Optional threshold for UMIs for the Optimus (GEX) pipeline that empty drops tool should consider for determining cell; data below threshold is not removed; default is "100". | Integer |
-| force_no_check | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should perform checks; default is "false". | Boolean |
+| Input name | Description                                                                                                                                                                                                                                                | Type |
+| --- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| --- |
+| input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID.                                                                                                               | String |
+| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure".                                                                                                                                            | String |
+| annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner.                                                                                                    | File |
+| gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library.                                                                                                                                                                                         | Array[File] |
+| gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.                                                                                                                                                                                         | Array[File] |
+| gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline.                                                                                    | Array[File] |
+| tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline.                                                                                                                                                                | File | 
+| mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive).                                                                                       | File |
+| counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna".                                                                                        | String |
+| tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3".                                                  | Integer |
+| emptydrops_lower | Optional threshold for UMIs for the Optimus (GEX) pipeline that empty drops tool should consider for determining cell; data below threshold is not removed; default is "100".                                                                              | Integer |
+| force_no_check | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should perform checks; default is "false".                                                                                                                                      | Boolean |
 | ignore_r1_read_length | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should ignore barcode chemistry check; if "true", the workflow will not ensure the `10x_chemistry_version` input matches the chemistry in the read 1 FASTQ; default is "false". | Boolean |
-| star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". | String |
-| count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". | Boolean |
-| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag. | String |
-| atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] |
-| atac_r2_fastq | Array of barcodes FASTQ files representing a single 10x multiome ATAC library. | Array[File] |
-| atac_r3_fastq | Array of read 2 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] |
-| tar_bwa_reference | TAR file containing the reference index files for BWA-mem alignment for the ATAC pipeline. | File | 
-| chrom_sizes | File containing the genome chromosome sizes; used to calculate ATAC fragment file metrics. | File |
-| adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG". | String |
-| adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG". | String |
-| run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". | Boolean |
+| star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward".                                                                                          | String |
+| count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false".                         | Boolean |
+| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag.                                                                                                                               | String |
+| atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library.                                                                                                                                                                    | Array[File] |
+| atac_r2_fastq | Array of barcodes FASTQ files representing a single 10x multiome ATAC library.                                                                                                                                                                             | Array[File] |
+| atac_r3_fastq | Array of read 2 paired-end FASTQ files representing a single 10x multiome ATAC library.                                                                                                                                                                    | Array[File] |
+| tar_bwa_reference | TAR file containing the reference index files for BWA-mem alignment for the ATAC pipeline.                                                                                                                                                                 | File | 
+| chrom_sizes | File containing the genome chromosome sizes; used to calculate ATAC fragment file metrics.                                                                                                                                                                 | File |
+| adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG".                                                                        | String |
+| adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG".                                                                         | String |
+| run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false".                                                                                | Boolean |
+| vm_size | Optional string defining the Azure virtual machine size used for several ATAC tasks; default is "Standard_M128s".                                                                                                                                          | String |
+
 
 #### Sample inputs for analyses in a Terra Workspace