Skip to content

Commit

Permalink
Np make vm size an input to multiome (#1289)
Browse files Browse the repository at this point in the history
* add vm size as input to Multiome.wdl

* add vm size as input to Multiome.wdl

* add new input to overviews

* add new input to overviews

* add new input to overviews

* add new input to overviews
  • Loading branch information
nikellepetrillo authored Jun 18, 2024
1 parent bfc20a4 commit 08d6bbc
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 51 deletions.
7 changes: 5 additions & 2 deletions pipelines/skylab/multiome/Multiome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ workflow Multiome {
Array[File] atac_r1_fastq
Array[File] atac_r2_fastq
Array[File] atac_r3_fastq
# VM size used for several ATAC tasks
String vm_size = "Standard_M128s"
# BWA tar reference
File tar_bwa_reference
# Chromosome sizes
Expand Down Expand Up @@ -109,8 +111,9 @@ workflow Multiome {
chrom_sizes = chrom_sizes,
whitelist = atac_whitelist,
adapter_seq_read1 = adapter_seq_read1,
annotations_gtf = annotations_gtf,
adapter_seq_read3 = adapter_seq_read3
adapter_seq_read3 = adapter_seq_read3,
vm_size = vm_size,
annotations_gtf = annotations_gtf
}
call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes {
input:
Expand Down
17 changes: 12 additions & 5 deletions pipelines/skylab/multiome/atac.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ workflow ATAC {
Int num_threads_bwa = 128
Int mem_size_bwa = 512
String cpu_platform_bwa = "Intel Ice Lake"
String vm_size

# Text file containing chrom_sizes for genome build (e.g., hg38)
File chrom_sizes
Expand Down Expand Up @@ -80,7 +81,8 @@ workflow ATAC {
input:
nthreads = num_threads_bwa,
mem_size = mem_size_bwa,
cpu_platform = cpu_platform_bwa
cpu_platform = cpu_platform_bwa,
vm_size = vm_size
}

call FastqProcessing.FastqProcessATAC as SplitFastq {
Expand Down Expand Up @@ -116,7 +118,8 @@ workflow ATAC {
mem_size = mem_size_bwa,
cpu_platform = cpu_platform_bwa,
docker_path = docker_prefix + samtools_docker,
cloud_provider = cloud_provider
cloud_provider = cloud_provider,
vm_size = vm_size
}

if (preindex) {
Expand Down Expand Up @@ -166,12 +169,14 @@ task GetNumSplits {
Int mem_size
String cpu_platform
String docker_image = "ubuntu:latest"
String vm_size
}

parameter_meta {
docker_image: "the ubuntu docker image (default: ubuntu:latest)"
nthreads: "Number of threads per node (default: 128)"
mem_size: "the size of memory used during alignment"
vm_size: "the virtual machine used for the task"
}

command <<<
Expand Down Expand Up @@ -236,7 +241,7 @@ task GetNumSplits {
cpu: nthreads
cpuPlatform: cpu_platform
memory: "${mem_size} GiB"
vm_size: "Standard_M128s"
vm_size: vm_size
}

output {
Expand Down Expand Up @@ -327,7 +332,8 @@ task BWAPairedEndAlignment {
Int disk_size = 2000
Int nthreads
Int mem_size
String cpu_platform
String cpu_platform
String vm_size
}

parameter_meta {
Expand All @@ -342,6 +348,7 @@ task BWAPairedEndAlignment {
output_base_name: "basename to be used for the output of the task"
docker_path: "The docker image path containing the runtime environment for this task"
cloud_provider: "The cloud provider for the pipeline."
vm_size: "the virtual machine used for the task"
}

String bam_aligned_output_name = output_base_name + ".bam"
Expand Down Expand Up @@ -471,7 +478,7 @@ task BWAPairedEndAlignment {
cpu: nthreads
cpuPlatform: cpu_platform
memory: "${mem_size} GiB"
vm_size: "Standard_M128s"
vm_size: vm_size
}

output {
Expand Down
6 changes: 5 additions & 1 deletion pipelines/skylab/paired_tag/PairedTag.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ workflow PairedTag {
Array[File] atac_r1_fastq
Array[File] atac_r2_fastq
Array[File] atac_r3_fastq

String vm_size = "Standard_M128s"

# BWA input
File tar_bwa_reference
File chrom_sizes
Expand Down Expand Up @@ -119,7 +122,8 @@ workflow PairedTag {
adapter_seq_read3 = adapter_seq_read3,
annotations_gtf = annotations_gtf,
preindex = preindex,
cloud_provider = cloud_provider
cloud_provider = cloud_provider,
vm_size = vm_size
}

if (preindex) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,6 @@
"PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake",
"PairedTag.Atac_preindex.num_threads_bwa":"16",
"PairedTag.Atac_preindex.mem_size_bwa":"64",
"PairedTag.soloMultiMappers":"Uniform"
"PairedTag.soloMultiMappers":"Uniform",
"PairedTag.cloud_provider": "gcp"
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,6 @@
"PairedTag.Atac_preindex.cpu_platform_bwa":"Intel Cascade Lake",
"PairedTag.Atac_preindex.num_threads_bwa":"16",
"PairedTag.Atac_preindex.mem_size_bwa":"64",
"PairedTag.soloMultiMappers":"Uniform"
"PairedTag.soloMultiMappers":"Uniform",
"PairedTag.cloud_provider": "gcp"
}
33 changes: 17 additions & 16 deletions website/docs/Pipelines/ATAC/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,23 +44,24 @@ ATAC can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/stable/
## Input Variables
The following describes the inputs of the ATAC workflow. For more details on how default inputs are set for the Multiome workflow, see the [Multiome overview](../Multiome_Pipeline/README).

| Variable name | Description |
| --- | --- |
| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). |
| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). |
| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). |
| input_id | Output prefix/base name for all intermediate files and pipeline outputs. |
| Variable name | Description |
| --- |-----------------------------------------------------------------------------------------------------------------|
| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). |
| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). |
| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). |
| input_id | Output prefix/base name for all intermediate files and pipeline outputs. |
| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". |
| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. |
| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). |
| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). |
| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). |
| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). |
| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file.|
| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) |
| whitelist | Whitelist file for ATAC cellular barcodes. |
| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. |
| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. |
| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. |
| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). |
| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). |
| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). |
| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). |
| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file. |
| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (e.g., hg38). |
| whitelist | Whitelist file for ATAC cellular barcodes. |
| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. |
| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. |
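| vm_size | String defining the Azure virtual machine size used for the GetNumSplits and BWAPairedEndAlignment tasks; the ATAC workflow sets no default, while the Multiome and PairedTag workflows pass "Standard_M128s" by default. |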

## ATAC tasks and tools

Expand Down
52 changes: 27 additions & 25 deletions website/docs/Pipelines/Multiome_Pipeline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,32 +52,34 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta

## Inputs

| Input name | Description | Type |
| --- | --- | --- |
| input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String |
| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String |
| annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File |
| gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] |
| gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] |
| gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline. | Array[File] |
| tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline. | File |
| mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive). | File |
| counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna". | String |
| tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3". | Integer |
| emptydrops_lower | Optional threshold for UMIs for the Optimus (GEX) pipeline that empty drops tool should consider for determining cell; data below threshold is not removed; default is "100". | Integer |
| force_no_check | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should perform checks; default is "false". | Boolean |
| Input name | Description | Type |
| --- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| --- |
| input_id | Unique identifier describing the biological sample or replicate that corresponds with the FASTQ files; can be a human-readable name or UUID. | String |
| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String |
| annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File |
| gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] |
| gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library. | Array[File] |
| gex_i1_fastq | Optional array of index FASTQ files representing a single GEX 10x library; multiplexed samples are not currently supported, but the file may be passed to the pipeline. | Array[File] |
| tar_star_reference | TAR file containing a species-specific reference genome and GTF for Optimus (GEX) pipeline. | File |
| mt_genes | Optional file for the Optimus (GEX) pipeline containing mitochondrial gene names used for metric calculation; default assumes 'mt' prefix in GTF (case insensitive). | File |
| counting_mode | Optional string that determines whether the Optimus (GEX) pipeline should be run in single-cell mode (sc_rna) or single-nucleus mode (sn_rna); default is "sn_rna". | String |
| tenx_chemistry_version | Optional integer for the Optimus (GEX) pipeline specifying the 10x version chemistry the data was generated with; validated by examination of the first read 1 FASTQ file read structure; default is "3". | Integer |
| emptydrops_lower | Optional threshold for UMIs for the Optimus (GEX) pipeline that empty drops tool should consider for determining cell; data below threshold is not removed; default is "100". | Integer |
| force_no_check | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should perform checks; default is "false". | Boolean |
| ignore_r1_read_length | Optional boolean for the Optimus (GEX) pipeline indicating if the pipeline should ignore barcode chemistry check; if "true", the workflow will not ensure the `tenx_chemistry_version` input matches the chemistry in the read 1 FASTQ; default is "false". | Boolean |
| star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". | String |
| count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". | Boolean |
| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag. | String |
| atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] |
| atac_r2_fastq | Array of barcodes FASTQ files representing a single 10x multiome ATAC library. | Array[File] |
| atac_r3_fastq | Array of read 2 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] |
| tar_bwa_reference | TAR file containing the reference index files for BWA-mem alignment for the ATAC pipeline. | File |
| chrom_sizes | File containing the genome chromosome sizes; used to calculate ATAC fragment file metrics. | File |
| adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG". | String |
| adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG". | String |
| run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". | Boolean |
| star_strand_mode | Optional string for the Optimus (GEX) pipeline for performing STARsolo alignment on forward stranded, reverse stranded, or unstranded data; default is "Forward". | String |
| count_exons | Optional boolean for the Optimus (GEX) pipeline indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**; if "true" in sc_rna mode, the workflow will return an error; default is "false". | Boolean |
| soloMultiMappers | Optional string describing whether or not the Optimus (GEX) pipeline should run STARsolo with the `--soloMultiMappers` flag. | String |
| atac_r1_fastq | Array of read 1 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] |
| atac_r2_fastq | Array of barcode FASTQ files representing a single 10x multiome ATAC library. | Array[File] |
| atac_r3_fastq | Array of read 2 paired-end FASTQ files representing a single 10x multiome ATAC library. | Array[File] |
| tar_bwa_reference | TAR file containing the reference index files for BWA-mem alignment for the ATAC pipeline. | File |
| chrom_sizes | File containing the genome chromosome sizes; used to calculate ATAC fragment file metrics. | File |
| adapter_seq_read1 | Optional string describing the adapter sequence for ATAC read 1 paired-end reads to be used during adapter trimming with Cutadapt; default is "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG". | String |
| adapter_seq_read3 | Optional string describing the adapter sequence for ATAC read 2 paired-end reads to be used during adapter trimming with Cutadapt; default is "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG". | String |
| run_cellbender | Optional boolean used to determine if the Optimus (GEX) pipeline should run CellBender on the output gene expression h5ad file, `h5ad_output_file_gex`; default is "false". | Boolean |
| vm_size | Optional string defining the Azure virtual machine size used for several ATAC tasks (default: "Standard_M128s"). | String |


#### Sample inputs for analyses in a Terra Workspace

Expand Down

0 comments on commit 08d6bbc

Please sign in to comment.