Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ph pd 2514 multiome on terra #1223

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 31 additions & 5 deletions pipelines/skylab/multiome/Multiome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ import "../../../pipelines/skylab/multiome/atac.wdl" as atac
import "../../../pipelines/skylab/optimus/Optimus.wdl" as optimus
import "../../../tasks/skylab/H5adUtils.wdl" as H5adUtils
import "https://raw.githubusercontent.com/broadinstitute/CellBender/v0.3.0/wdl/cellbender_remove_background.wdl" as CellBender
import "../../../tasks/broad/Utilities.wdl" as utils

workflow Multiome {
String pipeline_version = "3.2.0"
nikellepetrillo marked this conversation as resolved.
Show resolved Hide resolved

input {
String cloud_provider
String input_id

# Optimus Inputs
Expand All @@ -25,33 +27,56 @@ workflow Multiome {
Boolean ignore_r1_read_length = false
String star_strand_mode = "Forward"
Boolean count_exons = false
File gex_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt"
String? soloMultiMappers

# ATAC inputs
# Array of input fastq files
Array[File] atac_r1_fastq
Array[File] atac_r2_fastq
Array[File] atac_r3_fastq

# BWA tar reference
File tar_bwa_reference
# Chromosome sizes
File chrom_sizes
# TrimAdapters input
String adapter_seq_read1 = "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG"
String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
# Whitelist
File atac_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt"

# CellBender
Boolean run_cellbender = false

}

# Determine docker prefix based on cloud provider
String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
String acr_docker_prefix = "dsppipelinedev.azurecr.io/"
String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix

# Define docker images
String snap_atac_docker_image = "snapatac2:1.0.5-2.3.2-1709230223"

nikellepetrillo marked this conversation as resolved.
Show resolved Hide resolved
# Define all whitelist files
File gcp_gex_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_gex.txt"
File gcp_atac_whitelist = "gs://gcp-public-data--broad-references/RNA/resources/arc-v1/737K-arc-v1_atac.txt"
File azure_gex_whitelist = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_gex.txt"
File azure_atac_whitelist = "https://datasetpublicbroadref.blob.core.windows.net/dataset/RNA/resources/arc-v1/737K-arc-v1_atac.txt"

# Determine which whitelist files to use based on cloud provider
File gex_whitelist = if cloud_provider == "gcp" then gcp_gex_whitelist else azure_gex_whitelist
File atac_whitelist = if cloud_provider == "gcp" then gcp_atac_whitelist else azure_atac_whitelist

# Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
call utils.ErrorWithMessage as ErrorMessageIncorrectInput {
input:
message = "cloud_provider must be supplied with either 'gcp' or 'azure'."
}
}

# Call the Optimus workflow
call optimus.Optimus as Optimus {
input:
cloud_provider = cloud_provider,
counting_mode = counting_mode,
r1_fastq = gex_r1_fastq,
r2_fastq = gex_r2_fastq,
Expand All @@ -74,6 +99,7 @@ workflow Multiome {
# Call the ATAC workflow
call atac.ATAC as Atac {
input:
cloud_provider = cloud_provider,
read1_fastq_gzipped = atac_r1_fastq,
read2_fastq_gzipped = atac_r2_fastq,
read3_fastq_gzipped = atac_r3_fastq,
Expand All @@ -87,6 +113,7 @@ workflow Multiome {
}
call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes {
input:
docker_path = docker_prefix + snap_atac_docker_image,
atac_h5ad = Atac.snap_metrics,
gex_h5ad = Optimus.h5ad_output_file,
gex_whitelist = gex_whitelist,
Expand All @@ -108,7 +135,6 @@ workflow Multiome {
hardware_preemptible_tries = 2,
hardware_zones = "us-central1-a us-central1-c",
nvidia_driver_version = "470.82.01"

}
}

Expand Down
1 change: 1 addition & 0 deletions pipelines/skylab/multiome/atac.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"ATAC.TrimAdapters.adapter_seq_read1": "GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG",
"ATAC.TrimAdapters.adapter_seq_read2": "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG",
"ATAC.input_id": "scATAC",
"ATAC.cloud_provider":"gcp",
"ATAC.tar_bwa_reference": "gs://fc-dd55e131-ef49-4d02-aa2a-20640daaae1e/submissions/8f0dd71a-b42f-4503-b839-3f146941758a/IndexRef/53a91851-1f6c-4ab9-af66-b338ffb28b5a/call-BwaMem2Index/GRCh38.primary_assembly.genome.bwamem2.fa.tar",
"ATAC.preindex": "false"
}
57 changes: 43 additions & 14 deletions pipelines/skylab/multiome/atac.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ version 1.0
import "../../../tasks/skylab/MergeSortBam.wdl" as Merge
import "../../../tasks/skylab/FastqProcessing.wdl" as FastqProcessing
import "../../../tasks/skylab/PairedTagUtils.wdl" as AddBB
"../../../tasks/broad/Utilities.wdl" as utils
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"../../../tasks/broad/Utilities.wdl" as utils
import "../../../tasks/broad/Utilities.wdl" as utils


workflow ATAC {
meta {
Expand All @@ -18,6 +19,7 @@ workflow ATAC {

# Output prefix/base name for all intermediate files and pipeline outputs
String input_id
String cloud_provider

# Option for running files with preindex
Boolean preindex = false
Expand All @@ -43,6 +45,26 @@ workflow ATAC {

String pipeline_version = "1.1.8"

# Determine docker prefix based on cloud provider
String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
String acr_docker_prefix = "dsppipelinedev.azurecr.io/"
String docker_prefix = if cloud_provider == "gcp" then gcr_docker_prefix else acr_docker_prefix

# Docker image names
String warp_tools_2_0_0 = "warp-tools:2.0.0"
String cutadapt_docker = "cutadapt:1.0.0-4.4-1709146458"
String samtools_docker = "samtools-dist-bwa:3.0.0"
String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311"
String snap_atac_docker = "snapatac2:1.0.4-2.3.1"

# Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
call utils.ErrorWithMessage as ErrorMessageIncorrectInput {
input:
message = "cloud_provider must be supplied with either 'gcp' or 'azure'."
}
}

parameter_meta {
read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads"
read2_fastq_gzipped: "read 2 FASTQ file as input for the pipeline, contains the cellular barcodes corresponding to the reads in the read1 FASTQ and read 3 FASTQ"
Expand All @@ -52,7 +74,6 @@ workflow ATAC {
num_threads_bwa: "Number of threads for bwa-mem2 task (default: 128)"
mem_size_bwa: "Memory size in GB for bwa-mem2 task (default: 512)"
cpu_platform_bwa: "CPU platform for bwa-mem2 task (default: Intel Ice Lake)"

}

call GetNumSplits {
Expand All @@ -69,7 +90,8 @@ workflow ATAC {
barcodes_fastq = read2_fastq_gzipped,
output_base_name = input_id,
num_output_files = GetNumSplits.ranks_per_node_out,
whitelist = whitelist
whitelist = whitelist,
docker_path = docker_prefix + warp_tools_2_0_0
}

scatter(idx in range(length(SplitFastq.fastq_R1_output_array))) {
Expand All @@ -79,7 +101,8 @@ workflow ATAC {
read3_fastq = SplitFastq.fastq_R3_output_array[idx],
output_base_name = input_id + "_" + idx,
adapter_seq_read1 = adapter_seq_read1,
adapter_seq_read3 = adapter_seq_read3
adapter_seq_read3 = adapter_seq_read3,
docker_path = docker_prefix + cutadapt_docker
}
}

Expand All @@ -91,21 +114,24 @@ workflow ATAC {
output_base_name = input_id,
nthreads = num_threads_bwa,
mem_size = mem_size_bwa,
cpu_platform = cpu_platform_bwa
cpu_platform = cpu_platform_bwa,
docker_path = docker_prefix + samtools_docker
}

if (preindex) {
call AddBB.AddBBTag as BBTag {
input:
bam = BWAPairedEndAlignment.bam_aligned_output,
input_id = input_id
input_id = input_id,
docker_path = docker_prefix + upstools_docker
}
call CreateFragmentFile as BB_fragment {
input:
bam = BBTag.bb_bam,
chrom_sizes = chrom_sizes,
annotations_gtf = annotations_gtf,
preindex = preindex
preindex = preindex,
docker_path = docker_prefix + snap_atac_docker
}
}
if (!preindex) {
Expand All @@ -114,7 +140,8 @@ workflow ATAC {
bam = BWAPairedEndAlignment.bam_aligned_output,
chrom_sizes = chrom_sizes,
annotations_gtf = annotations_gtf,
preindex = preindex
preindex = preindex,
docker_path = docker_prefix + snap_atac_docker

}
}
Expand Down Expand Up @@ -231,7 +258,7 @@ task TrimAdapters {
# Runtime attributes/docker
Int disk_size = ceil(2 * ( size(read1_fastq, "GiB") + size(read3_fastq, "GiB") )) + 200
Int mem_size = 4
String docker_image = "us.gcr.io/broad-gotc-prod/cutadapt:1.0.0-4.4-1686752919"
String docker_path
}

parameter_meta {
Expand All @@ -242,7 +269,7 @@ task TrimAdapters {
adapter_seq_read1: "cutadapt option for the sequence adapter for read 1 fastq"
adapter_seq_read3: "cutadapt option for the sequence adapter for read 3 fastq"
output_base_name: "base name to be used for the output of the task"
docker_image: "the docker image using cutadapt to be used (default:us.gcr.io/broad-gotc-prod/cutadapt:1.0.0-4.4-1686752919)"
docker_path: "The docker image path containing the runtime environment for this task"
mem_size: "the size of memory used during trimming adapters"
disk_size : "disk size used in trimming adapters step"
}
Expand All @@ -269,7 +296,7 @@ task TrimAdapters {

# use docker image for given tool cutadapat
runtime {
docker: docker_image
docker: docker_path
disks: "local-disk ${disk_size} HDD"
memory: "${mem_size} GiB"
}
Expand All @@ -290,7 +317,7 @@ task BWAPairedEndAlignment {
String read_group_sample_name = "RGSN1"
String suffix = "trimmed_adapters.fastq.gz"
String output_base_name
String docker_image = "us.gcr.io/broad-gotc-prod/samtools-dist-bwa:2.0.0"
String docker_path

# Runtime attributes
Int disk_size = 2000
Expand All @@ -309,7 +336,7 @@ task BWAPairedEndAlignment {
mem_size: "the size of memory used during alignment"
disk_size : "disk size used in bwa alignment step"
output_base_name: "basename to be used for the output of the task"
docker_image: "the docker image using BWA to be used (default: us.gcr.io/broad-gotc-prod/samtools-bwa-mem-2:1.0.0-2.2.1_x64-linux-1685469504)"
docker_path: "The docker image path containing the runtime environment for this task"
}

String bam_aligned_output_name = output_base_name + ".bam"
Expand Down Expand Up @@ -418,7 +445,7 @@ task BWAPairedEndAlignment {
>>>

runtime {
docker: docker_image
docker: docker_path
disks: "local-disk ${disk_size} SSD"
cpu: nthreads
cpuPlatform: cpu_platform
Expand All @@ -442,6 +469,7 @@ task CreateFragmentFile {
Int mem_size = 16
Int nthreads = 1
String cpuPlatform = "Intel Cascade Lake"
String docker_path
}

String bam_base_name = basename(bam, ".bam")
Expand All @@ -452,6 +480,7 @@ task CreateFragmentFile {
chrom_sizes: "Text file containing chrom_sizes for genome build (i.e. hg38)."
disk_size: "Disk size used in create fragment file step."
mem_size: "The size of memory used in create fragment file."
docker_path: "The docker image path containing the runtime environment for this task"
}

command <<<
Expand Down Expand Up @@ -492,7 +521,7 @@ task CreateFragmentFile {
>>>

runtime {
docker: "us.gcr.io/broad-gotc-prod/snapatac2:1.0.4-2.3.1"
docker: docker_path
disks: "local-disk ${disk_size} SSD"
memory: "${mem_size} GiB"
cpu: nthreads
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
"Multiome.annotations_gtf":"gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf",
"Multiome.input_id":"10k_PBMC_downsampled",
"Multiome.cloud_provider":"gcp",
"Multiome.gex_r1_fastq":[
"gs://broad-gotc-test-storage/Multiome/input/plumbing/fastq_R1_gex.fastq.gz"
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_gex_S1_L002_I1_001.fastq.gz"
],
"Multiome.input_id":"10k_PBMC",
"Multiome.cloud_provider":"gcp",
"Multiome.gex_r1_fastq":[
"gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_gex_S1_L001_R1_001.fastq.gz",
"gs://broad-gotc-test-storage/Multiome/input/scientific/10k_PBMC_Multiome/10k_PBMC_Multiome_nextgem_Chromium_Controller_gex_S1_L002_R1_001.fastq.gz"
Expand Down
9 changes: 3 additions & 6 deletions tasks/skylab/FastqProcessing.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -243,10 +243,7 @@ task FastqProcessATAC {
String output_base_name
File whitelist
String barcode_index1 = basename(barcodes_fastq[0])

# [?] copied from corresponding optimus wdl for fastqprocessing
# using the latest build of warp-tools in GCR
String docker = "us.gcr.io/broad-gotc-prod/warp-tools:2.0.1"
String docker_path

# Runtime attributes [?]
Int mem_size = 5
Expand All @@ -272,7 +269,7 @@ task FastqProcessATAC {
read_structure: "A string that specifies the barcode (C) positions in the Read 2 fastq"
barcode_orientation: "A string that specifies the orientation of barcode needed for scATAC data. The default is FIRST_BP. Other options include LAST_BP, FIRST_BP_RC or LAST_BP_RC."
whitelist: "10x genomics cell barcode whitelist for scATAC"
docker: "(optional) the docker image containing the runtime environment for this task"
docker_path: "The docker image path containing the runtime environment for this task"
mem_size: "(optional) the amount of memory (MiB) to provision for this task"
cpu: "(optional) the number of cpus to provision for this task"
disk_size: "(optional) the amount of disk space (GiB) to provision for this task"
Expand Down Expand Up @@ -361,7 +358,7 @@ task FastqProcessATAC {
>>>

runtime {
docker: docker
docker: docker_path
cpu: cpu
memory: "${mem_size} MiB"
disks: "local-disk ${disk_size} HDD"
Expand Down
15 changes: 8 additions & 7 deletions tasks/skylab/H5adUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -184,22 +184,23 @@ task SingleNucleusOptimusH5adOutput {
}

task JoinMultiomeBarcodes {
input {
input {
File atac_h5ad
File atac_fragment
File gex_h5ad
File gex_whitelist
File atac_whitelist
String docker_path

Int nthreads = 1
String cpuPlatform = "Intel Cascade Lake"
}
String gex_base_name = basename(gex_h5ad, ".h5ad")
String atac_base_name = basename(atac_h5ad, ".h5ad")
String atac_fragment_base = basename(atac_fragment, ".tsv")
String gex_base_name = basename(gex_h5ad, ".h5ad")
String atac_base_name = basename(atac_h5ad, ".h5ad")
String atac_fragment_base = basename(atac_fragment, ".tsv")

Int machine_mem_mb = ceil((size(atac_h5ad, "MiB") + size(gex_h5ad, "MiB") + size(atac_fragment, "MiB")) * 3) + 10000
Int disk = ceil((size(atac_h5ad, "GiB") + size(gex_h5ad, "GiB") + size(atac_fragment, "GiB")) * 5) + 10
Int machine_mem_mb = ceil((size(atac_h5ad, "MiB") + size(gex_h5ad, "MiB") + size(atac_fragment, "MiB")) * 3) + 10000
Int disk = ceil((size(atac_h5ad, "GiB") + size(gex_h5ad, "GiB") + size(atac_fragment, "GiB")) * 5) + 10

parameter_meta {
atac_h5ad: "The resulting h5ad from the ATAC workflow."
Expand Down Expand Up @@ -278,7 +279,7 @@ task JoinMultiomeBarcodes {
>>>

runtime {
docker: "us.gcr.io/broad-gotc-prod/snapatac2:1.0.4-2.3.1-1700590229"
docker: docker_path
disks: "local-disk ~{disk} HDD"
memory: "${machine_mem_mb} MiB"
cpu: nthreads
Expand Down
Loading
Loading