Lk pd 2736 doublets (#1362)

Added doublet scores to gex h5ads, created new nhash for ATAC
broadinstitute · Oct 2, 2024 · 30e4994 · 30e4994
1 parent 0c6697b
commit 30e4994
Show file tree

Hide file tree

Showing 30 changed files with 148 additions and 120 deletions.
diff --git a/pipeline_versions.txt b/pipeline_versions.txt
@@ -28,15 +28,15 @@ BroadInternalUltimaGenomics	 1.1.0	2024-09-06
 BroadInternalImputation	 1.1.13	2024-09-06 
 BroadInternalArrays	 1.1.12	2024-09-06 
 MultiSampleSmartSeq2	 2.2.22	2024-09-11 
-MultiSampleSmartSeq2SingleNucleus	 2.0.0	2024-09-11 
-PairedTag	 1.6.1	2024-09-11 
+MultiSampleSmartSeq2SingleNucleus	 2.0.1	2024-09-24 
+PairedTag	 1.7.0	2024-09-24 
 SmartSeq2SingleSample	 5.1.21	2024-09-11 
 scATAC	 1.3.2	2023-08-03 
-Optimus	 7.6.1	2024-09-11 
-Multiome	 5.6.1	2024-09-11 
+Optimus	 7.7.0	2024-09-24 
+Multiome	 5.7.0	2024-09-24 
 snm3C	 4.0.4	2024-08-06 
 BuildIndices	 3.0.0	2023-12-06 
 atac	 2.3.1	2024-09-11 
-SlideSeq	 3.4.1	2024-09-11 
+SlideSeq	 3.4.2	2024-09-24 
 BuildCembaReferences	 1.0.0	2020-11-15 
 CEMBA	 1.1.7	2024-09-06 
diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md
@@ -1,3 +1,8 @@
+# 5.7.0
+2024-09-24 (Date of Last Commit)
+* Added a Python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad
+* Updated gene_names in the final h5ad to be unique
+
 # 5.6.1
 2024-09-11 (Date of Last Commit)
 * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the Multiome pipeline

diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl
@@ -9,14 +9,15 @@ import "../../../tasks/broad/Utilities.wdl" as utils
 
 workflow Multiome {
 
-    String pipeline_version = "5.6.1"
+    String pipeline_version = "5.7.0"
 
 
     input {
         String cloud_provider
         String input_id
         # Additional library aliquot ID
-        String? nhash_id
+        String? gex_nhash_id
+        String? atac_nhash_id
 
         # Optimus Inputs
         String counting_mode = "sn_rna"
@@ -89,7 +90,7 @@ workflow Multiome {
             i1_fastq = gex_i1_fastq,
             input_id = input_id + "_gex",
             output_bam_basename = input_id + "_gex",
-            gex_nhash_id = nhash_id,
+            gex_nhash_id = gex_nhash_id,
             tar_star_reference = tar_star_reference,
             annotations_gtf = annotations_gtf,
             mt_genes = mt_genes,
@@ -118,7 +119,7 @@ workflow Multiome {
             adapter_seq_read1 = adapter_seq_read1,
             vm_size = vm_size,
             annotations_gtf = annotations_gtf,
-            atac_nhash_id = nhash_id,
+            atac_nhash_id = atac_nhash_id,
             adapter_seq_read3 = adapter_seq_read3
     }
     call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes {

diff --git a/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/multiome/test_inputs/Plumbing/10k_pbmc_downsampled.json
@@ -25,5 +25,6 @@
   "Multiome.Atac.num_threads_bwa":"16",
   "Multiome.Atac.mem_size_bwa":"64", 
   "Multiome.soloMultiMappers":"Uniform",
-  "Multiome.nhash_id":"example_1234"
+  "Multiome.gex_nhash_id":"example_1234",
+  "Multiome.atac_nhash_id":"example_1234"
 }
diff --git a/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json b/pipelines/skylab/multiome/test_inputs/Scientific/10k_pbmc.json
@@ -32,5 +32,6 @@
   "Multiome.Atac.cpu_platform_bwa":"Intel Cascade Lake",
   "Multiome.Atac.num_threads_bwa":"24",
   "Multiome.Atac.mem_size_bwa":"175",
-  "Multiome.nhash_id":"example_1234"
+  "Multiome.gex_nhash_id":"example_1234",
+  "Multiome.atac_nhash_id":"example_1234"
 }
diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md
@@ -1,3 +1,9 @@
+# 7.7.0
+2024-09-24 (Date of Last Commit)
+
+* Added a Python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad
+* Updated gene_names in the final h5ad to be unique
+
 # 7.6.1
 2024-09-11 (Date of Last Commit)
 * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the Optimus pipeline

diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl
@@ -26,7 +26,7 @@ workflow Optimus {
     Array[File]? i1_fastq
     String input_id
     # String for additional library aliquot ID
-    String? gex_nhash_id = ""
+    String? gex_nhash_id
     String output_bam_basename = input_id
     String? input_name
     String? input_id_metadata_field
@@ -71,7 +71,7 @@ workflow Optimus {
   # version of this pipeline
 
 
-  String pipeline_version = "7.6.1"
+  String pipeline_version = "7.7.0"
 
 
   # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays
@@ -91,8 +91,9 @@ workflow Optimus {
   String pytools_docker = "pytools:1.0.0-1661263730"
   String empty_drops_docker = "empty-drops:1.0.1-4.2"
   String star_docker = "star:1.0.1-2.7.11a-1692706072"
-  String warp_tools_docker_2_2_0 = "warp-tools:2.3.0"
-  String star_merge_docker = "star-merge-npz:1.2"
+  String warp_tools_docker_2_2_0 = "warp-tools:2.4.0"
+  String star_merge_docker = "star-merge-npz:1.3.0"
+
 
   #TODO how do we handle these?
   String alpine_docker = "alpine-bash@sha256:965a718a07c700a5204c77e391961edee37477634ce2f9cf652a8e4c2db858ff"
@@ -241,11 +242,13 @@ workflow Optimus {
       input:
         input_id = input_id,
         gex_nhash_id = gex_nhash_id,
+        expected_cells = expected_cells,
         input_name = input_name,
         input_id_metadata_field = input_id_metadata_field,
         input_name_metadata_field = input_name_metadata_field,
         annotation_file = annotations_gtf,
         library_metrics = MergeStarOutputs.library_metrics,
+        cellbarcodes = MergeStarOutputs.outputbarcodes,
         cell_metrics = CellMetrics.cell_metrics,
         gene_metrics = GeneMetrics.gene_metrics,
         sparse_count_matrix = MergeStarOutputs.sparse_counts,
@@ -276,11 +279,14 @@ workflow Optimus {
       input:
         input_id = input_id,
         gex_nhash_id = gex_nhash_id,
+        expected_cells = expected_cells,
         input_name = input_name,
+        counting_mode = counting_mode,
         input_id_metadata_field = input_id_metadata_field,
         input_name_metadata_field = input_name_metadata_field,
         annotation_file = annotations_gtf,
         library_metrics = MergeStarOutputs.library_metrics,
+        cellbarcodes = MergeStarOutputs.outputbarcodes,
         cell_metrics = CellMetrics.cell_metrics,
         gene_metrics = GeneMetrics.gene_metrics,
         sparse_count_matrix = MergeStarOutputs.sparse_counts,

diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md
@@ -1,3 +1,8 @@
+# 1.7.0
+2024-09-24 (Date of Last Commit)
+* Added a Python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad
+* Updated gene_names in the final h5ad to be unique
+
 # 1.6.1
 2024-09-11 (Date of Last Commit)
 * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the PairedTag pipeline

diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl
@@ -8,13 +8,14 @@ import "../../../tasks/broad/Utilities.wdl" as utils
 
 workflow PairedTag {
 
-    String pipeline_version = "1.6.1"
+    String pipeline_version = "1.7.0"
 
 
     input {
         String input_id
         # Additional library aliquot id
-        String? nhash_id
+        String? gex_nhash_id
+        String? atac_nhash_id
 
         # Optimus Inputs
         String counting_mode = "sn_rna"
@@ -97,7 +98,7 @@ workflow PairedTag {
             count_exons = count_exons,
             cloud_provider = cloud_provider,
             soloMultiMappers = soloMultiMappers,
-            gex_nhash_id = nhash_id
+            gex_nhash_id = gex_nhash_id
     }
 
     # Call the ATAC workflow
@@ -131,7 +132,7 @@ workflow PairedTag {
             preindex = preindex,
             cloud_provider = cloud_provider,
             vm_size = vm_size,
-            atac_nhash_id = nhash_id
+            atac_nhash_id = atac_nhash_id
     }
 
     if (preindex) {

diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/10k_pbmc_downsampled.json
@@ -25,5 +25,6 @@
   "PairedTag.Atac_preindex.mem_size_bwa":"64",
   "PairedTag.soloMultiMappers":"Uniform",
   "PairedTag.cloud_provider": "gcp",
-  "PairedTag.nhash_id":"example_1234"
+  "PairedTag.gex_nhash_id":"example_1234",
+  "PairedTag.atac_nhash_id":"example_1234"
 }
diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BC011_BC015_downsampled.json
@@ -25,5 +25,6 @@
   "PairedTag.Atac_preindex.mem_size_bwa":"64", 
   "PairedTag.soloMultiMappers":"Uniform",
   "PairedTag.cloud_provider": "gcp",
-  "PairedTag.nhash_id":"example_1234"
+  "PairedTag.gex_nhash_id":"example_1234",
+  "PairedTag.atac_nhash_id":"example_1234"
 }
diff --git a/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json b/pipelines/skylab/paired_tag/test_inputs/Plumbing/BI015_downsampled.json
@@ -25,5 +25,6 @@
   "PairedTag.Atac_preindex.mem_size_bwa":"64", 
   "PairedTag.soloMultiMappers":"Uniform",
   "PairedTag.cloud_provider": "gcp",
-  "PairedTag.nhash_id":"example_1234"
+  "PairedTag.gex_nhash_id":"example_1234",
+  "PairedTag.atac_nhash_id":"example_1234"
 }
diff --git a/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json b/pipelines/skylab/paired_tag/test_inputs/Scientific/10k_pbmc.json
@@ -33,6 +33,7 @@
   "PairedTag.Atac_preindex.num_threads_bwa":"24",
   "PairedTag.Atac_preindex.mem_size_bwa":"175", 
   "PairedTag.soloMultiMappers":"Uniform",
-  "PairedTag.nhash_id":"example_1234",
+  "PairedTag.gex_nhash_id":"example_1234",
+  "PairedTag.atac_nhash_id":"example_1234",
   "PairedTag.cloud_provider": "gcp"
 }
diff --git a/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json b/pipelines/skylab/paired_tag/test_inputs/Scientific/BC011_10kPBMC.json
@@ -30,6 +30,7 @@
   "PairedTag.Atac_preindex.num_threads_bwa":"16",
   "PairedTag.Atac_preindex.mem_size_bwa":"64", 
   "PairedTag.soloMultiMappers":"Uniform",
-  "PairedTag.nhash_id":"example_1234",
+  "PairedTag.gex_nhash_id":"example_1234",
+  "PairedTag.atac_nhash_id":"example_1234",
   "PairedTag.cloud_provider": "gcp"
 }
diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md
@@ -1,3 +1,8 @@
+# 3.4.2
+2024-09-24 (Date of Last Commit)
+
+* Added a Python implementation of DoubletFinder to calculate doublet scores in gene expression data; this does not impact the SlideSeq workflow
+
 # 3.4.1
 2024-09-11 (Date of Last Commit)
 

diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl
@@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils
 
 workflow SlideSeq {
 
-    String pipeline_version = "3.4.1"
+    String pipeline_version = "3.4.2"
 
     input {
         Array[File] r1_fastq
@@ -48,8 +48,8 @@ workflow SlideSeq {
     # docker images
     String pytools_docker = "pytools:1.0.0-1661263730"
     String picard_cloud_docker = "picard-cloud:2.26.10"
-    String warp_tools_docker_2_2_0 = "warp-tools:2.3.0"
-    String star_merge_docker = "star-merge-npz:1.2"
+    String warp_tools_docker_2_2_0 = "warp-tools:2.4.0"
+    String star_merge_docker = "star-merge-npz:1.3.0"
 
     String ubuntu_docker = "ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf"
     String gcp_ubuntu_docker_prefix = "gcr.io/gcp-runtimes/"

diff --git a/...tseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/...tseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md
@@ -1,3 +1,7 @@
+# 2.0.1
+2024-09-24 (Date of Last Commit)
+* Added a Python implementation of DoubletFinder to calculate doublet scores in gene expression data; this does not affect the snSS2 workflow
+
 # 2.0.0
 2024-09-11 (Date of Last Commit)
 

diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl
@@ -57,7 +57,7 @@ workflow MultiSampleSmartSeq2SingleNucleus {
   }
 
   # Version of this pipeline
-  String pipeline_version = "2.0.0"
+  String pipeline_version = "2.0.1"
 
   if (false) {
      String? none = "None"