Merge pull request #23 from peterk87/fix/87-and-cat-noempty-irma-consensus-seqs

Fix multiple issues (#87,#22,#46)
peterk87 · Oct 21, 2024 · 81d4823 · 81d4823
2 parents 2106785 + 48eb1a5
commit 81d4823
Show file tree

Hide file tree

Showing 8 changed files with 89 additions and 42 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,17 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[3.5.2](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.5.2)] - 2024-10-18
+
+This patch release fixes a few issues when running the pipeline.
+
+### Changes
+
+* fix: better handling of empty IRMA consensus sequences to avoid downstream analysis errors with VADR and BLASTN ([peterk87/nf-flu #22](https://github.com/peterk87/nf-flu/issues/22))
+* fix: Clair3 `versions.yml` indentation issue ([#87](https://github.com/CFIA-NCFAD/nf-flu/issues/87))
+* fix: removed capturing of `cat` and `gzip` versions in the CAT_ILLUMINA_FASTQ process ([#46](https://github.com/CFIA-NCFAD/nf-flu/issues/46)) to avoid issues in some execution environments
+* docs: update README.md
+
 ## [[3.5.1](https://github.com/CFIA-NCFAD/nf-flu/releases/tag/3.5.1)] - 2024-10-08
 
 This patch release fixes an issue ([#84](https://github.com/CFIA-NCFAD/nf-flu/issues/84)) with long sample names (over 50 characters) causing VADR to fail. `--noseqnamemax` has been added to the default arguments for VADR to avoid this issue.

diff --git a/README.md b/README.md
@@ -15,6 +15,8 @@ Since Influenza has a segmented genome consisting of 8 gene segments, the pipeli
 Users can also provide their own reference sequences to include in the top reference sequence selection process.
 After reference sequence selection, the pipeline performs read mapping to each reference sequence, variant calling and depth-masked consensus sequence generation.
 
+> **Note:** The officially supported version of the pipeline is [CFIA-NCFAD/nf-flu](https://github.com/CFIA-NCFAD/nf-flu). If you have issues using the pipeline, please [create an issue](https://github.com/CFIA-NCFAD/nf-flu/issues/new/choose) on the [CFIA-NCFAD/nf-flu](https://github.com/CFIA-NCFAD/nf-flu) repo.
+
 ## Pipeline summary
 
 1. Download latest [NCBI Orthomyxoviridae sequences](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Undef&id=11308&lvl=3&keep=1&srchmode=1&unlock) and metadata (parsed from [NCBI Viruses FTP data](https://ftp.ncbi.nlm.nih.gov/genomes/Viruses/AllNucleotide/)).
@@ -109,6 +111,14 @@ Camacho, C., Coulouris, G., Avagyan, V., Ma, N., Papadopoulos, J., Bealer, K., M
 Zheng, Z., Li, S., Su, J., Leung, A.W.-S., Lam, T.-W., Luo, R., 2022. Symphonizing pileup and full-alignment for deep learning-based long-read variant calling. Nat Comput Sci 2, 797–803. https://doi.org/10.1038/s43588-022-00387-x
 ```
 
+### [Freebayes][]
+
+[Freebayes][] is used for variant calling.
+
+```text
+Garrison, E., Marth, G., 2012. Haplotype-based variant detection from short-read sequencing. arXiv:1207.3907 [q-bio]. https://doi.org/10.48550/arXiv.1207.3907
+```
+
 ### [IRMA][] Iterative Refinement Meta-Assembler
 
 ```text
@@ -222,3 +232,4 @@ Alejandro A Schäffer, Eneida L Hatcher, Linda Yankie, Lara Shonkwiler, J Rodney
 [MultiQC]: https://multiqc.info/
 [VADR]: https://github.com/ncbi/vadr
 [table2asn]: https://www.ncbi.nlm.nih.gov/genbank/table2asn/
+[Freebayes]: https://github.com/freebayes/freebayes
diff --git a/modules/local/cat_illumina_fastq.nf b/modules/local/cat_illumina_fastq.nf
@@ -19,7 +19,6 @@ process CAT_ILLUMINA_FASTQ {
 
   output:
   tuple val(meta), path("*.merged.fastq.gz"), emit: reads
-  path "versions.yml"                       , emit: versions
 
   when:
   task.ext.when == null || task.ext.when
@@ -47,12 +46,6 @@ process CAT_ILLUMINA_FASTQ {
   if [[ ${fqgzList.size} > 0 ]]; then
     cat ${readList.join(' ')} >> ${prefix}.merged.fastq.gz
   fi
-
-  cat <<-END_VERSIONS > versions.yml
-  "${task.process}":
-    cat: \$(echo \$(cat --help 2>&1) | sed 's/ (.*//')
-    gzip: \$(echo \$(gzip --help 2>&1) | sed 's/ (.*//')
-  END_VERSIONS
   """
     }
   } else {
@@ -97,12 +90,6 @@ if [[ ${read2gz.size} > 0 ]]; then
   | gzip -ck \\
   >> ${prefix}_2.merged.fastq.gz
 fi
-
-cat <<-END_VERSIONS > versions.yml
-"${task.process}":
-  cat: \$(echo \$(cat --help 2>&1) | sed 's/ (.*//')
-  gzip: \$(echo \$(gzip --help 2>&1) | sed 's/ (.*//')
-END_VERSIONS
 """
     }
   }

diff --git a/modules/local/clair3.nf b/modules/local/clair3.nf
@@ -34,37 +34,37 @@ process CLAIR3 {
   """
   CLAIR_BIN_DIR=\$(dirname \$(which run_clair3.sh))
   if [[ "${params.clair3_user_variant_model}" != "" ]] ; then
-      MODEL_PATH=${model_path}
+    MODEL_PATH="${model_path}"
   else
-      if [[ ${using_conda} = true ]] ; then
-          MODEL_PATH="\$CLAIR_BIN_DIR/${model_suffix}"
-      else [[ ${using_conda} = false ]]
-          MODEL_PATH="/opt/models/${params.clair3_variant_model}"
-          if [[ -d \$MODEL_PATH ]] ; then
-              echo "Using built-in model: \$MODEL_PATH"
-          else
-              MODEL_PATH="/usr/local/bin/models/${params.clair3_variant_model}"
-          fi
-          if [[ ! -d \$MODEL_PATH ]] ; then
-              echo "Model not found: \$MODEL_PATH"
-              exit 1
-          fi
+    if [[ ${using_conda} = true ]] ; then
+      MODEL_PATH="\$CLAIR_BIN_DIR/${model_suffix}"
+    else [[ ${using_conda} = false ]]
+      MODEL_PATH="/opt/models/${params.clair3_variant_model}"
+      if [[ -d \$MODEL_PATH ]] ; then
+        echo "Using built-in model: \$MODEL_PATH"
+      else
+        MODEL_PATH="/usr/local/bin/models/${params.clair3_variant_model}"
       fi
+      if [[ ! -d \$MODEL_PATH ]] ; then
+        echo "Model not found: \$MODEL_PATH"
+        exit 1
+      fi
+    fi
   fi
 
   samtools faidx $ref_fasta
 
   run_clair3.sh \\
-      --bam_fn=${bam[0]} \\
-      --ref_fn=$ref_fasta \\
-      --model_path="\$MODEL_PATH"\\
-      --threads=${task.cpus} \\
-      --platform="ont" \\
-      --output=${clair3_dir} \\
-      --haploid_sensitive \\
-      --enable_long_indel \\
-      --keep_iupac_bases \\
-      --include_all_ctgs
+    --bam_fn=${bam[0]} \\
+    --ref_fn=$ref_fasta \\
+    --model_path="\$MODEL_PATH"\\
+    --threads=${task.cpus} \\
+    --platform="ont" \\
+    --output=${clair3_dir} \\
+    --haploid_sensitive \\
+    --enable_long_indel \\
+    --keep_iupac_bases \\
+    --include_all_ctgs
 
   ln -s ${clair3_dir}/merge_output.vcf.gz ${vcf}
 

diff --git a/modules/local/irma.nf b/modules/local/irma.nf
@@ -38,14 +38,47 @@ process IRMA {
 
   IRMA $irma_module $reads $meta.id
 
+  # cat_nonempty FILE...
+  #
+  # Concatenate the FASTA records found in the given files and write them to
+  # stdout, skipping every record whose sequence has zero length. Each record
+  # that is kept is written as its header line followed by its sequence joined
+  # onto a single line.
+  #
+  # Dollar signs are backslash-escaped because this function lives inside a
+  # Nextflow script string; the shell sees them unescaped.
+  cat_nonempty() {
+    awk '
+      # When a new header line is encountered
+      /^>/ {
+          # If the previous record has a non-empty sequence, print its header and sequence
+          if (seqlen > 0) {
+              print header;
+              print seq;
+          }
+          # Set the new header and reset the sequence and sequence length for the next entry
+          header = \$0;
+          seq = "";
+          seqlen = 0;
+          next;
+      }
+
+      # For sequence lines, concatenate the sequence and update the length
+      {
+          seq = seq \$0;
+          seqlen += length(\$0);
+      }
+
+      # After all input files have been read, print the last record if it is not empty
+      END {
+          if (seqlen > 0) {
+              print header;
+              print seq;
+          }
+      }
+    ' "\$@"  # quoted so each file path reaches awk as exactly one argument
+  }
+
   if ls ${meta.id}/amended_consensus/*.fa > /dev/null 2>&1; then
-    cat ${meta.id}/amended_consensus/*.fa > ${meta.id}.irma.consensus.fasta
+    # use awk to concatenate only fasta sequences that actually have a sequence; ignore empty sequences
+    cat_nonempty ${meta.id}/amended_consensus/*.fa > ${meta.id}.irma.consensus.fasta
   fi
 
   if ls ${meta.id}/tables/*-allAlleles.txt > /dev/null 2>&1; then
     irma-alleles2fasta -n "${meta.id}" -i "${meta.id}/tables" -o majority-consensus
     if ls majority-consensus/*.fasta > /dev/null 2>&1; then
-      cat majority-consensus/*.fasta > ${meta.id}.irma.majority_consensus.fasta
+      cat_nonempty majority-consensus/*.fasta > ${meta.id}.irma.majority_consensus.fasta
     fi
   fi
 

diff --git a/nextflow.config b/nextflow.config
@@ -155,7 +155,7 @@ manifest {
   description     = 'Influenza A virus genome assembly pipeline'
   homePage        = 'https://github.com/CFIA-NCFAD/nf-flu'
   author          = 'Peter Kruczkiewicz, Hai Nguyen'
-  version         = '3.5.1'
+  version         = '3.5.2'
   nextflowVersion = '!>=22.10.1'
   mainScript      = 'main.nf'
   doi             = '10.5281/zenodo.13892044'

diff --git a/tests/run-nanopore-test.sh b/tests/run-nanopore-test.sh
@@ -62,6 +62,12 @@ while getopts "w:m:c:h" opt; do
     esac
 done
 
+# Discard the options already parsed by getopts; whatever remains in the
+# positional parameters is forwarded to the nextflow run command below.
+shift $((OPTIND-1))
+
+# Drop a leading literal "--" separator if one is still present. The
+# ${1:-} default keeps this check from failing with "unbound variable"
+# when no extra arguments were given and nounset is enabled.
+if [[ "${1:-}" == "--" ]]; then
+    shift
+fi
+
 info "Starting nf-flu Nanopore test execution script with ${CPU} CPU cores and ${MEMORY} memory..."
 
 FASTA_ZST_URL="https://api.figshare.com/v2/file/download/41415330"
@@ -135,4 +141,4 @@ nextflow run "$WORKFLOW_PATH" \
     --input samplesheet.csv \
     --ncbi_influenza_fasta $FASTA_ZST_FILE \
     --ncbi_influenza_metadata $CSV_ZST_FILE \
-    --max_cpus $CPU --max_memory "$MEMORY"
+    --max_cpus "$CPU" --max_memory "$MEMORY" "$@"  # "$@" forwards each extra argument as one word
diff --git a/workflows/illumina.nf b/workflows/illumina.nf
@@ -150,7 +150,6 @@ workflow ILLUMINA {
 
   // Use ch_input_sorted for CAT_ILLUMINA_FASTQ to ensure IRMA triggers
   CAT_ILLUMINA_FASTQ(ch_input_sorted)
-  ch_versions = ch_versions.mix(CAT_ILLUMINA_FASTQ.out.versions.first().ifEmpty(null))
 
   // IRMA processing
   IRMA(CAT_ILLUMINA_FASTQ.out.reads, irma_module)