diff --git a/.gitignore b/.gitignore index bd661637..d7bdb72c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ results/ testing/ testing* *.pyc +.nf-test/ +test/ +.nf-test.log diff --git a/README.md b/README.md index c61fc523..ce8e7fbe 100644 --- a/README.md +++ b/README.md @@ -90,3 +90,5 @@ You can cite the `nf-core` publication as follows: > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. > > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). + +nextflow run main.nf -profile test_hackathon,docker --outdir=test \ No newline at end of file diff --git a/bin/calculate_TPM_HTSeq.R b/bin/calculate_TPM_HTSeq.R new file mode 100755 index 00000000..831f8be7 --- /dev/null +++ b/bin/calculate_TPM_HTSeq.R @@ -0,0 +1,104 @@ +#!/usr/bin/env Rscript + +#------------------------- +# +# Description: +# Script to count TPM values for HTSeq quantification results +# +# Created by B. Mika-Gospodorz +# Date: 29th May 2020 +# +# Input args: [1] = quantification_results_uniquely_mapped.tsv - HTSeq quantification results +# [2] = host gene attribute used for quantification, e.g. 'gene_id' +# [3] = pathogen gff file with gene features (from 3rd col) replaced with 'quant' +# [4] = host gff file with gene features (from 3rd col) replaced with 'quant' +# Output file: quantification_results_uniquely_mapped_NumReads_TPM.tsv +# +#------------------------- + +library(plyr) +library(rtracklayer) + + +args = commandArgs(trailingOnly=TRUE) +# read quantification table +gene_attribute <- args[2] +table_htseq <- read.table(args[1], sep="\t", stringsAsFactors=F, header = T,row.names = gene_attribute) +pathogen_gff <- import(args[3]) +host_gff <- import(args[4]) + + +# function to extract gene length from gff file +extract_transcript_length <- function(x,host_gff,gene_attribute){ + #find annotations that contain host transcripts + h_tr <- host_gff[mcols(host_gff)[gene_attribute][[1]]==x] + # Filter quant features + quants <- h_tr[h_tr$type == "quant",] + # merge overlapping features, so that the same bases are covered by reduced collection of features + t_length <- sum(width(reduce(quants))) + return(t_length) +} + +######################### extract gene length ####################################### + +### pathogen +names_gff_pathogen <- mcols(pathogen_gff)[gene_attribute][[1]] # extract gene names from gff file +common_pathogen = intersect(rownames(table_htseq), names_gff_pathogen) # find positions of pathogen genes in quantification table +pathogen_table_htseq <- table_htseq[common_pathogen,] # extract pathogen quantification results +pathogen_gff_match <- match(common_pathogen,names_gff_pathogen) # find positions of corresponding genes in gff file +pathogen_table_htseq <- cbind(length = pathogen_gff@ranges@width[pathogen_gff_match],pathogen_table_htseq) # extract gene length from gff file and combine it with quantification table + + +### host +names_gff_host <- mcols(host_gff)[gene_attribute][[1]] # extract gene names from gff file +lack_of_attribute <- which(is.na(names_gff_host)) #find positions without gene_attribute (eg.genes that don't have transcript_id attribute) +if (length(lack_of_attribute)> 0) { + host_gff <- host_gff[-lack_of_attribute] #remove positions without gene_attribute + names_gff_host <- names_gff_host[-lack_of_attribute] # extract gene names that contain gene_attribute +} +common_host = intersect(rownames(table_htseq),names_gff_host) # find positions of host genes in quantification table +host_table_htseq <- table_htseq[common_host,] #extract quantification results of host genes +transcript_length <- sapply(common_host, extract_transcript_length, host_gff=host_gff,gene_attribute = gene_attribute,simplify = TRUE) #extract gene length +host_table_htseq <- cbind(length = transcript_length,host_table_htseq) # add gene length into quantification table + + +# combine host and pathogen tables +htseq_table_with_gene_length = rbind(pathogen_table_htseq,host_table_htseq) + + +######################### calculate TPM values ####################################### + +# function to calculate TPMs +tpm <- function(counts, lengths) { + rate <- counts / lengths + return(rate / sum(rate) * 1e6) +} + +# function to add _TPM suffix +rename_add_TPM <- function(x) { + paste(x,"_TPM",sep='') +} + +# function to add _NumReads suffix +rename_add_NumReads <- function(x) { + paste(x,"_NumReads",sep='') +} + + +# calculate TPMs for each gene in each sample +TPMs <- apply(htseq_table_with_gene_length[2:dim(htseq_table_with_gene_length)[2]], 2, function(x) tpm(x, htseq_table_with_gene_length$length)) +colnames(TPMs) <- colnames(htseq_table_with_gene_length[,-1]) +# add TPM suffix to column names with TPM values +colnames(TPMs) <-sapply(colnames(TPMs),function(x) rename_add_TPM(x)) +# add NumReads suffix to column names with NumReads values +colnames_NR <- sapply(colnames(htseq_table_with_gene_length[,-1]),function(x) rename_add_NumReads(x)) +# add 'length' name for column which stores gene length +colnames(htseq_table_with_gene_length) <- (c('length',colnames_NR)) + +# combine results +quant_results <- cbind(name = rownames(TPMs), htseq_table_with_gene_length,TPMs) +names(quant_results)[1] <- gene_attribute + +# save results +write.table(quant_results,file = "quantification_results_uniquely_mapped_NumReads_TPM.tsv",sep = "\t", row.names = F, quote = F) + diff --git a/bin/collect_quantification_data.py b/bin/collect_quantification_data.py new file mode 100755 index 00000000..2de26703 --- /dev/null +++ b/bin/collect_quantification_data.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Jul 4 15:00:57 2019 + +@author: B. Mika-Gospodorz + +Input file: list of quantification tables +Output files: quantification_stats_*.tsv, quantification_results_*.tsv (HTSeq option) or +Description: Used to merge quantification results from all samples +""" + +import argparse + +import pandas as pd + + +# function to merge HTSeq quantification results +def collect_quantification_data_HTseq(input_files, profile, gene_attribute): + # initiate merged quantification data frame + quant_merged_table = pd.DataFrame() + for input_file in input_files: # iterate over sample results + # read quantification data + quant_table = pd.read_csv(input_file, sep="\t", header=0) + if input_file == input_files[0]: # initiate first column of quant_merged_table data frame + quant_merged_table = quant_table + else: # extend quant_merged_table data frame with results of new sample + quant_merged_table = pd.concat([quant_merged_table, quant_table], axis=1) + quant_merged_table.index.names = [gene_attribute] + # sort quant_merged_table by column labels + quant_merged_table = quant_merged_table.sort_index(axis=1) + + # extract last 5 rows of HTSeq quantification table that contain statistics and save results + alignment_stats = quant_merged_table["__no_feature":"__alignment_not_unique"] + alignment_stats.to_csv("quantification_stats_" + profile + ".tsv", sep="\t") + # remove statistics from quantification results and save quant_merged_table + quant_merged_table = quant_merged_table.drop( + ["__no_feature", "__ambiguous", "__too_low_aQual", "__not_aligned", "__alignment_not_unique"] + ) + quant_merged_table.to_csv("quantification_results_" + profile + ".tsv", sep="\t") + + +# function to merge Salmon quantification results +def collect_quantification_data_salmon(input_files, gene_attribute, organism): + for input_file in input_files: # iterate over sample results + if organism == "host_gene_level": # read tximport results + quant_table = pd.read_csv(input_file, sep="\t", header=0, index_col=0) + else: + if organism == "both": # read quantification table containing both host and pathogen transcripts + quant_table = pd.read_csv( + input_file + "/quant.sf", + sep="\t", + header=0, + index_col=0, + dtype={"Name": str, "Length": int, "EffectiveLength": float, "TPM": float, "NumReads": float}, + ) + elif organism == "pathogen": # read quantification table containing pathogen transcripts + quant_table = pd.read_csv( + input_file + "/pathogen_quant.sf", + sep="\t", + header=0, + index_col=0, + dtype={"Name": str, "Length": int, "EffectiveLength": float, "TPM": float, "NumReads": float}, + ) + elif organism == "host": # read quantification table containing host transcripts + quant_table = pd.read_csv( + input_file + "/host_quant.sf", + sep="\t", + header=0, + index_col=0, + dtype={"Name": str, "Length": int, "EffectiveLength": float, "TPM": float, "NumReads": float}, + ) + # create new column names that keep sample name + name_file = input_file.split("/") + new_col = [name_file[0] + "_" + column for column in quant_table.columns] + quant_table.columns = new_col + if input_file == input_files[0]: # initiate first column + quant_merged_table = quant_table + else: # extend quant_merged_table data frame with results of new sample + quant_merged_table = pd.concat([quant_merged_table, quant_table], axis=1) + quant_merged_table.index.names = [gene_attribute] + # sort quant_merged_table by column labels + quant_merged_table = quant_merged_table.sort_index(axis=1) + # save results + if organism == "both": + quant_merged_table.to_csv("combined_quant.tsv", sep="\t") + elif organism == "pathogen": + quant_merged_table.to_csv("pathogen_combined_quant.tsv", sep="\t") + elif organism == "host": + quant_merged_table.to_csv("host_combined_quant.tsv", sep="\t") + elif organism == "host_gene_level": + quant_merged_table.to_csv("host_combined_gene_level.tsv", sep="\t") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""Combine counts from all samples""") + parser.add_argument( + "-i", "--input_files", metavar="", nargs="+", help="Path to quantification results " + ) + parser.add_argument("-q", "--quantifier", metavar="", help="name of quantifier") + parser.add_argument("-a", "--gene_attribute", metavar="", help="gene attribute") + parser.add_argument( + "-org", + "--organism", + metavar="", + help="host, pathogen, both, host_gene_level - option avaiable for Salmon", + ) + + args = parser.parse_args() + + # collect either Salmon or HTSeq quantification results + if args.quantifier == "htseq": + collect_quantification_data_HTseq(args.input_files, args.quantifier, args.gene_attribute) + if args.quantifier == "salmon": + collect_quantification_data_salmon(args.input_files, args.gene_attribute, args.organism) diff --git a/bin/collect_total_raw_read_pairs.py b/bin/collect_total_raw_read_pairs.py new file mode 100755 index 00000000..5c79a79d --- /dev/null +++ b/bin/collect_total_raw_read_pairs.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Jul 23 14:45:52 2020 + +@author: B. Mika-Gospodorz + +Input file: tsv file containing number of reads in fastq files created with count_total_reads.sh +Output file: total_raw_read_pairs_fastq.tsv file that contains number of raw read pairs in each sample +Description: Used to collect number of total raw read paires in each sample +""" + +import argparse + +import pandas as pd + +parser = argparse.ArgumentParser(description="""collect total raw read paires""") +parser.add_argument( + "-i", "--input_files", metavar="", default="*.tsv", help="Path to the total_raw_reads_fastq.tsv" +) +args = parser.parse_args() + + +# read total_raw_reads_fastq.tsv file +df_stats = pd.read_csv(args.input_files, sep="\t", index_col=0, header=None) + +# extract fastq file names +index_names = df_stats.index + +# remove '_1' and '_2' suffixes from fastq file names +sample_names = [name.rsplit("_", 1)[0] for name in index_names] + +# define new sample names and remove duplicated results (each sample contains results from both *_1 and *_2 fastq files +df_stats.index = sample_names +df_stats = df_stats.reset_index().drop_duplicates(subset="index", keep="last").set_index("index") + +# save results +df_stats.to_csv("total_raw_read_pairs_fastq.tsv", sep="\t", header=False) diff --git a/bin/combine_quant_annotations.py b/bin/combine_quant_annotations.py new file mode 100755 index 00000000..d7954f30 --- /dev/null +++ b/bin/combine_quant_annotations.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Apr 9 12:07:56 2020 + +@author: B.Mika-Gospdoorz + +Input files: .tsv quantification table with combined results from all samples + .tsv file with annotations extracted from gff using extract_annotations_from_gff.py +Output file: *combined_quant_gene_level_annotations.tsv with gene annotations and quantification results +Description: Used to combine annotations with quantification results +""" + +import argparse + +import pandas as pd + + +# function to combine annotations with quantification results +def combine_annotations_quant(quantification_table, annotations_table, gene_attribute, organism): + # read quantification results + col_names = pd.read_csv(quantification_table, sep="\t", nrows=0).columns + types_dict = {gene_attribute: str} + types_dict.update({col: float for col in col_names if col not in types_dict}) + quantification = pd.read_csv(quantification_table, sep="\t", index_col=0, dtype=types_dict) + # read annotations + annotations = pd.read_csv(annotations_table, sep="\t", index_col=0, dtype="str") + # combine annotations and quantification results + quant_merged_table = pd.concat([annotations, quantification], axis=1, join="inner").sort_index() + quant_merged_table.index.names = [gene_attribute] + # save results + if organism == "pathogen": + quant_merged_table.to_csv("pathogen_combined_quant_annotations.tsv", sep="\t") + elif organism == "host": + quant_merged_table.to_csv("host_combined_quant_annotations.tsv", sep="\t") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""Combine annotations with quantification results""") + parser.add_argument( + "-q", "--quantification_table", metavar="", help="Path to quantification results " + ) + parser.add_argument( + "-annotations", "--annotations", metavar="", help="Path to annotations extracted from gff file" + ) + parser.add_argument("-a", "--gene_attribute", metavar="", help="gene attribute") + parser.add_argument("-org", "--organism", metavar="", help="host, pathogen or both") + + args = parser.parse_args() + + # combine annotations with quantification results + combine_annotations_quant(args.quantification_table, args.annotations, args.gene_attribute, args.organism) diff --git a/bin/combine_tables.py b/bin/combine_tables.py new file mode 100755 index 00000000..2bc248f9 --- /dev/null +++ b/bin/combine_tables.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Dec 6 09:26:36 2019 + +@author: B.Mika-Gospodorz + + +Input files: list of .txt files with mapping statistics for different samples generated with count_multi_mapped_reads.sh, count_uniquely_mapped_read_pairs, count_uniquely_mapped_read_pairs.sh or count_multi_mapped_read_pairs.sh +Output file: .tsv file with combined statistics from all samples +Description: Used to collect mapping statistics from all samples +""" + + +import argparse + +import pandas as pd + + +# function to combine quantification results +def combine_stat_tables(input_files, suffix): + combined_tables = pd.DataFrame() # initialize data frame + for input_file in input_files: # iterate over input files + # read file + table_read = pd.read_csv(input_file, sep=" ", header=None, index_col=1) + # extract sample name + sample_name = str(table_read[0][0]) + table_read = table_read.drop(0, 1) + table_read.columns = [sample_name] + # add new table to combined_tables data frame + combined_tables = pd.concat([combined_tables, table_read], axis=1) + # transpose table to have 'host' and 'pathogen' labels as column names + combined_tables = combined_tables.transpose() + combined_tables.columns = ["host_" + suffix, "pathogen_" + suffix] + return combined_tables + + +parser = argparse.ArgumentParser(description="""Combine mapping statistis from all samples""") +parser.add_argument( + "-i", + "--input_files", + metavar="", + nargs="+", + default="*.txt", + help="Path to mapping statistic results ", +) +parser.add_argument("-o", "--output_dir", metavar="", help="output dir", default=".") +parser.add_argument("-s", "--suffix", metavar="", help="uniquely_mapped/multi_mapped", default=".") +args = parser.parse_args() + +# combine results +combineTables = combine_stat_tables(args.input_files, args.suffix) + +# save results +combineTables.to_csv(args.output_dir, sep="\t") diff --git a/bin/count_cross_mapped_reads.sh b/bin/count_cross_mapped_reads.sh new file mode 100755 index 00000000..0ee1fe18 --- /dev/null +++ b/bin/count_cross_mapped_reads.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to count cross mapped reads in *_cross_mapped_reads.txt file created by remove_crossmapped_reads_BAM.sh +# +# Created by B. Mika-Gospodorz +# Date: 27th May 2020 +# +# Input files: list of *_cross_mapped_reads.txt files +# Output: cross_mapped_reads_sum.txt +# +#------------------------- + + + +for i in "$@" # iterate over .txt files +do +lines=$(wc -l $i | awk '{print $1}') # count number of lines +echo -n -e "$i\t$lines\n" >> cross_mapped_reads_sum.txt # appends to cross_mapped_reads_sum.txt +done + + diff --git a/bin/count_multi_mapped_read_pairs.sh b/bin/count_multi_mapped_read_pairs.sh new file mode 100755 index 00000000..f7a31aa7 --- /dev/null +++ b/bin/count_multi_mapped_read_pairs.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to count multi mapped reads (paired-end reads) from .bam file +# +# Created by B. Mika-Gospodorz +# Date: 17th July 2020 +# +# Input files: $1 bam file +# $2 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh +# $3 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh +# $4 sample name +# $5 output file name *_multi_mapped.txt +# Output: *_multi_mapped.txt file with list of multi-mapped reads +# +#------------------------- + + +bam_file=$1 +host_reference_names=$2 +pathogen_reference_names=$3 +sample_name=$4 +out_name=$5 + + +# count reads that mapped onto multiple positions of host genome (-f 66 to extract read mapped in proper pair and first in pair) +host=$(samtools view -f 66 -h $bam_file | grep -f $host_reference_names | fgrep -wv NH:i:1 | fgrep -w HI:i:1 | echo "$sample_name host `wc -l`") +# count reads that mapped onto multiple positions of pathogen genome +pathogen=$(samtools view -f 66 -h $bam_file | grep -f $pathogen_reference_names | fgrep -wv NH:i:1 | fgrep -w HI:i:1 | echo "$sample_name pathogen `wc -l`") +# combine information about host and pathogen multi-mapped reads +counts="${host}\n${pathogen}" +# save into file +echo -e $counts > $out_name + + diff --git a/bin/count_multi_mapped_reads.sh b/bin/count_multi_mapped_reads.sh new file mode 100755 index 00000000..7c444576 --- /dev/null +++ b/bin/count_multi_mapped_reads.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to count multi mapped reads in .bam file +# +# Created by B. Mika-Gospodorz +# Date: 27th May 2020 +# +# Input files: $1 bam file +# $2 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh +# $3 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh +# $4 sample name +# $5 output file name +# Output: *_multi_mapped.txt file with list of multi-mapped reads +# +#------------------------- + + +bam_file=$1 +host_reference_names=$2 +pathogen_reference_names=$3 +sample_name=$4 +out_name=$5 + +# count reads that mapped onto multiple positions of host genome (-F 4 to extract mapped reads) +host=$(samtools view -F 4 -h $bam_file | grep -f $host_reference_names | fgrep -wv NH:i:1 | fgrep -w HI:i:1 | echo "$sample_name host `wc -l`") +# count reads that mapped onto multiple positions of pathogen genome +pathogen=$(samtools view -F 4 -h $bam_file | grep -f $pathogen_reference_names | fgrep -wv NH:i:1 | fgrep -w HI:i:1 | echo "$sample_name pathogen `wc -l`") +# combine information about host and pathogen multi-mapped reads +counts="${host}\n${pathogen}" +# save into file +echo -e $counts > $out_name + + + diff --git a/bin/count_total_reads.sh b/bin/count_total_reads.sh new file mode 100755 index 00000000..d0043f09 --- /dev/null +++ b/bin/count_total_reads.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to count number of reads in fastq files +# +# Created by B. Mika-Gospodorz +# Date: 2nd April 2020 +# +# Input files: list of fastq files +# Output: name of sample followed by number of reads +# +#------------------------- + + +for i in "$@" # iterate over fastq files +do +lines=$(zcat -f $i| wc -l) # count lines +count=$(($lines / 4)) # divide number of lines by 4 (fastq file uses four lines per read) +# modify sample name +name=$(echo "$i" | tr -s \: _) +name2=$(echo "$name" | tr -s - _) +name2=$(echo "${name/.fastq.gz/}") +name2=$(echo "${name2/.fq.gz/}") +name2=$(echo "${name2/.fastq/}") +name2=$(echo "${name2/.fq/}") +name2=$(echo "${name2/_trimmed/}") +echo -n -e "$name2\t$count\n" # return sample name followed by number of reads +done + diff --git a/bin/count_uniquely_mapped_read_pairs.sh b/bin/count_uniquely_mapped_read_pairs.sh new file mode 100755 index 00000000..aee1d6be --- /dev/null +++ b/bin/count_uniquely_mapped_read_pairs.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to count multi mapped reads (paired-end reads) from .bam file +# +# Created by B. Mika-Gospodorz +# Date: 17th July 2020 +# +# Input files: $1 bam file +# $2 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh +# $3 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh +# $4 sample name +# $5 output file name *_multi_mapped.txt +# Output: *_uniquely_mapped.txt file with list of uniquely mapped reads +#------------------------- + +bam_file=$1 +host_reference_names=$2 +pathogen_reference_names=$3 +sample_name=$4 +out_name=$5 + +# count reads that mapped uniquely onto host genome (-f 66 to extract read mapped in proper pair and first in pair) +host=$(samtools view -f 66 -h $bam_file | grep -f $host_reference_names | fgrep -w NH:i:1 | echo "$sample_name host `wc -l`") +# count reads that mapped uniquely onto pathogen genome +pathogen=$(samtools view -f 66 -h $bam_file | grep -f $pathogen_reference_names | fgrep -w NH:i:1 | echo "$sample_name pathogen `wc -l`") +# combine information about host and pathogen uniquely mapped reads +counts="${host}\n${pathogen}" +# save into file +echo -e $counts > $out_name diff --git a/bin/count_uniquely_mapped_reads.sh b/bin/count_uniquely_mapped_reads.sh new file mode 100755 index 00000000..63b7d501 --- /dev/null +++ b/bin/count_uniquely_mapped_reads.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to count multi mapped reads in .bam file +# +# Created by B. Mika-Gospodorz +# Date: 27th May 2020 +# +# Input files: $1 bam file +# $2 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh +# $3 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh +# $4 sample name +# $5 output file name +# Output: *_uniquely_mapped.txt file with list of uniquely mapped reads +# +#------------------------- + +bam_file=$1 +host_reference_names=$2 +pathogen_reference_names=$3 +sample_name=$4 +out_name=$5 + +# count reads that mapped uniquely onto host genome (-F 4 to extract mapped reads) +host=$(samtools view -F 4 -h $bam_file | grep -f $host_reference_names | fgrep -w NH:i:1 | echo "$sample_name host `wc -l`") +# count reads that mapped uniquely onto pathogen genome +pathogen=$(samtools view -F 4 -h $bam_file | grep -f $pathogen_reference_names | fgrep -w NH:i:1 | echo "$sample_name pathogen `wc -l`") +# combine information about host and pathogen uniquely mapped reads +counts="${host}\n${pathogen}" +# save into file +echo -e $counts > $out_name + + diff --git a/bin/extract_crossmapped_reads.py b/bin/extract_crossmapped_reads.py new file mode 100755 index 00000000..a70c0af5 --- /dev/null +++ b/bin/extract_crossmapped_reads.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Mon Sep 2 14:37:04 2019 + +@author: B.Mika-Gospodorz + +Input files: stdin with bam file with multi-mapped reads, reference_host_names.txt and reference_pathogen_names.txt files that contain references extracted with extract_reference_names_from_fasta_files.sh +Output: txt file with cross-mapped reads +Description: Used to identify and extract reads that mapped onto both host and pathogen genomes (cross-mapped reads). The script is executed by remove_crossmapped_reads_BAM.sh and remove_crossmapped_read_paires_BAM.sh scripts. +""" + +import argparse +import sys + +import pysam + + +# function to identify references given read mapped to +def check_reference_organisms(read_reference_name, host_reference_names, pathogen_reference_names): + reference = "" + if ( + read_reference_name in host_reference_names + ): # if reference of read is in the list of host references, set host as reference + reference = "host" + elif ( + read_reference_name in pathogen_reference_names + ): # if reference of read is in the list of pathogen references, set pathogen as reference + reference = "pathogen" + else: + print(("There is no " + read_reference_name + " in the reference name set")) + return reference + + +# function to add read and its reference to dictionary +def add_read(multimapped_reads, read_name, reference_name): + if read_name in multimapped_reads: # if read is in the dict, and reference is not defined, append the reference + if reference_name not in multimapped_reads[read_name]: + multimapped_reads[(read_name)].append((reference_name)) + else: # else create new key (read) and set reference as value + multimapped_reads[read_name] = [reference_name] + + +# function to find and save cross-mapped reads +def find_and_save_cross_mapped_reads(multimapped_reads_with_reference, output_file_name): + # extract reads with more than 1 reference name defined (cross-mapped reads) + crossmapeed_reads = [ + read_name + for read_name, reference_name in list(multimapped_reads_with_reference.items()) + if len(reference_name) > 1 + ] + # save cross-mapped reads + with open(output_file_name, "w") as f: + for cross_mapped_read in crossmapeed_reads: + f.write(str(cross_mapped_read) + "\n") + + +# function to identify and save cross-mapped reads +def read_reads_from_samfile(sam_file_name, host_reference_names, pathogen_reference_names, output_file_name): + # read bam file + samfile = pysam.AlignmentFile(sam_file_name, "rb") + # initialize dictionary of multi-mapped reads + multimapped_reads = dict() + for read in samfile: # iterate over reads from bam file + # find reference the read mapped to + reference_organisms = check_reference_organisms( + read.reference_name, host_reference_names, pathogen_reference_names + ) + # add read and reference to multimapped_reads dict. + add_read(multimapped_reads, read.query_name, reference_organisms) + # find and save cross-mapped reads + find_and_save_cross_mapped_reads(multimapped_reads, output_file_name) + + +parser = argparse.ArgumentParser() +parser.add_argument( + "-o", "--output", default="cross_mapped_reads.txt", metavar="output_file_name", help="output file name" +) +parser.add_argument( + "-h_ref", "--host_reference_names", metavar="", help="Path to reference_host_names.txt file" +) +parser.add_argument( + "-p_ref", + "--pathogen_reference_names", + metavar="", + help="Path to reference_pathogen_names.txt file", +) +args = parser.parse_args() + +# create list of host and pathogen chromosome/plasmid names +host_reference_names = [line.rstrip() for line in open(args.host_reference_names)] +pathogen_reference_names = [line.rstrip() for line in open(args.pathogen_reference_names)] + + +# identify and extract cross-mapped reads +read_reads_from_samfile(sys.stdin, host_reference_names, pathogen_reference_names, args.output) diff --git a/bin/extract_reference_names_from_fasta_files.sh b/bin/extract_reference_names_from_fasta_files.sh new file mode 100755 index 00000000..61476336 --- /dev/null +++ b/bin/extract_reference_names_from_fasta_files.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to extract chromosome and plasmid names from genome fasta files +# +# Created by B. Mika-Gospodorz +# Date: 5th September 2019 +# +# Input files: $1 name of output file, eg. reference_host_names.txt +# $@ list of fasta files +# Output: file with a given name defined in the first argument that contains names of chromosomes or plasmids defined in the fasta files +# +#------------------------- + +# assign output file name to variable +output_name=$1 +shift + +for file in "$@" #iterate over fasta files +do + grep ">" ${file} | awk -F" " '{print $1}' | sed 's/>//' >> ${output_name} # extract name followed '>' character and append to output file +done diff --git a/bin/mapping_stats.py b/bin/mapping_stats.py new file mode 100755 index 00000000..6abf47f8 --- /dev/null +++ b/bin/mapping_stats.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Dec 6 10:25:47 2019 + +@author: B. Mika-Gospodorz + +Input files: host and pathogen quantification tables (e.g. pathogen_quant_salmon.tsv, host_quantification_uniquely_mapped_htseq.tsv), raw read statistics, star statistics +Output file: tsv file +Description: Used to collect mapping and quantifications statistics for STAR, HTSeq, Salmon or HTSeq results. +""" + +import argparse + +import pandas as pd + + +# function to sum up number of mapped reads from quantification table +def mapping_stats(quantification_table_path, gene_attribute, organism): + # read quantification table + col_names = pd.read_csv(quantification_table_path, sep="\t", nrows=0).columns + types_dict = {gene_attribute: str} + types_dict.update({col: float for col in col_names if col not in types_dict}) + quantification_table = pd.read_csv(quantification_table_path, sep="\t", index_col=0, dtype=types_dict) + quantification_table = quantification_table.fillna(0) + # initialize dict. + quant_sum = {} + for sample_name in quantification_table: # iterate over columns of quantification_table + if ( + "NumReads" in sample_name + ): # for each column (sample) with 'NumReads' sum up the number of reads and add into quant_sum dict. + quant_sum.update({sample_name: sum(quantification_table[sample_name])}) + # create data frame from dict. + total_counts_pd = pd.DataFrame.from_dict(quant_sum, orient="index") + total_counts_pd.columns = [organism] + return total_counts_pd + + +parser = argparse.ArgumentParser(description="""collects and generates mapping statistics""") +parser.add_argument( + "-q_p", + "--quantification_table_pathogen", + metavar="", + default=".", + help="Path to pathogen quantification table; Salmon and HTSeq", +) +parser.add_argument( + "-q_h", + "--quantification_table_host", + metavar="", + default=".", + help="Path to host quantification table; Salmon and HTSeq", +) +parser.add_argument( + "-total_raw", + "--total_no_raw_reads", + metavar="", + default=".", + help="Path to table with total number of raw reads for each sample; Salmon and STAR", +) +parser.add_argument( + "-total_processed", + "--total_no_processed_reads", + metavar="", + help="Path to table with total number of processed reads by STAR or Salmon", +) +parser.add_argument( + "-m_u", + "--mapped_uniquely", + metavar="", + default=".", + help="Path to STAR mapping stats of uniquely mapped reads; STAR", +) +parser.add_argument( + "-m_m", + "--mapped_multi", + metavar="", + default=".", + help="Path to STAR mapping stats of multi mapped reads; STAR", +) +parser.add_argument( + "-c_m", + "--cross_mapped", + metavar="", + default=".", + help="Path to STAR mapping stats of cross_mapped reads; STAR", +) +parser.add_argument( + "-star", "--star_stats", metavar="", default=".", help="Path to mapping statistics of STAR; HTSeq" +) +parser.add_argument( + "-star_pr", + "--star_processed", + metavar="", + default=".", + help="Path to STAR stats of processed reads; Salmon in alignment-based mode", +) +parser.add_argument( + "-a", "--gene_attribute", default=".", help="gene attribute used in quantification; Salmon and HTSeq" +) +parser.add_argument("-t", "--tool", metavar="", help="salmon, salmon_alignment, htseq or star") +parser.add_argument("-o", "--output_dir", metavar="", help="output dir", default=".") +args = parser.parse_args() + +# collect statistics for Salmon Selective Alignment mode +if args.tool == "salmon": + # collect assigned pathogen reads + pathogen_total_counts = mapping_stats(args.quantification_table_pathogen, args.gene_attribute, "pathogen") + # collect assigned host reads + host_total_counts = mapping_stats(args.quantification_table_host, args.gene_attribute, "host") + # combine host and pathogen mapped reads + combined_total_mapped_reads = pd.concat([pathogen_total_counts, host_total_counts], axis=1) + # rename colnames - remove '_NumReads' suffix + new_index1 = [sample.split("_NumReads")[0] for sample in combined_total_mapped_reads.index] + combined_total_mapped_reads.index = new_index1 + # calculate total mapped reads + combined_total_mapped_reads["total_mapped_reads"] = combined_total_mapped_reads.sum(axis=1) + if args.total_no_raw_reads.endswith(".tsv"): # if tsv table is defined in total_no_raw_reads argument + # read total number of raw reads + total_reads = pd.read_csv(args.total_no_raw_reads, sep="\t", index_col=0, names=["total_raw_reads"]) + # read total number of reads processed by Salmon + processed_reads_salmon = pd.read_csv( + args.total_no_processed_reads, sep="\t", index_col=0, names=["processed_reads"] + ) + # combine statistics + results_df = pd.concat([combined_total_mapped_reads, processed_reads_salmon, total_reads], axis=1) + # calculate unmapped reads + results_df["unmapped_reads"] = results_df["processed_reads"] - results_df["total_mapped_reads"] + # calculate trimmed reads + results_df["trimmed_reads"] = results_df["total_raw_reads"] - results_df["processed_reads"] + else: # if tsv table is not defined in total_no_raw_reads argument + # read total number of reads processed by Salmon + processed_reads_salmon = pd.read_csv( + args.total_no_processed_reads, sep="\t", index_col=0, names=["processed_reads"] + ) + results_df = pd.concat([combined_total_mapped_reads, processed_reads_salmon], axis=1) + # calculate unmapped reads + results_df["unmapped_reads"] = results_df["processed_reads"] - results_df["total_mapped_reads"] + # save results + results_df2 = results_df.sort_index() + results_df2.to_csv(args.output_dir, sep="\t") + +# collect statistics for Salmon alignment-based mode +elif args.tool == "salmon_alignment": + # collect assigned pathogen reads + pathogen_total_counts = mapping_stats(args.quantification_table_pathogen, args.gene_attribute, "pathogen") + # collect assigned host reads + host_total_counts = mapping_stats(args.quantification_table_host, args.gene_attribute, "host") + # combine host and pathogen mapped reads + combined_total_assigned_reads = pd.concat([pathogen_total_counts, host_total_counts], axis=1) + # rename colnames - remove '_NumReads' suffix + new_index1 = [sample.split("_NumReads")[0] for sample in combined_total_assigned_reads.index] + combined_total_assigned_reads.index = new_index1 + # calculate total assigned reads + combined_total_assigned_reads["total_assigned_reads"] = combined_total_assigned_reads.sum(axis=1) + # extracted mapped reads from salmon log file + combined_total_mapped_reads = pd.read_csv( + args.total_no_processed_reads, sep="\t", index_col=0, names=["total_mapped_reads"] + ) + # read total number of reads processed by STAR + processed_reads_star = pd.read_csv(args.star_processed, sep="\t", index_col=0, names=["processed_reads"]) + if args.total_no_raw_reads.endswith(".tsv"): + # read total number of raw reads + total_reads = pd.read_csv(args.total_no_raw_reads, sep="\t", index_col=0, names=["total_raw_reads"]) + results_df = pd.concat( + [processed_reads_star, total_reads, combined_total_assigned_reads, combined_total_mapped_reads], axis=1 + ) + # calculate unmapped reads + results_df["unmapped_reads"] = results_df["processed_reads"] - results_df["total_mapped_reads"] + # calculate unassigned reads + results_df["unassigned_reads"] = results_df["total_mapped_reads"] - results_df["total_assigned_reads"] + # calculate trimmed reads + results_df["trimmed_reads"] = results_df["total_raw_reads"] - results_df["processed_reads"] + else: + results_df = pd.concat( + [processed_reads_star, combined_total_assigned_reads, combined_total_mapped_reads], axis=1 + ) + # calculate unmapped reads + results_df["unmapped_reads"] = results_df["processed_reads"] - results_df["total_mapped_reads"] + # calculate unassigned reads + results_df["unassigned_reads"] = results_df["total_mapped_reads"] - results_df["total_assigned_reads"] + # save results + results_df2 = results_df.sort_index() + results_df2.to_csv(args.output_dir, sep="\t") + +# collect statistics for HTSeq +elif args.tool == "htseq": + # collect assigned pathogen reads + pathogen_total_counts = mapping_stats( + args.quantification_table_pathogen, args.gene_attribute, "pathogen_assigned_reads" + ) + # collect assigned host reads + host_total_counts = mapping_stats(args.quantification_table_host, args.gene_attribute, "host_assigned_reads") + combined_total_mapped_reads = pd.concat([pathogen_total_counts, host_total_counts], axis=1) + # rename colnames - remove '_NumReads' suffix + new_index1 = [sample.split("_NumReads")[0] for sample in combined_total_mapped_reads.index] + combined_total_mapped_reads.index = new_index1 + # calculate total assigned reads + combined_total_mapped_reads["total_assigned_reads"] = combined_total_mapped_reads.sum(axis=1) + # read alignment statistics + star_stats = pd.read_csv(args.star_stats, sep="\t", index_col=0) + # combine statistics + results_df = pd.concat([star_stats, combined_total_mapped_reads], axis=1) + # calculate unassigned host reads + results_df["unassigned_host_reads"] = results_df["host_uniquely_mapped_reads"] - results_df["host_assigned_reads"] + # calculate unassigned pathogen reads + results_df["unassigned_pathogen_reads"] = ( + results_df["pathogen_uniquely_mapped_reads"] - results_df["pathogen_assigned_reads"] + ) + # save results + results_df2 = results_df.sort_index() + results_df2.to_csv(args.output_dir, sep="\t") + +# collect statistics for STAR +elif args.tool == "star": + # read total number of uniquely mapped reads + mapped_uniquely = pd.read_csv(args.mapped_uniquely, sep="\t", index_col=0) + # read total number of multi-mapped reads + mapped_multi = pd.read_csv(args.mapped_multi, sep="\t", index_col=0) + # read total number of cross-mapped reads + cross_mapped = pd.read_csv(args.cross_mapped, sep="\t", header=None, index_col=0, names=["cross_mapped_reads"]) + cross_mapped.index = [sample.replace("_cross_mapped_reads.txt", "") for sample in cross_mapped.index] + # combine statistics + combined_total_mapped_reads = pd.concat([mapped_uniquely, mapped_multi, cross_mapped], axis=1) + # calculate total mapped reads + combined_total_mapped_reads["total_mapped_reads"] = combined_total_mapped_reads.sum(axis=1) + if args.total_no_raw_reads.endswith(".tsv"): + # read total number of raw reads + total_reads = pd.read_csv(args.total_no_raw_reads, sep="\t", index_col=0, names=["total_raw_reads"]) + # read total number of processed reads + processed_reads_star = pd.read_csv( + args.total_no_processed_reads, sep="\t", index_col=0, names=["processed_reads"] + ) + # combine statistics + results_df = pd.concat([processed_reads_star, total_reads, combined_total_mapped_reads], axis=1) + # calculate unmapped reads + results_df["unmapped_reads"] = results_df["processed_reads"] - results_df["total_mapped_reads"] + # calculate trimmed reads + results_df["trimmed_reads"] = results_df["total_raw_reads"] - results_df["processed_reads"] + else: + # read total number of processed reads + processed_reads_star = pd.read_csv( + args.total_no_processed_reads, sep="\t", index_col=0, names=["processed_reads"] + ) + # combine statistics + results_df = pd.concat([processed_reads_star, combined_total_mapped_reads], axis=1) + # calculate unmapped reads + results_df["unmapped_reads"] = results_df["processed_reads"] - results_df["total_mapped_reads"] + # save results + results_df2 = results_df.sort_index() + results_df2.to_csv(args.output_dir, sep="\t") diff --git a/bin/plot_mapping_stats_htseq.py b/bin/plot_mapping_stats_htseq.py new file mode 100755 index 00000000..55da7593 --- /dev/null +++ b/bin/plot_mapping_stats_htseq.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Jun 3 18:23:21 2020 + +@author: B. Mika-Gospodorz + +Input files: tsv file +Output file: tsv and pdf files +Description: Used to plot mapping statistics for HTSeq +""" + +import argparse + +import matplotlib +import pandas as pd + +# do not use Xwindows backend +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np + + +# function to plot mapping statistics +def plot_mapping_stats(df_comb, no_samples, profile, x_lab, xticks_np, percentage, n_col, m): + # define color palette + my_cmap2 = matplotlib.cm.get_cmap("tab20") + my_cmap4 = matplotlib.cm.get_cmap("tab20c") + color = [ + "#af6f09", + "#840000", + "#f9bc08", + "#ef4026", + "#f7d560", + "#ff796c", + my_cmap4.colors[6], + my_cmap2.colors[15], + my_cmap2.colors[14], + ] + # reverse sample order + df_comb = df_comb.loc[reversed(df_comb.index)] + # make plot + fig = df_comb.plot( + kind="barh", stacked=True, figsize=(60, no_samples + 3), legend=True, color=color, width=0.8, fontsize=40 + ) + fig.spines["top"].set_visible(False) + fig.spines["right"].set_visible(False) + fig.spines["bottom"].set_visible(True) + fig.spines["left"].set_visible(False) + # set x label + plt.xlabel(x_lab, fontsize=40, labelpad=15) + # set x label ticks + plt.xticks(xticks_np) + if percentage == False: # set scientific format of label ticks if there is no percentage values + fig.ticklabel_format(style="scientific", axis="x", scilimits=(m, m)) + fig.xaxis.major.formatter._useMathText = True + tx = fig.xaxis.get_offset_text() + tx.set_fontsize(40) + plt.xticks(xticks_np, rotation=90, fontsize=40) + plt.tick_params(axis="x", which="major", pad=10) + # set legent position and format + fig.legend( + loc="upper center", + ncol=n_col, + bbox_to_anchor=(0.5, 1.4), + bbox_transform=plt.gcf().transFigure, + frameon=False, + prop={"size": 50}, + ) + # set sample names as y labels + fig.set_yticklabels(df_comb.index) + # save used table + df_comb.to_csv("mapping_stats_" + profile + ".tsv", sep="\t") + # save plot + plt.savefig( + "mapping_stats_" + profile + ".pdf", dpi=300, orientation="landscape", transparent=False, bbox_inches="tight" + ) + + +parser = argparse.ArgumentParser(description="""plot mapping statistics""") +parser.add_argument( + "-i", "--input_files", metavar="", default="*.tsv", help="Path to htseq mapping statistics tsv table " +) +args = parser.parse_args() + + +# read mapping statistic table as data frame +df_stats = pd.read_csv(args.input_files, sep="\t", index_col=0) + +# extract number of samples +no_samples = df_stats.shape[0] + + +# calculate percentage of mapping staistics +if "trimmed_reads" in df_stats.columns: # if statistics contain information on number of trimmed reads + pathogen_uniquely_mapped_percent = (df_stats["pathogen_assigned_reads"] / df_stats["total_raw_reads"]) * 100 + host_uniquely_mapped_percent = (df_stats["host_assigned_reads"] / df_stats["total_raw_reads"]) * 100 + unassigned_pathogen_reads = (df_stats["unassigned_pathogen_reads"] / df_stats["total_raw_reads"]) * 100 + unassigned_host_reads = (df_stats["unassigned_host_reads"] / df_stats["total_raw_reads"]) * 100 + pathogen_multi_mapped_percent = (df_stats["pathogen_multi_mapped_reads"] / df_stats["total_raw_reads"]) * 100 + host_multi_mapped_percent = (df_stats["host_multi_mapped_reads"] / df_stats["total_raw_reads"]) * 100 + cross_mapped_percent = (df_stats["cross_mapped_reads"] / df_stats["total_raw_reads"]) * 100 + unmapped_percent = (df_stats["unmapped_reads"] / df_stats["total_raw_reads"]) * 100 + trimmed_percent = (df_stats["trimmed_reads"] / df_stats["total_raw_reads"]) * 100 + + df_comb = pd.concat( + [ + host_uniquely_mapped_percent, + pathogen_uniquely_mapped_percent, + unassigned_host_reads, + unassigned_pathogen_reads, + host_multi_mapped_percent, + pathogen_multi_mapped_percent, + cross_mapped_percent, + unmapped_percent, + trimmed_percent, + ], + axis=1, + ) + df_comb.columns = [ + "host quantified reads", + "pathogen quantified reads", + "host unquantified reads", + "pathogen unquantified reads", + "host multi-mapped reads", + "pathogen multi-mapped reads", + "cross-mapped reads", + "unmapped reads", + "trimmed reads", + ] + # define no. of columns of plot legend + n_col = 5 +else: + pathogen_uniquely_mapped_percent = (df_stats["pathogen_assigned_reads"] / df_stats["processed_reads"]) * 100 + host_uniquely_mapped_percent = (df_stats["host_assigned_reads"] / df_stats["processed_reads"]) * 100 + unassigned_pathogen_reads = (df_stats["unassigned_pathogen_reads"] / df_stats["processed_reads"]) * 100 + unassigned_host_reads = (df_stats["unassigned_host_reads"] / df_stats["processed_reads"]) * 100 + pathogen_multi_mapped_percent = (df_stats["pathogen_multi_mapped_reads"] / df_stats["processed_reads"]) * 100 + host_multi_mapped_percent = (df_stats["host_multi_mapped_reads"] / df_stats["processed_reads"]) * 100 + cross_mapped_percent = (df_stats["cross_mapped_reads"] / df_stats["processed_reads"]) * 100 + unmapped_percent = (df_stats["unmapped_reads"] / df_stats["processed_reads"]) * 100 + df_comb = pd.concat( + [ + host_uniquely_mapped_percent, + pathogen_uniquely_mapped_percent, + unassigned_host_reads, + unassigned_pathogen_reads, + host_multi_mapped_percent, + pathogen_multi_mapped_percent, + cross_mapped_percent, + unmapped_percent, + ], + axis=1, + ) + df_comb.columns = [ + "host quantified reads", + "pathogen quantified reads", + "host unquantified reads", + "pathogen unquantified reads", + "host multi-mapped reads", + "pathogen multi-mapped reads", + "cross-mapped reads", + "unmapped reads", + ] + # define no. of columns of plot legend + n_col = 4 + + +# plot mapping statistics expressed as percentages +plot_mapping_stats( + df_comb, no_samples, "samples_percentage", "[ % ]", np.arange(0, 105, step=5), percentage=True, n_col=n_col, m=0 +) + + +# plot mapping statistics +if "trimmed_reads" in df_stats.columns: # if no. of trimmed reads is defined + df_comb = pd.concat( + [ + df_stats["host_assigned_reads"], + df_stats["pathogen_assigned_reads"], + df_stats["unassigned_host_reads"], + df_stats["unassigned_pathogen_reads"], + df_stats["host_multi_mapped_reads"], + df_stats["pathogen_multi_mapped_reads"], + df_stats["cross_mapped_reads"], + df_stats["unmapped_reads"], + df_stats["trimmed_reads"], + ], + axis=1, + ) + df_comb.columns = [ + "host quantified reads", + "pathogen quantified reads", + "host unquantified reads", + "pathogen unquantified reads", + "host multi-mapped reads", + "pathogen multi-mapped reads", + "cross-mapped reads", + "unmapped reads", + "trimmed reads", + ] + # calculate max total no. of reads and define x label values, round to 7 digits + max_limit = round(df_stats["total_raw_reads"].max(), -6) + if max_limit > 0: # if total number of reads is higher than 10^6 specify label ticks adjusted to this magnitude + # divide max_limit by 20 (number of label ticks) to get step + step2 = int(max_limit / 20) + # define label ticks + array_labels = np.arange(0, max_limit + step2, step=step2) + array_labels2 = np.around(array_labels) + # set m value to specify format of scientific format of x label ticks + m = 6 + else: # define label ticks if number of reads is smaller than 10^6 + step = int(df_stats["total_raw_reads"].max() / 20) + array_labels = np.arange(0, df_stats["total_raw_reads"].max() + step, step=step) + array_labels2 = np.around(array_labels) + # set m value to specify format of scientific format of x label ticks + m = 0 + # define no. of columns of plot legend + n_col = 5 +else: + df_comb = pd.concat( + [ + df_stats["host_assigned_reads"], + df_stats["pathogen_assigned_reads"], + df_stats["unassigned_host_reads"], + df_stats["unassigned_pathogen_reads"], + df_stats["host_multi_mapped_reads"], + df_stats["pathogen_multi_mapped_reads"], + df_stats["cross_mapped_reads"], + df_stats["unmapped_reads"], + ], + axis=1, + ) + df_comb.columns = [ + "host quantified reads", + "pathogen quantified reads", + "host unquantified reads", + "pathogen unquantified reads", + "host multi-mapped reads", + "pathogen multi-mapped reads", + "cross-mapped reads", + "unmapped reads", + ] + # calculate max total no. of processed reads and define x label values, round to 7 digits + max_limit = round(df_stats["processed_reads"].max(), -6) + if max_limit > 0: # if total number of reads is higher than 10^6 specify label ticks adjusted to this magnitude + # divide max_limit by 20 (number of label ticks) to get step + step2 = int(max_limit / 20) + # define label ticks + array_labels = np.arange(0, max_limit + step2, step=step2) + array_labels2 = np.around(array_labels) + # set m value to specify format of scientific format of x label ticks + m = 6 + else: # define label ticks if number of reads is smaller than 10^6 + step = int(df_stats["processed_reads"].max() / 20) + array_labels = np.arange(0, df_stats["processed_reads"].max() + step, step=step) + array_labels2 = np.around(array_labels) + # set m value to specify format of scientific format of x label ticks + m = 0 + # define no. of columns of plot legend + n_col = 4 + + +# plot mapping statistics +plot_mapping_stats( + df_comb, no_samples, "samples_total_reads", "Number of reads", array_labels2, percentage=False, n_col=n_col, m=m +) diff --git a/bin/plot_mapping_stats_star.py b/bin/plot_mapping_stats_star.py new file mode 100755 index 00000000..e2a9a77d --- /dev/null +++ b/bin/plot_mapping_stats_star.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue May 26 15:51:22 2020 + +@author: B. Mika-Gospodorz + +Input files: tsv file +Output file: tsv and pdf files +Description: Used to plot mapping statistics for STAR +""" + +import argparse + +import matplotlib +import pandas as pd + +# do not use Xwindows backend +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import numpy as np + + +# function to plot mapping statistics +def plot_mapping_stats(df_comb, no_samples, profile, x_lab, xticks_np, percentage, m): + # define color palette + my_cmap2 = matplotlib.cm.get_cmap("tab20") + my_cmap4 = matplotlib.cm.get_cmap("tab20c") + color = [ + my_cmap4.colors[12], + my_cmap4.colors[13], + "#4984b8", + "#95d0fc", + my_cmap4.colors[6], + my_cmap2.colors[15], + my_cmap2.colors[14], + ] + # reverse sample order + df_comb = df_comb.loc[reversed(df_comb.index)] + # make plot + fig = df_comb.plot( + kind="barh", stacked=True, figsize=(60, no_samples + 3), legend=True, color=color, width=0.8, fontsize=40 + ) + fig.spines["top"].set_visible(False) + fig.spines["right"].set_visible(False) + fig.spines["bottom"].set_visible(True) + fig.spines["left"].set_visible(False) + # set x label + plt.xlabel(x_lab, fontsize=40, labelpad=15) + # set x label ticks + plt.xticks(xticks_np) + if percentage == False: # set scientific format of label ticks if there is no percentage values + fig.ticklabel_format(style="scientific", axis="x", scilimits=(m, m)) + fig.xaxis.major.formatter._useMathText = True + tx = fig.xaxis.get_offset_text() + tx.set_fontsize(40) + plt.xticks(xticks_np, rotation=90, fontsize=40) + plt.tick_params(axis="x", which="major", pad=10) + # set legent position and format + fig.legend( + loc="upper center", + ncol=4, + bbox_to_anchor=(0.5, 1.4), + bbox_transform=plt.gcf().transFigure, + frameon=False, + prop={"size": 50}, + ) + # set sample names as y labels + fig.set_yticklabels(df_comb.index) + # save used table + df_comb.to_csv("mapping_stats_" + profile + ".tsv", sep="\t") + # save plot + plt.savefig( + "mapping_stats_star_" + profile + ".pdf", + dpi=300, + orientation="landscape", + transparent=False, + bbox_inches="tight", + ) + + +parser = argparse.ArgumentParser(description="""plot mapping statistics""") +parser.add_argument( + "-i", + "--input_files", + metavar="", + default="*.csv", + help="Path to the outputfiles from mapping statistics ", +) +args = parser.parse_args() + + +# read mapping statistic table as data frame +df_stats = pd.read_csv(args.input_files, sep="\t", index_col=0) + +# extract number of samples +no_samples = df_stats.shape[0] + + +# calculate percentage of mapping staistics +if "trimmed_reads" in df_stats.columns: + pathogen_uniquely_mapped_percent = (df_stats["pathogen_uniquely_mapped_reads"] / df_stats["total_raw_reads"]) * 100 + host_uniquely_mapped_percent = (df_stats["host_uniquely_mapped_reads"] / df_stats["total_raw_reads"]) * 100 + pathogen_multi_mapped_percent = (df_stats["pathogen_multi_mapped_reads"] / df_stats["total_raw_reads"]) * 100 + host_multi_mapped_percent = (df_stats["host_multi_mapped_reads"] / df_stats["total_raw_reads"]) * 100 + cross_mapped_percent = (df_stats["cross_mapped_reads"] / df_stats["total_raw_reads"]) * 100 + unmapped_percent = (df_stats["unmapped_reads"] / df_stats["total_raw_reads"]) * 100 + trimmed_percent = (df_stats["trimmed_reads"] / df_stats["total_raw_reads"]) * 100 + + df_comb = pd.concat( + [ + host_uniquely_mapped_percent, + host_multi_mapped_percent, + pathogen_uniquely_mapped_percent, + pathogen_multi_mapped_percent, + cross_mapped_percent, + unmapped_percent, + trimmed_percent, + ], + axis=1, + ) + df_comb.columns = [ + "host uniquely mapped reads", + "host multi-mapped reads", + "pathogen uniquely mapped reads", + "pathogen multi-mapped reads", + "cross-mapped reads", + "unmapped reads", + "trimmed reads", + ] +else: + pathogen_uniquely_mapped_percent = (df_stats["pathogen_uniquely_mapped_reads"] / df_stats["processed_reads"]) * 100 + host_uniquely_mapped_percent = (df_stats["host_uniquely_mapped_reads"] / df_stats["processed_reads"]) * 100 + pathogen_multi_mapped_percent = (df_stats["pathogen_multi_mapped_reads"] / df_stats["processed_reads"]) * 100 + host_multi_mapped_percent = (df_stats["host_multi_mapped_reads"] / df_stats["processed_reads"]) * 100 + cross_mapped_percent = (df_stats["cross_mapped_reads"] / df_stats["processed_reads"]) * 100 + unmapped_percent = (df_stats["unmapped_reads"] / df_stats["processed_reads"]) * 100 + + df_comb = pd.concat( + [ + host_uniquely_mapped_percent, + host_multi_mapped_percent, + pathogen_uniquely_mapped_percent, + pathogen_multi_mapped_percent, + cross_mapped_percent, + unmapped_percent, + ], + axis=1, + ) + df_comb.columns = [ + "host uniquely mapped reads", + "host multi-mapped reads", + "pathogen uniquely mapped reads", + "pathogen multi-mapped reads", + "cross-mapped reads", + "unmapped reads", + ] + +# plot mapping statistics expressed as percentages +plot_mapping_stats(df_comb, no_samples, "samples_percentage", "[ % ]", np.arange(0, 105, step=5), percentage=True, m=0) + + +# plot mapping statistics +if "trimmed_reads" in df_stats.columns: # if no. of trimmed reads is defined + df_comb = pd.concat( + [ + df_stats["host_uniquely_mapped_reads"], + df_stats["host_multi_mapped_reads"], + df_stats["pathogen_uniquely_mapped_reads"], + df_stats["pathogen_multi_mapped_reads"], + df_stats["cross_mapped_reads"], + df_stats["unmapped_reads"], + df_stats["trimmed_reads"], + ], + axis=1, + ) + df_comb.columns = [ + "host uniquely mapped reads", + "host multi-mapped reads", + "pathogen uniquely mapped reads", + "pathogen multi-mapped reads", + "cross-mapped reads", + "unmapped reads", + "trimmed reads", + ] + # calculate max total no. of reads and define x label values, round to 7 digits + max_limit = round(df_stats["total_raw_reads"].max(), -6) + if max_limit > 0: # if total number of reads is higher than 10^6 specify label ticks adjusted to this magnitude + # divide max_limit by 20 (number of label ticks) to get step + step2 = int(max_limit / 20) + # define label ticks + array_labels = np.arange(0, max_limit + step2, step=step2) + array_labels2 = np.around(array_labels) + # set m value to specify format of scientific format of x label ticks + m = 6 + else: # define label ticks if number of reads is smaller than 10^6 + step = int(df_stats["total_raw_reads"].max() / 20) + array_labels = np.arange(0, df_stats["total_raw_reads"].max() + step, step=step) + array_labels2 = np.around(array_labels) + # set m value to specify format of scientific format of x label ticks + m = 0 +else: + df_comb = pd.concat( + [ + df_stats["host_uniquely_mapped_reads"], + df_stats["host_multi_mapped_reads"], + df_stats["pathogen_uniquely_mapped_reads"], + df_stats["pathogen_multi_mapped_reads"], + df_stats["cross_mapped_reads"], + df_stats["unmapped_reads"], + ], + axis=1, + ) + df_comb.columns = [ + "host uniquely mapped reads", + "host multi-mapped reads", + "pathogen uniquely mapped reads", + "pathogen multi-mapped reads", + "cross-mapped reads", + "unmapped reads", + ] + # calculate max total no. of processed reads and define x label values, round to 7 digits + max_limit = round(df_stats["processed_reads"].max(), -6) + if max_limit > 0: # if total number of reads is higher than 10^6 specify label ticks adjusted to this magnitude + # divide max_limit by 20 (number of label ticks) to get step + step2 = int(max_limit / 20) + # define label ticks + array_labels = np.arange(0, max_limit + step2, step=step2) + array_labels2 = np.around(array_labels) + # set m value to specify format of scientific format of x label ticks + m = 6 + else: # define label ticks if number of reads is smaller than 10^6 + step = int(df_stats["processed_reads"].max() / 20) + array_labels = np.arange(0, df_stats["processed_reads"].max() + step, step=step) + array_labels2 = np.around(array_labels) + # set m value to specify format of scientific format of x label ticks + m = 0 + +# plot mapping statistics +plot_mapping_stats(df_comb, no_samples, "samples_total_reads", "Number of reads", array_labels2, percentage=False, m=m) diff --git a/bin/remove_crossmapped_read_pairs_BAM.sh b/bin/remove_crossmapped_read_pairs_BAM.sh new file mode 100755 index 00000000..34bbd0be --- /dev/null +++ b/bin/remove_crossmapped_read_pairs_BAM.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to remove cross-mapped reads from bam file +# +# Created by B. Mika-Gospodorz +# Date: 17th July 2020 +# +# Input files: $1 bam file +# $2 directory where the extract_crossmapped_reads.py script is stored +# $3 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh +# $4 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh +# $5 output file name for list of cross-mapped reads, e.g. *_cross_mapped_reads.txt +# $6 output file name for bam file with removed cross-mapped reads, e.g. *_no_crossmapped.bam +# Output: *_cross_mapped_reads.txt, *_no_crossmapped.bam +# +#------------------------- + + +alignment=$1 +extract_crossmapped_reads_script_path=$2 +host_reference=$3 +pathogen_reference=$4 +out_name=$5 + +# extract multi-mapped reads from bam file (-f 0x2 - read mapped in proper pair) and run extract_crossmapped_reads.py to extract reads that mapped onto both host and pathogen genomes simultaneously +samtools view -f 0x2 -h $alignment | fgrep -vw NH:i:1 | python $extract_crossmapped_reads_script_path/extract_crossmapped_reads.py -h_ref $host_reference -p_ref $pathogen_reference -o $out_name + +cross_mapped_fragments=$out_name +out_bam_name=$6 + +# remove cross-mapped reads from bam file and create a new one +samtools view -h $alignment | fgrep -wvf $cross_mapped_fragments | samtools view -bS -o $out_bam_name - diff --git a/bin/remove_crossmapped_reads_BAM.sh b/bin/remove_crossmapped_reads_BAM.sh new file mode 100755 index 00000000..5dab4677 --- /dev/null +++ b/bin/remove_crossmapped_reads_BAM.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to remove cross-mapped reads from bam file +# +# Created by B. Mika-Gospodorz +# Date: 2nd June 2020 +# +# Input files: $1 bam file +# $2 directory where the extract_crossmapped_reads.py script is stored +# $3 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh +# $4 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh +# $5 output file name for list of cross-mapped reads, e.g. *_cross_mapped_reads.txt +# $6 output file name for bam file with removed cross-mapped reads, e.g. *_no_crossmapped.bam +# Output: *_cross_mapped_reads.txt, *_no_crossmapped.bam +# +#------------------------- + +alignment=$1 +extract_crossmapped_reads_script_path=$2 +host_reference=$3 +pathogen_reference=$4 +out_name=$5 + +# extract multi-mapped reads from bam file and run extract_crossmapped_reads.py to extract reads that mapped onto both host and pathogen genomes simultaneously +samtools view -F 4 -h $alignment | fgrep -vw NH:i:1 | python $extract_crossmapped_reads_script_path/extract_crossmapped_reads.py -h_ref $host_reference -p_ref $pathogen_reference -o $out_name + + +cross_mapped_reads=$out_name +out_bam_name=$6 + +# remove cross-mapped reads from bam file and create a new one +samtools view -h $alignment | fgrep -wvf $cross_mapped_reads | samtools view -bS -o $out_bam_name - + diff --git a/bin/scatter_plots.py b/bin/scatter_plots.py new file mode 100755 index 00000000..061a7ca7 --- /dev/null +++ b/bin/scatter_plots.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Apr 9 22:56:24 2020 + +@author: B. Mika-Gospodorz + +Input files: quantification tsv file +Output file: pdf files +Description: Used to plot scatter plots of technical replicates +""" + + +import argparse + +import matplotlib +import pandas as pd + +# do not use Xwindows backend +matplotlib.use("Agg") +import itertools +import math + +import matplotlib.pyplot as plt +import numpy as np +import seaborn as sns +from scipy import stats + + +# function to plot scatter plots +def scatter_plot(TPM_log, TPMs, name_1, name_2, lim_plot, organism): + g1 = sns.JointGrid("rep1", y="rep2", data=TPM_log, height=6) + g1 = g1.plot_joint(plt.scatter, edgecolor="black", linewidth=0.5) + # calculate pearson's correlation coefficient + stat = lambda a, b: stats.pearsonr(TPMs.rep1, TPMs.rep2) + # add label with pearson's correlation coefficient + g1 = g1.annotate( + stat, template="{stat}: {val:.4f}", stat="Pearson's r", loc="upper left", fontsize=15 + ) # {stat}: {val:.2f} (p = {p:.3g}) with p-value + g1.ax_marg_x.set_axis_off() + g1.ax_marg_y.set_axis_off() + plt.xlabel(r"$\mathrm{log_{10}TPM}$" + "\n" + "\n" + name_1.split("_TPM")[0], fontsize=15, labelpad=5) + plt.ylabel(name_2.split("_TPM")[0] + "\n" + "\n" + r"$\mathrm{log_{10}TPM}$", fontsize=15, labelpad=5) + plt.xlim(-0.15, lim_plot) + plt.ylim(-0.15, lim_plot) + plt.title(organism, fontsize=15, fontweight="bold") + plt.tick_params(axis="both", labelsize=15) + plt.subplots_adjust(bottom=0.1) + # save plot + plt.savefig("scatter_plot_" + name_1 + "_" + name_2 + "_" + organism + ".pdf", dpi=300, bbox_inches="tight") + plt.close(plt.gcf()) + + +parser = argparse.ArgumentParser(description="""Plots scatter plots of replicates""") +parser.add_argument( + "-q", "--quantification_table", metavar="", help="Path to the quantification table" +) +parser.add_argument("-a", "--gene_attribute", help="gene attribute") +parser.add_argument("-org", "--organism", help="host or pathogen") + + +args = parser.parse_args() +quantification_table_path = args.quantification_table +gene_attribute = args.gene_attribute + +# read quantification table as data frame +col_names = pd.read_csv(quantification_table_path, sep="\t", nrows=0).columns +types_dict = {gene_attribute: str} +types_dict.update({col: float for col in col_names if col not in types_dict}) +quantification_table = pd.read_csv(quantification_table_path, sep="\t", index_col=0, dtype=types_dict) + +# extract columns with 'TPM' suffix +TMP_column = [column for column in quantification_table.columns if "TPM" in column] + +# find axes' limits +TPM_table_plus_1 = quantification_table[TMP_column] + 1 +TPM_table_log = TPM_table_plus_1.apply(np.log10, axis=1) +max_TPM = TPM_table_log.max().max() +lim_plot = math.ceil(max_TPM) + 0.5 + + +# remove '_TPM' suffix from sample names +columns_conditions_no_TPM = [column[:-4] for column in TMP_column] + +# extract conditions - remove indication of which replicate it is (_1, _2 or _3) +columns_conditions = [name.rsplit("_", 1)[0] for name in columns_conditions_no_TPM] +# find positions of technical replicates in quantification table +patterns = {} +for i in range(0, len(columns_conditions)): # iterate over number of samples + prefix = columns_conditions[i] # extract condition + d1 = { + prefix: [i] + } # dictionary with condition and its position in columns conditions, which is equivalent to positions in quantification table + if prefix in patterns.keys(): # if condition already exist in the dictionary, then append the position + patterns[prefix].insert(patterns[prefix][0], list(d1.values())[0][0]) + else: # append condition and its position + patterns.update(d1) + + +# create list of replicates' position lists, e.g. [[1, 2, 0], [3, 4, 5], [6, 7, 8]] +condition_list = [patterns[condition] for condition in patterns.keys()] +# plot scatter plots of technical replicates +for cond in condition_list: # iterate over list of technical replicates' positions + if len(cond) > 1: + sample_cond = [TMP_column[c] for c in cond] # extract column names for replicates in quantification table + TPMs = quantification_table[sample_cond] + combinations = list(itertools.combinations(TPMs, 2)) # create list of possible combinations of sample pairs + for com in combinations: # iterate over sample pair combinations + TPMs = pd.concat( + [quantification_table[com[0]], quantification_table[com[1]]], axis=1 + ) # extract TPM values of two replicates + TPMs.columns = ["rep1", "rep2"] + TPM_plus1 = TPMs + 1 # add 1 to deal with 0 values + TPM_log = TPM_plus1.apply(np.log10, axis=1) # log-transformation + # plot scatter plot + scatter_plot(TPM_log, TPMs, com[0], com[1], lim_plot, args.organism) diff --git a/bin/split_quant_tables.sh b/bin/split_quant_tables.sh new file mode 100755 index 00000000..d8ed6a96 --- /dev/null +++ b/bin/split_quant_tables.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +#------------------------- +# +# Description: +# Script to split HTSeq quantification tables into host and pathogen results +# +# Created by B. Mika-Gospodorz +# Date: 3rd June 2020 +# +# Input files: $1 HTSeq quantification table +# $2 tsv file that contains host annotations extracted with extract_annotations_from_gff.py +# $3 tsv file that contains pathogen annotations extracted with extract_annotations_from_gff.py +# $4 output file name +# Output: host_* and pathogen_* tsv files defined in the 4th argument +# +#------------------------- + +quant_table=$1 +host_annotations=$2 +pathogen_annotations=$3 +out_name=$4 + +# extract host quantification results based on gene annotation from tsv file (extract first column from host annotation table with gene names and extract quantification results for them) +awk -F"\t" '!(NR<=1) {print $1}' $host_annotations | awk 'NR==FNR{a[$0]=$0}NR>FNR{if($1==a[$1])print $0}' - $quant_table > host_quant +# copy header from quantification table to host quantification table +awk 'NR==1 {print; exit}' $quant_table | cat - host_quant > host_$out_name + + +# extract pathogen quantification results based on gene annotation from tsv file +awk -F"\t" '!(NR<=1) {print $1}' $pathogen_annotations | awk 'NR==FNR{a[$0]=$0}NR>FNR{if($1==a[$1])print $0}' - $quant_table > pathogen_quant +# copy header from quantification table to pathogen quantification table +awk 'NR==1 {print; exit}' $quant_table | cat - pathogen_quant > pathogen_$out_name diff --git a/conf/modules.config b/conf/modules.config index 6cbb04cd..e11a6b1f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -196,6 +196,40 @@ process { ] } + withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR_HTSEQ:HTSEQ_COUNT' { + ext.args = [ + "-t quant", + "-f bam", + "-r pos", + "--max-reads-in-buffer=${params.max_reads_in_buffer}", + "-a ${params.minaqual}"].join(' ').trim() + } + + withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:STAR:STAR_ALIGN' { + ext.args = ['--readFilesCommand gunzip -c', + "--outSAMunmapped ${params.outSAMunmapped}", + "--outSAMattributes ${params.outSAMattributes}", + "--sjdbGTFfeatureExon ${params.sjdbGTFfeatureExon}", + "--sjdbGTFtagExonParentTranscript ${params.sjdbGTFtagExonParentTranscript}", + "--quantMode ${params.quantMode}", + "--quantTranscriptomeBan ${params.quantTranscriptomeBan}", + "--outFilterMultimapNmax ${params.outFilterMultimapNmax}", + "--outFilterType ${params.outFilterType}", + "--limitBAMsortRAM ${params.limitBAMsortRAM}", + "--alignSJoverhangMin ${params.alignSJoverhangMin}", + "--alignSJDBoverhangMin ${params.alignSJDBoverhangMin}", + "--outFilterMismatchNmax ${params.outFilterMismatchNmax}", + "--outFilterMismatchNoverReadLmax ${params.outFilterMismatchNoverReadLmax}", + "--alignIntronMin ${params.alignIntronMin}", + "--alignIntronMax ${params.alignIntronMax}", + "--alignMatesGapMax ${params.alignMatesGapMax}", + "--winAnchorMultimapNmax ${params.winAnchorMultimapNmax}"].join(' ').trim() + publishDir = [ + path: { "${params.outdir}/STAR_SALMON/star/" }, + mode: params.publish_dir_mode + ] + } + withName: 'NFCORE_DUALRNASEQ:DUALRNASEQ:SALMON_ALIGNMENT_BASED:SALMON_QUANT' { publishDir = [ diff --git a/conf/test_hackathon.config b/conf/test_hackathon.config index 1f29e568..9cc8b3c1 100644 --- a/conf/test_hackathon.config +++ b/conf/test_hackathon.config @@ -32,5 +32,5 @@ params { fasta_pathogen = "https://github.com/nf-core/test-datasets/raw/dualrnaseq/references/SL1344_sub.fasta" gff_pathogen = "https://github.com/nf-core/test-datasets/raw/dualrnaseq/references/SL1344_sub.gff3" - + mapping_statistics = true } diff --git a/modules.json b/modules.json index b6029734..30bdbe9f 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bbmap/bbduk": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", @@ -20,6 +25,11 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, + "htseq/count": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", @@ -45,9 +55,9 @@ "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", "installed_by": ["modules"] }, - "unzip": { + "unzipfiles": { "branch": "master", - "git_sha": "5e7b1ef9a5a2d9258635bcbf70fcf37dacd1b247", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] } } diff --git a/modules/local/check_replicates/main.nf b/modules/local/check_replicates/main.nf new file mode 100644 index 00000000..8a5e10a7 --- /dev/null +++ b/modules/local/check_replicates/main.nf @@ -0,0 +1,21 @@ +process CHECK_REPLICATES { + tag "check_replicates" + + label 'process_high' + + input: + val(sample_name) from scatter_plots.collect() + + output: + stdout repl_scatter_plots_salmon_pathogen + stdout repl_scatter_plots_salmon_host + stdout repl_scatter_plots_salmon_alignment_host + stdout repl_scatter_plots_salmon_alignment_pathogen + stdout repl_scatter_plots_htseq_pathogen + stdout repl_scatter_plots_htseq_host + + shell: + ''' + python !{workflow.projectDir}/bin/check_replicates.py -s !{sample_name} 2>&1 + ''' +} \ No newline at end of file diff --git a/modules/local/collect_processed_reads_star/main.nf b/modules/local/collect_processed_reads_star/main.nf new file mode 100644 index 00000000..b7492031 --- /dev/null +++ b/modules/local/collect_processed_reads_star/main.nf @@ -0,0 +1,23 @@ +process COLLECT_PROCESSED_READS_STAR { + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + tag "collect_processed_reads_STAR" + + label 'process_high' + + input: + path(process_reads) + + output: + path("processed_reads_star.tsv"), emit: tsv + path "versions.yml" , emit: versions + + script: + """ + cat $process_reads > processed_reads_star.tsv + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/collect_stats_star_multi_mapped/main.nf b/modules/local/collect_stats_star_multi_mapped/main.nf new file mode 100644 index 00000000..fa4b5a76 --- /dev/null +++ b/modules/local/collect_stats_star_multi_mapped/main.nf @@ -0,0 +1,27 @@ +process COLLECT_STATS_STAR_MULTI_MAPPED { + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + tag "collect_multi_mapped_reads_STAR" + + label 'process_high' + + input: + path stats + + output: + path "multi_mapped_reads_star.tsv", emit: tsv + path "versions.yml" , emit: versions + + script: + """ + python $workflow.projectDir/bin/combine_tables.py \\ + -i $stats \\ + -o multi_mapped_reads_star.tsv \\ + -s multi_mapped_reads + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version + END_VERSIONS + """ +} diff --git a/modules/local/collect_stats_star_uniquely_mapped/main.nf b/modules/local/collect_stats_star_uniquely_mapped/main.nf new file mode 100644 index 00000000..7c1d87a8 --- /dev/null +++ b/modules/local/collect_stats_star_uniquely_mapped/main.nf @@ -0,0 +1,27 @@ +process COLLECT_STATS_STAR_UNIQUELY_MAPPED { + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + tag "collect_uniq_mapped_reads_STAR" + + label 'process_high' + + input: + path stats + + output: + path "uniquely_mapped_reads_star.tsv", emit: tsv + path "versions.yml" , emit: versions + + script: + """ + python $workflow.projectDir/bin/combine_tables.py \\ + -i $stats \\ + -o uniquely_mapped_reads_star.tsv \\ + -s uniquely_mapped_reads + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/combine_annotations_quant_uniquely_mapped_host/main.nf b/modules/local/combine_annotations_quant_uniquely_mapped_host/main.nf new file mode 100644 index 00000000..c3b317cb --- /dev/null +++ b/modules/local/combine_annotations_quant_uniquely_mapped_host/main.nf @@ -0,0 +1,32 @@ +process COMBINE_ANNOTATIONS_QUANT_UNIQUELY_MAPPED_HOST { + tag "comb_annots_quant_pathogen" + label 'process_high' + + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/HTSeq" + + + input: + path quantification_table + path annotation_table + val attribute + val organism + + output: + path "pathogen_combined_quant_annotations.tsv", emit: tsv + path "versions.yml" , emit: versions + + script: + """ + $workflow.projectDir/bin/combine_quant_annotations.py \\ + -q $quantification_table \\ + -annotations $annotation_table \\ + -a $attribute -org $organism + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + + """ +} \ No newline at end of file diff --git a/modules/local/combine_quantification_tables_htseq_uniquely_mapped/main.nf b/modules/local/combine_quantification_tables_htseq_uniquely_mapped/main.nf new file mode 100644 index 00000000..8accf519 --- /dev/null +++ b/modules/local/combine_quantification_tables_htseq_uniquely_mapped/main.nf @@ -0,0 +1,28 @@ +process COMBINE_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED { + tag "comb_quants_htseq_uniq_mapped" + label 'process_high' + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/HTSeq" + + + input: + path input_quantification + val(host_attribute) + + output: + path "quantification_results_*.tsv", emit: tsv + path "versions.yml" , emit: versions + + script: + """ + python $workflow.projectDir/bin/collect_quantification_data.py \\ + -i $input_quantification \\ + -q htseq \\ + -a $host_attribute + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/count_crossmapped_reads/main.nf b/modules/local/count_crossmapped_reads/main.nf new file mode 100644 index 00000000..7f85910e --- /dev/null +++ b/modules/local/count_crossmapped_reads/main.nf @@ -0,0 +1,23 @@ +process COUNT_CROSSMAPPED_READS { + tag "count_crossmapped_reads" + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + + label 'process_high' + + input: + path(cross_mapped_reads) + + output: + path "cross_mapped_reads_sum.txt", emit: txt + path "versions.yml" , emit: versions + + script: + """ + $workflow.projectDir/bin/count_cross_mapped_reads.sh $cross_mapped_reads + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/count_total_read_pairs/main.nf b/modules/local/count_total_read_pairs/main.nf new file mode 100644 index 00000000..921d2130 --- /dev/null +++ b/modules/local/count_total_read_pairs/main.nf @@ -0,0 +1,22 @@ +process COUNT_TOTAL_READS_PAIRS { + tag "count_total_reads" + publishDir "${params.outdir}/mapping_statistics", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics" + + label 'process_high' + + input: + path(tsv) + output: + path "total_raw_read_pairs_fastq.tsv", emit:tsv + + script: + """ + $workflow.projectDir/bin/collect_total_raw_read_pairs.py -i $tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/count_total_reads/main.nf b/modules/local/count_total_reads/main.nf new file mode 100644 index 00000000..3a1e2e20 --- /dev/null +++ b/modules/local/count_total_reads/main.nf @@ -0,0 +1,21 @@ +process COUNT_TOTAL_READS { + tag "count_total_reads" + publishDir "${params.outdir}/mapping_statistics", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics" + + label 'process_high' + + input: + path(fastq) + output: + path "total_raw_reads_fastq.tsv", emit:tsv + + script: + """ + $workflow.projectDir/bin/count_total_reads.sh $fastq >> total_raw_reads_fastq.tsv + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/extract_processed_reads.nf b/modules/local/extract_processed_reads.nf index 799439fa..e8f38e01 100644 --- a/modules/local/extract_processed_reads.nf +++ b/modules/local/extract_processed_reads.nf @@ -13,6 +13,7 @@ process EXTRACT_PROCESSED_READS { output: path("${meta.id}.txt"), emit: collect_results + path "versions.yml" , emit: versions script: def prefix = task.ext.prefix ?: "${meta.id}" @@ -27,6 +28,10 @@ process EXTRACT_PROCESSED_READS { processed=\$(grep "Number of input reads" ${json_file} | sed 's/Number of input reads//g'| sed 's/[^a-zA-Z0-9]//g') echo -e "${prefix}\t\${processed}" > ${prefix}.txt fi + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS """ } diff --git a/modules/local/extract_reference_name_star/main.nf b/modules/local/extract_reference_name_star/main.nf new file mode 100644 index 00000000..a59e27f4 --- /dev/null +++ b/modules/local/extract_reference_name_star/main.nf @@ -0,0 +1,18 @@ +process EXTRA_REFERENCE_NAME_STAR { + tag "extract_ref_names_host_star" + label 'process_high' + + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + + input: + path(fasta) + val(organism) + + output: + path "reference_names_*.txt", emit: txt + + script: + """ + $workflow.projectDir/bin/extract_reference_names_from_fasta_files.sh reference_names_${organism}.txt $fasta + """ +} \ No newline at end of file diff --git a/modules/local/htseq.nf b/modules/local/htseq.nf index c4c3a2be..5cdf6587 100644 --- a/modules/local/htseq.nf +++ b/modules/local/htseq.nf @@ -1,5 +1,5 @@ -process HTSEQ { - tag "$meta.id" +process HTSEQ_COUNT { + tag "$sample_name.id" label 'process_high' conda "bioconda::htseq=2.0.2-0" @@ -8,14 +8,12 @@ process HTSEQ { 'quay.io/biocontainers/htseq:2.0.2--py38h7a2e8c7_0' }" input: - tuple val(meta), path(gff) - val(sample_name), path(st) - val(host_attribute) - val(stranded) + tuple val(sample_name), path(st) + tuple path(gff) val(quantifier) output: - tuple val(meta), path("*_count.txt"), emit: counts + tuple val(sample_name), path("*_count.txt"), emit: txt path("versions.yml"), emit: versions when: @@ -23,22 +21,22 @@ process HTSEQ { script: def args = task.ext.args ?: '' - def output_file = sample_name + "_count.txt" + def output_file = "${sample_name.id}_count.txt" """ htseq-count \\ -n ${task.cpus} \\ -t $quantifier \\ -f bam \\ -r pos $st $gff \\ - -i $host_attribute \\ - -s $stranded \\ + -i $params.host_gff_attribute \\ + -s $params.stranded \\ --max-reads-in-buffer=${params.max_reads_in_buffer} \\ -a ${params.minaqual} \\ ${params.htseq_params} \\ $args \\ > $output_file - sed -i '1{h;s/.*/'"$sample_name"'/;G}' "$output_file" + sed -i '1{h;s/.*/'"${sample_name.id}"'/;G}' "$output_file" cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/htseq_quantification_stats_uniquely_mapped/main.nf b/modules/local/htseq_quantification_stats_uniquely_mapped/main.nf new file mode 100644 index 00000000..770061d1 --- /dev/null +++ b/modules/local/htseq_quantification_stats_uniquely_mapped/main.nf @@ -0,0 +1,35 @@ +process HTSEQ_QUANTIFICATION_STATS_UNIQUELY_MAPPED { + storeDir "${params.outdir}/mapping_statistics/HTSeq" + publishDir "${params.outdir}/mapping_statistics/HTSeq", mode: params.publish_dir_mode + tag "quantification_stats_htseq" + + label 'process_high' + + input: + path quant_table_host + path quant_table_pathogen + val attribute + path star_stats + + + output: + path ('htseq_uniquely_mapped_reads_stats.tsv'), emit: tsv + path "versions.yml" , emit: versions + + script: + """ + python $workflow.projectDir/bin/mapping_stats.py \\ + -q_p $quant_table_pathogen \\ + -q_h $quant_table_host \\ + -a $attribute \\ + -star $star_stats \\ + -t htseq \\ + -o htseq_uniquely_mapped_reads_stats.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version + END_VERSIONS + + """ +} \ No newline at end of file diff --git a/modules/local/htseq_uniquely_mapped_tpm/main.nf b/modules/local/htseq_uniquely_mapped_tpm/main.nf new file mode 100644 index 00000000..328e6904 --- /dev/null +++ b/modules/local/htseq_uniquely_mapped_tpm/main.nf @@ -0,0 +1,31 @@ +process HTSEQ_UNIQUELY_MAPPED_TPM { + tag "htseq_uniquely_mapped_TPM" + label 'process_high' + + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/HTSeq" + + input: + path input_quantification + val(host_attribute) + path gff_host + path gff_pathogen + + output: + path "quantification_results_uniquely_mapped_NumReads_TPM.tsv", emit: tsv + path "versions.yml" , emit: versions + + script: + """ + $workflow.projectDir/bin/calculate_TPM_HTSeq.R \\ + $input_quantification \\ + $host_attribute \\ + $gff_pathogen \\ + $gff_host + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/multi_mapping_stats/main.nf b/modules/local/multi_mapping_stats/main.nf new file mode 100644 index 00000000..47676e8f --- /dev/null +++ b/modules/local/multi_mapping_stats/main.nf @@ -0,0 +1,36 @@ +process MULTI_MAPPING_STATS { + tag "$sample_name" + publishDir "${params.outdir}/mapping_statistics/STAR/multi_mapped", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR/multi_mapped" + + label 'process_high' + + input: + tuple val(sample_name), path(alignment) + path(host_reference_names) + path(pathogen_reference_names) + + output: + path("*_multi_mapped.txt"), emit: txt + path "versions.yml" , emit: versions + + script: + def name = "${sample_name.id}_multi_mapped.txt" + if (params.single_end){ + """ + $workflow.projectDir/bin/count_multi_mapped_reads.sh ${alignment} ${host_reference_names} ${pathogen_reference_names} ${sample_name.id} ${name} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ + } else { + """ + $workflow.projectDir/bin/count_multi_mapped_read_pairs.sh ${alignment} ${host_reference_names} ${pathogen_reference_names} ${sample_name.id} ${name} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ + } +} \ No newline at end of file diff --git a/modules/local/plot_mapping_stats_host_pathogen_htseq_uniquely_mapped/main.nf b/modules/local/plot_mapping_stats_host_pathogen_htseq_uniquely_mapped/main.nf new file mode 100644 index 00000000..f13581ce --- /dev/null +++ b/modules/local/plot_mapping_stats_host_pathogen_htseq_uniquely_mapped/main.nf @@ -0,0 +1,25 @@ +process PLOT_MAPPING_STATS_HOST_PATHOGEN_HTSEQ_UNIQUELY_MAPPED{ + tag "$name2" + publishDir "${params.outdir}/mapping_statistics/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq" + + label 'process_high' + + input: + path(stats) + + output: + path "*.tsv" + path "*.pdf" + path "versions.yml" , emit: versions + + script: + """ + python $workflow.projectDir/bin/plot_mapping_stats_htseq.py -i $stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/plot_rna_class_uniquely_mapped_combined/main.nf b/modules/local/plot_rna_class_uniquely_mapped_combined/main.nf new file mode 100644 index 00000000..d3c434cc --- /dev/null +++ b/modules/local/plot_rna_class_uniquely_mapped_combined/main.nf @@ -0,0 +1,27 @@ +process PLOT_RNA_CLASS_UNIQUELY_MAPPED_COMBINED { + publishDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_host", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_host" + tag "plt_rna_stats_htseq_host_all" + + label 'process_high' + + input: + path stats_table + val (organism) + + output: + path "RNA_class_stats_combined_host.pdf" + path "versions.yml" , emit: versions + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_combined.py \\ + -i $stats_table \\ + -org $organism + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version + END_VERSIONS + """ +} diff --git a/modules/local/plot_rna_class_uniquely_mapped_each/main.nf b/modules/local/plot_rna_class_uniquely_mapped_each/main.nf new file mode 100644 index 00000000..b437ec63 --- /dev/null +++ b/modules/local/plot_rna_class_uniquely_mapped_each/main.nf @@ -0,0 +1,28 @@ +process PLOT_RNA_CLASS_UNIQUELY_MAPPED_EACH{ + publishDir "${params.outdir}/mapping_statistics/${tool}/RNA_classes_pathogen", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/${tool}/RNA_classes_pathogen" + tag "plot_rna_stats_htseq_pathogen" + + label 'process_high' + + input: + file stats_table + val(tool) + + output: + path "*.pdf" + path "versions.yml" , emit: versions + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_each.py \\ + -i $stats_table + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version + END_VERSIONS + """ +} + + \ No newline at end of file diff --git a/modules/local/plot_star_mapping_stats/main.nf b/modules/local/plot_star_mapping_stats/main.nf new file mode 100644 index 00000000..c91ff0d2 --- /dev/null +++ b/modules/local/plot_star_mapping_stats/main.nf @@ -0,0 +1,24 @@ +process PLOT_STAR_MAPPING_STATS { + tag "plot_star_mapping_stats" + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + + label 'process_high' + + input: + path(stats) + + output: + path "*.tsv" + path "*.pdf" + path "versions.yml" , emit: versions + + script: + """ + python $workflow.projectDir/bin/plot_mapping_stats_star.py -i $stats + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/remove_crossmapped_reads/main.nf b/modules/local/remove_crossmapped_reads/main.nf new file mode 100644 index 00000000..150e8e7a --- /dev/null +++ b/modules/local/remove_crossmapped_reads/main.nf @@ -0,0 +1,42 @@ +process REMOVE_CROSSMAPPED_READS { + tag "$sample_name.id" + label 'process_high' + publishDir "${params.outdir}/STAR/multimapped_reads", mode: params.publish_dir_mode + storeDir "${params.outdir}/STAR/multimapped_reads" + + + input: + tuple val(sample_name), path(alignment) + path(host_reference) + path(pathogen_reference) + + + output: + tuple val(sample_name), path("*_no_crossmapped.bam"), emit: alignment_multi_mapping_stats + path("*_cross_mapped_reads.txt"), emit: count_crossmapped_reads + path "versions.yml" , emit: versions + + script: + def bam_file_without_crossmapped = "${sample_name.id}_no_crossmapped.bam" + def cross_mapped_reads = "${sample_name.id}_cross_mapped_reads.txt" + + if (params.single_end){ + """ + $workflow.projectDir/bin/remove_crossmapped_reads_BAM.sh $alignment $workflow.projectDir/bin $host_reference $pathogen_reference $cross_mapped_reads $bam_file_without_crossmapped + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ + } else { + """ + $workflow.projectDir/bin/remove_crossmapped_read_pairs_BAM.sh $alignment $workflow.projectDir/bin $host_reference $pathogen_reference $cross_mapped_reads $bam_file_without_crossmapped + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ + } +} \ No newline at end of file diff --git a/modules/local/replace_attribute_pathogen_gff_htseq/main.nf b/modules/local/replace_attribute_pathogen_gff_htseq/main.nf new file mode 100644 index 00000000..6a1e2dc7 --- /dev/null +++ b/modules/local/replace_attribute_pathogen_gff_htseq/main.nf @@ -0,0 +1,26 @@ +process REPLACE_ATTRIBUTE_PATHOGEN_GFF_HTSEQ { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "repl_attribute_pathogen_gff" + + label 'process_high' + + input: + path(gff) + val(host_attribute) + val(pathogen_attribute) + + output: + path "${outfile_name}", emit: gff3 + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_new_attribute.gff3") + """ + $workflow.projectDir/bin/replace_attribute_gff.sh $gff ${outfile_name} $host_attribute $pathogen_attribute + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/replace_gene_feature.nf b/modules/local/replace_gene_feature.nf index 6eef2c5e..9de4cede 100644 --- a/modules/local/replace_gene_feature.nf +++ b/modules/local/replace_gene_feature.nf @@ -1,16 +1,16 @@ - process REPLACE_GENE_FEATURE_GFF_SALMON { - tag "repl_gene_feature_gff" - label 'process_high' +process REPLACE_GENE_FEATURE_GFF { + tag "repl_gene_feature_gff" + label 'process_high' - input: - path(gff) - val(features) - output: - path "${outfile_name}" + input: + path(gff) + val(features) + output: + path "${outfile_name}" - script: - outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_quant_feature.gff3") - """ - $workflow.projectDir/bin/replace_feature_gff.sh $gff ${outfile_name} $features - """ - } \ No newline at end of file + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_quant_feature.gff3") + """ + $workflow.projectDir/bin/replace_feature_gff.sh $gff ${outfile_name} $features + """ +} \ No newline at end of file diff --git a/modules/local/replace_gene_feature_gff_htseq/main.nf b/modules/local/replace_gene_feature_gff_htseq/main.nf new file mode 100644 index 00000000..2a286e9d --- /dev/null +++ b/modules/local/replace_gene_feature_gff_htseq/main.nf @@ -0,0 +1,25 @@ +process REPLACE_GENE_FEATURE_GFF_HOST_HTSEQ { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "repl_gene_feature_gff_host" + + label 'process_high' + + input: + path(gff) + val(features) + + output: + path "${outfile_name}" into combine_gff_host + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_quant_feature.gff3") + """ + $workflow.projectDir/bin/replace_feature_gff.sh $gff ${outfile_name} $features + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} diff --git a/modules/local/rna_class_statistics_uniquely_mapped/main.nf b/modules/local/rna_class_statistics_uniquely_mapped/main.nf new file mode 100644 index 00000000..df1589d1 --- /dev/null +++ b/modules/local/rna_class_statistics_uniquely_mapped/main.nf @@ -0,0 +1,35 @@ +process RNA_CLASS_STATISTICS_UNIQUELY_MAPPED { + publishDir "${params.outdir}/mapping_statistics/${tool}/RNA_classes_host", mode: params.publish_dir_mode + tag "rna_class_stats_htseq_host" + + label 'process_high' + + input: + path(quant_table) + val(attribute) + path(gene_annotations) + path(rna_classes_to_replace) + val(organism) + val(tool) + + output: + path "host_RNA_classes_percentage_*.tsv", emit: tsv + path "host_gene_types_groups_*", emit: random + path "versions.yml" , emit: versions + + shell: + ''' + python !{workflow.projectDir}/bin/RNA_class_content.py \\ + -q !{quant_table} \\ + -a !{attribute} \\ + -annotations !{gene_annotations} \\ + -rna !{rna_classes_to_replace} \\ + -q_tool !{tool} \\ + -org !{organism} 2>&1 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version + END_VERSIONS + ''' +} diff --git a/modules/local/scatter_plot/main.nf b/modules/local/scatter_plot/main.nf new file mode 100644 index 00000000..aa6b9a97 --- /dev/null +++ b/modules/local/scatter_plot/main.nf @@ -0,0 +1,35 @@ +process SCATTER_PLOT { + publishDir "${params.outdir}/mapping_statistics/HTSeq/scatter_plots", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq/scatter_plots" + tag "scatter_plot_pathogen_htseq" + + label 'process_high' + + input: + path quant_table + val attribute + val replicates + val pathogen_table_non_empty + val host + + output: + path ('*.pdf'), optional: true + path "versions.yml" , emit: versions + + when: + replicates.toBoolean() + pathogen_table_non_empty.toBoolean() + + script: + """ + python $workflow.projectDir/bin/scatter_plots.py \\ + -q $quant_table \\ + -a $attribute \\ + -org $host + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/split_quantification_tables_htseq_uniquely_mapped/main.nf b/modules/local/split_quantification_tables_htseq_uniquely_mapped/main.nf new file mode 100644 index 00000000..8b02f943 --- /dev/null +++ b/modules/local/split_quantification_tables_htseq_uniquely_mapped/main.nf @@ -0,0 +1,29 @@ +process SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED { + tag "split_quants_uniq_mapped_host" + label 'process_high' + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + + input: + path quant_table + path host_annotations + path pathogen_annotations + + output: + path 'host_quantification_uniquely_mapped_htseq.tsv', emit: host + path 'pathogen_quantification_uniquely_mapped_htseq.tsv', emit: pathogen + env host_tab, emit: scatterplots_host_htseq + env pathonen_tab, emit: scatterplots_pathogen_htseq + path "versions.yml" , emit: versions + + script: + """ + $workflow.projectDir/bin/split_quant_tables.sh $quant_table $host_annotations $pathogen_annotations quantification_uniquely_mapped_htseq.tsv + pathonen_tab=\$(if [ \$(cat pathogen_quantification_uniquely_mapped_htseq.tsv | wc -l) -gt 1 ]; then echo "true"; else echo "false"; fi) + host_tab=\$(if [ \$(cat host_quantification_uniquely_mapped_htseq.tsv | wc -l) -gt 1 ]; then echo "true"; else echo "false"; fi) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} \ No newline at end of file diff --git a/modules/local/star_mapping_stats/main.nf b/modules/local/star_mapping_stats/main.nf new file mode 100644 index 00000000..a415fc7c --- /dev/null +++ b/modules/local/star_mapping_stats/main.nf @@ -0,0 +1,34 @@ +process STAR_MAPPING_STATS { + storeDir "${params.outdir}/mapping_statistics/STAR" + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + tag "star_mapping_stats" + + label 'process_high' + + input: + path total_raw_reads + path total_processed_reads + path uniquely_mapped_reads + path multi_mapped_reads + path cross_mapped_reads + + output: + path ('star_mapping_stats.tsv'), emit: tsv + path "versions.yml" , emit: versions + + script: + """ + python $workflow.projectDir/bin/mapping_stats.py \\ + -total_raw $total_raw_reads \\ + -total_processed $total_processed_reads \\ + -m_u $uniquely_mapped_reads \\ + -m_m $multi_mapped_reads \\ + -c_m $cross_mapped_reads \\ + -t star -o star_mapping_stats.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ +} diff --git a/modules/local/unique_mapping_stats_star/main.nf b/modules/local/unique_mapping_stats_star/main.nf new file mode 100644 index 00000000..20b5a687 --- /dev/null +++ b/modules/local/unique_mapping_stats_star/main.nf @@ -0,0 +1,36 @@ +process UNIQUE_MAPPING_STATS_STAR { + + label 'process_high' + tag "$sample_name.id" + publishDir "${params.outdir}/mapping_statistics/STAR/uniquely_mapped", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR/uniquely_mapped" + + input: + tuple val(sample_name), path(alignment) + path(host_reference_names) + path(pathogen_reference_names) + + output: + path("*_uniquely_mapped.txt"), emit: txt + path "versions.yml" , emit: versions + + script: + def name = "${sample_name.id}_uniquely_mapped.txt" + if (params.single_end){ + """ + $workflow.projectDir/bin/count_uniquely_mapped_reads.sh ${alignment} ${host_reference_names} ${pathogen_reference_names} ${sample_name.id} ${name} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ + } else { + """ + $workflow.projectDir/bin/count_uniquely_mapped_read_pairs.sh ${alignment} ${host_reference_names} ${pathogen_reference_names} ${sample_name.id} ${name} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + version: 1.0.0 + END_VERSIONS + """ + } +} \ No newline at end of file diff --git a/modules/nf-core/bbmap/bbduk/environment.yml b/modules/nf-core/bbmap/bbduk/environment.yml new file mode 100644 index 00000000..1221474c --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/environment.yml @@ -0,0 +1,7 @@ +name: bbmap_bbduk +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bbmap=39.01 diff --git a/modules/nf-core/bbmap/bbduk/main.nf b/modules/nf-core/bbmap/bbduk/main.nf new file mode 100644 index 00000000..6453afc6 --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/main.nf @@ -0,0 +1,43 @@ +process BBMAP_BBDUK { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bbmap:39.01--h5c4e2a8_0': + 'biocontainers/bbmap:39.01--h5c4e2a8_0' }" + + input: + tuple val(meta), path(reads) + path contaminants + + output: + tuple val(meta), path('*.fastq.gz'), emit: reads + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def raw = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}" + def trimmed = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz" + def contaminants_fa = contaminants ? "ref=$contaminants" : '' + """ + maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g') + bbduk.sh \\ + -Xmx\$maxmem \\ + $raw \\ + $trimmed \\ + threads=$task.cpus \\ + $args \\ + $contaminants_fa \\ + &> ${prefix}.bbduk.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/bbduk/meta.yml b/modules/nf-core/bbmap/bbduk/meta.yml new file mode 100644 index 00000000..9a1f0562 --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/meta.yml @@ -0,0 +1,50 @@ +name: bbmap_bbduk +description: Adapter and quality trimming of sequencing reads +keywords: + - trimming + - adapter trimming + - quality trimming + - fastq +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic tools. + homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + licence: ["UC-LBL license (see package)"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - contaminants: + type: file + description: | + Reference files containing adapter and/or contaminant sequences for sequence kmer matching +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified fastq reads + pattern: "*fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: Bbduk log file + pattern: "*bbduk.log" +authors: + - "@MGordon09" +maintainers: + - "@MGordon09" diff --git a/modules/nf-core/htseq/count/environment.yml b/modules/nf-core/htseq/count/environment.yml new file mode 100644 index 00000000..4c11f606 --- /dev/null +++ b/modules/nf-core/htseq/count/environment.yml @@ -0,0 +1,7 @@ +name: htseq_count +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::htseq=2.0.3 diff --git a/modules/nf-core/htseq/count/main.nf b/modules/nf-core/htseq/count/main.nf new file mode 100644 index 00000000..7ab08b8f --- /dev/null +++ b/modules/nf-core/htseq/count/main.nf @@ -0,0 +1,48 @@ +process HTSEQ_COUNT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/htseq:2.0.3--py310ha14a713_0': + 'biocontainers/htseq:2.0.3--py310ha14a713_0' }" + + input: + tuple val(meta), path(input), path(index) + tuple val(meta2), path(gtf) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + htseq-count \\ + ${input} \\ + ${gtf} \\ + ${args} \\ + > ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + htseq: \$(echo \$(htseq-count --version ) | sed 's/^.*htseq-count //; s/Using.*\$//' )) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + : \$(echo \$(htseq-count --version ) | sed 's/^.*htseq-count //; s/Using.*\$//' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/htseq/count/meta.yml b/modules/nf-core/htseq/count/meta.yml new file mode 100644 index 00000000..1a036ffa --- /dev/null +++ b/modules/nf-core/htseq/count/meta.yml @@ -0,0 +1,52 @@ +--- +name: "htseq_count" +description: count how many reads map to each feature +keywords: + - htseq + - count + - gtf + - annotation +tools: + - "htseq/count": + description: "HTSeq is a Python library to facilitate processing and analysis of data from high-throughput sequencing (HTS) experiments." + homepage: "https://htseq.readthedocs.io/en/latest/" + documentation: "https://htseq.readthedocs.io/en/latest/index.html" + doi: "10.1093/bioinformatics/btu638" + licence: "['GPL v3']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - meta2: + type: map + description: | + .gtf file information + e.g. `[ id:'test' ]` + - input: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: file + description: Contains indexed bam file + pattern: "*.bai" + - gtf: + type: file + description: Contains the features in the GTF format + pattern: "*.gtf" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - txt: + type: file + description: File containing feature counts output + pattern: ".txt" +authors: + - "@zehrahazalsezer" +maintainers: + - "@zehrahazalsezer" diff --git a/nextflow.config b/nextflow.config index e954d833..872b498b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -81,10 +81,25 @@ params { alignMatesGapMax = 1000000 winAnchorMultimapNmax = 999 - // Run workflows - run_salmon_selective_alignment = true - run_salmon_alignment_based_mode = true + //-------- + // HTSeq: + //-------- + run_htseq_uniquely_mapped = false + stranded = "yes" + max_reads_in_buffer = 30000000 + minaqual = 10 + gene_feature_gff_to_quantify_host = ["exon","tRNA"] + gene_feature_gff_to_quantify_pathogen = ["gene", "sRNA", "tRNA", "rRNA"] + host_gff_attribute = "gene_id" + pathogen_gff_attribute = "locus_tag" + htseq_params = "" + // Run workflows + run_salmon_selective_alignment = false + run_salmon_alignment_based_mode = false + run_star = true + run_htseq_uniquely_mapped = true + mapping_statistics = true // Config options custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" @@ -204,6 +219,8 @@ if (!params.igenomes_ignore) { // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. +process.container = 'nfcore/dualrnaseq:1.0.0' + env { PYTHONNOUSERSITE = 1 R_PROFILE_USER = "/.Rprofile" diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 00000000..93c46ef4 --- /dev/null +++ b/nf-test.config @@ -0,0 +1,5 @@ +config { + testsDir "tests" + workDir ".nf-test" + libDir "tests/lib" +} \ No newline at end of file diff --git a/subworkflows/local/prepare_reference_files.nf b/subworkflows/local/prepare_reference_files.nf index cd0e6045..fd1f930c 100644 --- a/subworkflows/local/prepare_reference_files.nf +++ b/subworkflows/local/prepare_reference_files.nf @@ -15,9 +15,11 @@ include { } from '../../modules/local/replace_attribute' include { - REPLACE_GENE_FEATURE_GFF_SALMON as REPLACE_GENE_FEATURE_GFF_PATHOGEN_SALMON; - REPLACE_GENE_FEATURE_GFF_SALMON as REPLACE_GENE_FEATURE_GFF_HOST_SALMON - } from '../../modules/local/replace_gene_feature' + REPLACE_GENE_FEATURE_GFF as REPLACE_GENE_FEATURE_GFF_PATHOGEN_SALMON; + REPLACE_GENE_FEATURE_GFF as REPLACE_GENE_FEATURE_GFF_HOST_SALMON; + REPLACE_GENE_FEATURE_GFF as REPLACE_GENE_FEATURE_GFF_HOST_HTSEQ; + REPLACE_GENE_FEATURE_GFF as REPLACE_GENE_FEATURE_GFF_PATHOGEN_HTSEQ; +} from '../../modules/local/replace_gene_feature' include { COMBINE_FILES as COMBINE_HOST_GENOME_TRNA_GFF_STAR_SALMON; @@ -28,6 +30,7 @@ include { COMBINE_FILES as COMBINE_HOST_GFF_FILES } from '../../modules/local/combine_files' +include { REPLACE_ATTRIBUTE_PATHOGEN_GFF_HTSEQ } from '../../modules/local/replace_attribute_pathogen_gff_htseq/main' include { PREPARE_HOST_TRANSCRIPTOME } from './prepare_host_transcriptome' include { PREPARE_PATHOGEN_TRANSCRIPTOME } from './prepare_pathogen_transcriptome' @@ -39,6 +42,10 @@ include { EXTRACT_ANNOTATIONS as EXTRACT_ANNOTATIONS_PATHOGEN_SALMON } from '../../modules/local/extract_annotations' +include { + EXTRA_REFERENCE_NAME_STAR as EXTRACT_REFERENCE_NAME_STAR_HOST; + EXTRA_REFERENCE_NAME_STAR as EXTRACT_REFERENCE_NAME_STAR_PATHOGEN; +} from '../../modules/local/extract_reference_name_star' workflow PREPARE_REFERENCE_FILES{ take: @@ -52,7 +59,8 @@ workflow PREPARE_REFERENCE_FILES{ ch_transcriptome = Channel.empty() ch_host_transcriptome = Channel.empty() ch_pathogen_transcriptome = Channel.empty() - + ch_transcript_host_unzipped = Channel.empty() + ch_transcript_pathogen_unzipped = Channel.empty() ch_gene_feature_pat = Channel .value(params.gene_feature_gff_to_create_transcriptome_pathogen) .collect() @@ -122,15 +130,15 @@ workflow PREPARE_REFERENCE_FILES{ } else { - PREPARE_HOST_TRANSCRIPTOME( - ch_fasta_host_unzipped, - ch_gff_host_unzipped, - ch_gff_host_tRNA_unzipped - ) + PREPARE_HOST_TRANSCRIPTOME( + ch_fasta_host_unzipped, + ch_gff_host_unzipped, + ch_gff_host_tRNA_unzipped + ) - ch_transcript_host_unzipped = PREPARE_HOST_TRANSCRIPTOME.out.transcriptome + ch_transcript_host_unzipped = PREPARE_HOST_TRANSCRIPTOME.out.transcriptome - } + } if(params.transcript_fasta_pathogen){ ch_pathogen_transcriptome = params.transcript_fasta_pathogen ? Channel.value(file( params.transcript_fasta_pathogen, checkIfExists: true )) : Channel.empty() @@ -148,20 +156,21 @@ workflow PREPARE_REFERENCE_FILES{ ch_gff_pathogen_unzipped ) ch_transcript_pathogen_unzipped = PREPARE_PATHOGEN_TRANSCRIPTOME.out.transcriptome - } + } + } - // combine pathogen and host transcriptome - // transciptiome_transcriptome_to_combine = ch_host_transcriptome.mix( - // ch_pathogen_transcriptome - // ).collect() + // combine pathogen and host transcriptome + // transciptiome_transcriptome_to_combine = ch_host_transcriptome.mix( + // ch_pathogen_transcriptome + // ).collect() - COMBINE_FILES_TRANSCRIPTOME_FILES( - ch_transcript_host_unzipped,ch_transcript_pathogen_unzipped,'host_pathogen.transcript.fasta' - ) - ch_transcriptome = COMBINE_FILES_TRANSCRIPTOME_FILES.out + COMBINE_FILES_TRANSCRIPTOME_FILES( + ch_transcript_host_unzipped,ch_transcript_pathogen_unzipped,'host_pathogen.transcript.fasta' + ) + ch_transcriptome = COMBINE_FILES_TRANSCRIPTOME_FILES.out @@ -171,10 +180,31 @@ workflow PREPARE_REFERENCE_FILES{ 'parent') REPLACE_ATTRIBUTE_GFF_STAR_SALMON_HOST( - ch_gff_host_unzipped, - 'Parent', - 'parent' - ) + ch_gff_host_unzipped, + 'Parent', + 'parent' + ) + + if (params.run_htseq_uniquely_mapped) { + REPLACE_GENE_FEATURE_GFF_HOST_HTSEQ( + COMBINE_HOST_GFF_FILES.out, + Channel.value(params.gene_feature_gff_to_quantify_host) + ); + REPLACE_GENE_FEATURE_GFF_PATHOGEN_HTSEQ( + ch_gff_pathogen_unzipped, + Channel.value(params.gene_feature_gff_to_quantify_pathogen) + ); + REPLACE_ATTRIBUTE_PATHOGEN_GFF_HTSEQ( + REPLACE_GENE_FEATURE_GFF_PATHOGEN_HTSEQ.out, + params.host_gff_attribute, + params.pathogen_gff_attribute + ) + COMBINE_PATHOGEN_HOST_GFF_FILES_HTSEQ( + REPLACE_GENE_FEATURE_GFF_HOST_HTSEQ.out, + REPLACE_ATTRIBUTE_PATHOGEN_GFF_HTSEQ.out.gff3, + "fake_name.gff" + ) + } if(params.gff_host_tRNA){ REPLACE_ATTRIBUTE_GFF_STAR_SALMON_TRNA_FILE( @@ -191,9 +221,9 @@ workflow PREPARE_REFERENCE_FILES{ ch_gff_host_genome_salmon_alignment = params.gff_host_tRNA ? COMBINE_HOST_GENOME_TRNA_GFF_STAR_SALMON.out : REPLACE_ATTRIBUTE_GFF_STAR_SALMON_HOST.out REPLACE_GENE_FEATURE_GFF_HOST_SALMON( - ch_gff_host_genome_salmon_alignment, - params.gene_feature_gff_to_create_transcriptome_host - ) + ch_gff_host_genome_salmon_alignment, + params.gene_feature_gff_to_create_transcriptome_host + ) REPLACE_GENE_FEATURE_GFF_PATHOGEN_SALMON( @@ -202,10 +232,10 @@ workflow PREPARE_REFERENCE_FILES{ ) COMBINE_FILES_PATHOGEN_HOST_GFF( - REPLACE_GENE_FEATURE_GFF_PATHOGEN_SALMON.out, - REPLACE_GENE_FEATURE_GFF_HOST_SALMON.out, - "host_pathogen_star_alignment_mode.gff" - ) + REPLACE_GENE_FEATURE_GFF_PATHOGEN_SALMON.out, + REPLACE_GENE_FEATURE_GFF_HOST_SALMON.out, + "host_pathogen_star_alignment_mode.gff" + ) EXTRACT_ANNOTATIONS_PATHOGEN_SALMON ( @@ -225,18 +255,49 @@ workflow PREPARE_REFERENCE_FILES{ ) + ch_reference_pathogen_name = Channel.empty() + ch_reference_host_name = Channel.empty() + + if(params.mapping_statistics) { + EXTRACT_REFERENCE_NAME_STAR_HOST(ch_fasta_host_unzipped, "host") + EXTRACT_REFERENCE_NAME_STAR_PATHOGEN(ch_fasta_pathogen_unzipped, "pathogen") + ch_reference_pathogen_name = EXTRACT_REFERENCE_NAME_STAR_HOST.out.txt + ch_reference_host_name = EXTRACT_REFERENCE_NAME_STAR_PATHOGEN.out.txt + } + + ch_host_pathoge_gff = Channel.empty() //COMBINE_FILES_PATHOGEN_HOST_GFF.out + - } + EXTRACT_ANNOTATIONS_HOST_HTSEQ(COMBINE_HOST_GFF_FILES.out, + params.gene_feature_gff_to_quantify_host, + params.host_gff_attribute, + "host", + "htseq" + ) + EXTRACT_ANNOTATIONS_PATHOGEN_HTSEQ(ch_gff_pathogen_unzipped, + params.gene_feature_gff_to_quantify_pathogen, + params.pathogen_gff_attribute, + "pathogen", + "htseq") + ch_annotations_host_htseq = EXTRACT_ANNOTATIONS_HOST_HTSEQ.out.annotations + ch_annotations_pathogen_htseq = EXTRACT_ANNOTATIONS_PATHOGEN_HTSEQ.out.annotations emit: genome_fasta = COMBINE_FILES_FASTA.out + host_gff = COMBINE_HOST_GFF_FILES.out transcript_fasta = ch_transcriptome transcript_fasta_host = ch_transcript_host_unzipped transcript_fasta_pathogen = ch_transcript_pathogen_unzipped - host_pathoge_gff = COMBINE_FILES_PATHOGEN_HOST_GFF.out + host_pathoge_gff = ch_host_pathoge_gff annotations_host_salmon = EXTRACT_ANNOTATIONS_HOST_SALMON.out.annotations annotations_pathogen_salmon = EXTRACT_ANNOTATIONS_PATHOGEN_SALMON.out.annotations - } - + annotations_host_htseq = ch_annotations_host_htseq + annotations_pathogen_htseq = ch_annotations_pathogen_htseq + reference_pathogen_name = ch_reference_pathogen_name + reference_host_name = ch_reference_host_name + quantification_gff_u_m = COMBINE_PATHOGEN_HOST_GFF_FILES_HTSEQ.out + gff_host = REPLACE_GENE_FEATURE_GFF_HOST_HTSEQ.out + patoghen_host = REPLACE_ATTRIBUTE_PATHOGEN_GFF_HTSEQ.out.gff3 +} \ No newline at end of file diff --git a/subworkflows/local/rna_statistics/main.nf b/subworkflows/local/rna_statistics/main.nf new file mode 100644 index 00000000..4214bf0f --- /dev/null +++ b/subworkflows/local/rna_statistics/main.nf @@ -0,0 +1,99 @@ +include { + RNA_CLASS_STATISTICS_UNIQUELY_MAPPED as RNA_CLASS_STATISTICS_UNIQUELY_MAPPED_HOST; + RNA_CLASS_STATISTICS_UNIQUELY_MAPPED as RNA_CLASS_STATISTICS_UNIQUELY_MAPPED_PATOGEN; +} from '../../../modules/local/rna_class_statistics_uniquely_mapped/main.nf' + +include { + PLOT_RNA_CLASS_UNIQUELY_MAPPED_EACH as PLOT_RNA_CLASS_UNIQUELY_MAPPED_EACH_HOST; + PLOT_RNA_CLASS_UNIQUELY_MAPPED_EACH as PLOT_RNA_CLASS_UNIQUELY_MAPPED_EACH_PATOGEN; +} from '../../../modules/local/plot_rna_class_uniquely_mapped_each/main.nf' + + +include { + PLOT_RNA_CLASS_UNIQUELY_MAPPED_COMBINED as PLOT_RNA_CLASS_UNIQUELY_MAPPED_COMBINED_HOST; + PLOT_RNA_CLASS_UNIQUELY_MAPPED_COMBINED as PLOT_RNA_CLASS_UNIQUELY_MAPPED_COMBINED_PATHOGEN; +} from '../../../modules/local/plot_rna_class_uniquely_mapped_combined/main.nf' + + +include { + SCATTER_PLOT as SCATTER_PLOT_HOST; + SCATTER_PLOT as SCATTER_PLOT_PATHOGEN; +} from '../../../modules/local/scatter_plot/main.nf' + +/* + Common statistics used for Star HTSEQ and SALOMON SELECTIVE ALIGNMENT +*/ +workflow RNA_STATISTICS { + take: + host_quantification + pathogen_quantification + scatterplots_host + scatterplots_pathogen + annotations_host_htseq + annotations_pathogen_htseq + attribute // gff attribute + tool // salmon, htseq + + main: + ch_versions = Channel.empty() + SCATTER_PLOT_HOST( + host_quantification, + attribute, + Channel.value("replicates"), // output from CHECK_REPLICATES + scatterplots_host, + "host" + ) + ch_versions = ch_versions.mix(SCATTER_PLOT_HOST.out.versions.first()) + SCATTER_PLOT_PATHOGEN( + pathogen_quantification, + attribute, + Channel.value("replicates"), // output from CHECK_REPLICATES + scatterplots_pathogen, + "pathogen" + ) + ch_versions = ch_versions.mix(SCATTER_PLOT_PATHOGEN.out.versions.first()) + + // // statistics + RNA_CLASS_STATISTICS_UNIQUELY_MAPPED_HOST( + host_quantification, + attribute, + annotations_host_htseq, + Channel.empty(), + "host", + "htseq" + ) + + RNA_CLASS_STATISTICS_UNIQUELY_MAPPED_PATOGEN( + pathogen_quantification, + attribute, + annotations_pathogen_htseq, + Channel.empty(), + "pathogen", + "htseq" + ) + + host_input_for_plots = RNA_CLASS_STATISTICS_UNIQUELY_MAPPED_HOST.out.tsv.filter{it.countLines > 0} + pathogen_input_for_plots = RNA_CLASS_STATISTICS_UNIQUELY_MAPPED_HOST.out.tsv.filter{it.countLines > 0} + countLines + // each + PLOT_RNA_CLASS_UNIQUELY_MAPPED_EACH_HOST( + host_input_for_plots, + tool + ) + PLOT_RNA_CLASS_UNIQUELY_MAPPED_EACH_PATOGEN( + pathogen_input_for_plots, + tool + ) + + // combined + PLOT_RNA_CLASS_UNIQUELY_MAPPED_COMBINED_HOST( + host_input_for_plots, + tool + ) + PLOT_RNA_CLASS_UNIQUELY_MAPPED_COMBINED_PATHOGEN( + pathogen_input_for_plots, + tool + ) + emit: + versions = ch_versions // channel: [ versions.yml ] +} \ No newline at end of file diff --git a/subworkflows/local/salmon_selective_alignment.nf b/subworkflows/local/salmon_selective_alignment/main.nf similarity index 57% rename from subworkflows/local/salmon_selective_alignment.nf rename to subworkflows/local/salmon_selective_alignment/main.nf index 91a20563..8e268eab 100644 --- a/subworkflows/local/salmon_selective_alignment.nf +++ b/subworkflows/local/salmon_selective_alignment/main.nf @@ -1,11 +1,11 @@ -include { SALMON_INDEX } from '../../modules/nf-core/salmon/index/main' -include { SALMON_QUANT } from '../../modules/nf-core/salmon/quant/main' -include { COMBINE_QUANTIFICATION_RESULTS_SALMON } from '../../modules/local/combine_quantification_results_salmon' -include { SALMON_SPLIT_TABLE as SALMON_SPLIT_TABLE_EACH} from '../../modules/local/salmon_split_table' -include { SALMON_SPLIT_TABLE as SALMON_SPLIT_TABLE_COMBINED} from '../../modules/local/salmon_split_table' -include { EXTRACT_PROCESSED_READS } from '../../modules/local/extract_processed_reads' -include { TXIMPORT } from '../../modules/local/tximport/main' - +include { SALMON_INDEX } from '../../../modules/nf-core/salmon/index/main' +include { SALMON_QUANT } from '../../../modules/nf-core/salmon/quant/main' +include { COMBINE_QUANTIFICATION_RESULTS_SALMON } from '../../../modules/local/combine_quantification_results_salmon' +include { SALMON_SPLIT_TABLE as SALMON_SPLIT_TABLE_EACH} from '../../../modules/local/salmon_split_table' +include { SALMON_SPLIT_TABLE as SALMON_SPLIT_TABLE_COMBINED} from '../../../modules/local/salmon_split_table' +include { EXTRACT_PROCESSED_READS } from '../../../modules/local/extract_processed_reads' +include { TXIMPORT } from '../../../modules/local/tximport/main' +include { RNA_STATISTICS } from '../rna_statistics/main' workflow SALMON_SELECTIVE_ALIGNMENT { @@ -45,10 +45,28 @@ workflow SALMON_SELECTIVE_ALIGNMENT { SALMON_SPLIT_TABLE_COMBINED( combined_salmon_quant, ch_transcript_fasta_pathogen, ch_transcript_fasta_host) - EXTRACT_PROCESSED_READS( SALMON_QUANT.out.json_results, "salmon" ) + if(params.mapping_statistics) { + EXTRACT_PROCESSED_READS( SALMON_QUANT.out.json_results, "salmon" ) - TXIMPORT(SALMON_SPLIT_TABLE_EACH.out.host,ch_annotations_host_salmon ) + // COLLECT_PROCESSED_READS_STAR_FOR_SALMON ?? + // EXTRACT_PROCESSED_READS_SALMON_ALIGNMENT_BASED + + // COLLECT_PROCESSED_READS_SALMON_ALIGNMENT_BASED + + // SALMON_QUANTIFICATION_STATS_SALMON_ALIGNMENT_BASED + + // plot_salmon_mapping_stats_host_pathogen_salmon_alignment_based + + RNA_STATISTICS( + SALMON_SPLIT_TABLE_COMBINED.host, + SALMON_SPLIT_TABLE_COMBINED.pathogen, + params.gene_attribute_gff_to_create_transcriptome_host) + } + + TXIMPORT(SALMON_SPLIT_TABLE_EACH.out.host,ch_annotations_host_salmon ) + + emit: versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/star/main.nf b/subworkflows/local/star/main.nf new file mode 100644 index 00000000..9f04bc07 --- /dev/null +++ b/subworkflows/local/star/main.nf @@ -0,0 +1,20 @@ +include { STAR_GENOMEGENERATE } from '../../../modules/nf-core/star/genomegenerate/main' +include { STAR_ALIGN } from '../../../modules/nf-core/star/align/main' + +workflow STAR { + take: + ch_reads + ch_genome_fasta + ch_host_gff + main: + ch_versions = Channel.empty() + STAR_GENOMEGENERATE ( ch_genome_fasta, ch_host_gff ) + ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions.first()) + + STAR_ALIGN ( ch_reads, STAR_GENOMEGENERATE.out.index, ch_host_gff, true, '', '' ) + ch_versions = ch_versions.mix(STAR_ALIGN.out.versions.first()) + emit: + bam = STAR_ALIGN.out.bam + log_final = STAR_ALIGN.out.log_final + versions = ch_versions +} \ No newline at end of file diff --git a/subworkflows/local/star_htseq.nf b/subworkflows/local/star_htseq.nf deleted file mode 100644 index a89969f4..00000000 --- a/subworkflows/local/star_htseq.nf +++ /dev/null @@ -1,18 +0,0 @@ -include { HTSEQ } from '../../modules/nf-core/htseq/main' - -workflow STAR_HTSEQ { - - take: - ch_gtf // channel: /path/to/genome.gtf - main: - - ch_versions = Channel.empty() - ch_htseq = Channel.empty() - - HTSEQ ( ch_htseq, ch_gtf ) - ch_versions = ch_versions.mix(HTSEQ.out.versions.first()) - - emit: - versions = ch_versions // channel: [ versions.yml ] -} - diff --git a/subworkflows/local/star_htseq/main.nf b/subworkflows/local/star_htseq/main.nf new file mode 100644 index 00000000..f75ccee6 --- /dev/null +++ b/subworkflows/local/star_htseq/main.nf @@ -0,0 +1,102 @@ +include { HTSEQ_COUNT } from '../../../modules/local/htseq' +include { + COMBINE_ANNOTATIONS_QUANT_UNIQUELY_MAPPED_HOST as COMBINE_ANNOTATIONS_QUANT_PATHOGEN_UNIQUELY_MAPPED_HOST; + COMBINE_ANNOTATIONS_QUANT_UNIQUELY_MAPPED_HOST as COMBINE_ANNOTATIONS_QUANT_HOST_UNIQUELY_MAPPED_HOST; +} from '../../../modules/local/combine_annotations_quant_uniquely_mapped_host/main' + +include { COMBINE_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED } from '../../../modules/local/combine_quantification_tables_htseq_uniquely_mapped/main' + + +include { HTSEQ_QUANTIFICATION_STATS_UNIQUELY_MAPPED } from '../../../modules/local/htseq_quantification_stats_uniquely_mapped/main' +include { HTSEQ_UNIQUELY_MAPPED_TPM } from '../../../modules/local/htseq_uniquely_mapped_tpm/main' + +include { SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED } from '../../../modules/local/split_quantification_tables_htseq_uniquely_mapped/main' + +include { PLOT_MAPPING_STATS_HOST_PATHOGEN_HTSEQ_UNIQUELY_MAPPED } from '../../../modules/local/plot_mapping_stats_host_pathogen_htseq_uniquely_mapped/main' +include { RNA_STATISTICS } from '../rna_statistics/main' + + +workflow STAR_HTSEQ { + + take: + star_bam // STAR_ALIGN.out.bam_unsorted, + quantification // PREPARE_REFERENCE_FILES.quantification_gff_u_m, + annotations_host_htseq + annotations_pathogen_htseq + gff_host // PREPARE_REFERENCE_FILES.gff_host, + gff_pathogen // PREPARE_REFERENCE_FILES.gff_pathogen, + star_mapping_stats_tsv + + main: + ch_versions = Channel.empty() + ch_htseq = Channel.empty() + + HTSEQ_COUNT ( star_bam, quantification, "quant" ) + ch_versions = ch_versions.mix(HTSEQ_COUNT.out.versions.first()) + + COMBINE_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED( + HTSEQ_COUNT.out.txt.map {meta, path -> path }.collect(), params.host_gff_attribute + ) + ch_versions = ch_versions.mix(COMBINE_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.versions.first()) + + HTSEQ_UNIQUELY_MAPPED_TPM( + COMBINE_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.tsv, + params.host_gff_attribute, + gff_host, + gff_pathogen + ) + ch_versions = ch_versions.mix(HTSEQ_UNIQUELY_MAPPED_TPM.out.versions.first()) + + SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED( + HTSEQ_UNIQUELY_MAPPED_TPM.out.tsv, + annotations_host_htseq, + annotations_pathogen_htseq + ) + ch_versions = ch_versions.mix(SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.versions.first()) + + COMBINE_ANNOTATIONS_QUANT_PATHOGEN_UNIQUELY_MAPPED_HOST( + SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.pathogen, + annotations_pathogen_htseq, + params.host_gff_attribute, + "pathogen" + ) + + COMBINE_ANNOTATIONS_QUANT_HOST_UNIQUELY_MAPPED_HOST( + SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.host, + annotations_host_htseq, + params.host_gff_attribute, + "host" + ) + if(params.mapping_statistics) { + HTSEQ_QUANTIFICATION_STATS_UNIQUELY_MAPPED( + SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.host, + SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.pathogen, + Channel.value(params.host_gff_attribute), + star_mapping_stats_tsv + ) + ch_versions = ch_versions.mix(HTSEQ_QUANTIFICATION_STATS_UNIQUELY_MAPPED.out.versions.first()) + + PLOT_MAPPING_STATS_HOST_PATHOGEN_HTSEQ_UNIQUELY_MAPPED( + HTSEQ_QUANTIFICATION_STATS_UNIQUELY_MAPPED.out.tsv + ) + ch_versions = ch_versions.mix(PLOT_MAPPING_STATS_HOST_PATHOGEN_HTSEQ_UNIQUELY_MAPPED.out.versions.first()) + RNA_STATISTICS( + SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.host, + SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.pathogen, + SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.scatterplots_host_htseq, + SPLIT_QUANTIFICATION_TABLES_HTSEQ_UNIQUELY_MAPPED.out.scatterplots_pathogen_htseq, + annotations_host_htseq, + annotations_pathogen_htseq, + Channel.value(params.host_gff_attribute), + "htseq" + ) + ch_versions = ch_versions.mix(RNA_STATISTICS.out.versions.first()) + + } + emit: + versions = ch_versions // channel: [ versions.yml ] +} + +workflow { + STAR_HTSEQ() +} \ No newline at end of file diff --git a/subworkflows/local/star_statistic/main.nf b/subworkflows/local/star_statistic/main.nf new file mode 100644 index 00000000..676c1327 --- /dev/null +++ b/subworkflows/local/star_statistic/main.nf @@ -0,0 +1,75 @@ +include { REMOVE_CROSSMAPPED_READS } from '../../../modules/local/remove_crossmapped_reads/main' +include { EXTRACT_PROCESSED_READS } from '../../../modules/local/extract_processed_reads' +include { COLLECT_PROCESSED_READS_STAR } from '../../../modules/local/collect_processed_reads_star/main' +include { UNIQUE_MAPPING_STATS_STAR } from '../../../modules/local/unique_mapping_stats_star/main' +include { COLLECT_STATS_STAR_UNIQUELY_MAPPED } from '../../../modules/local/collect_stats_star_uniquely_mapped/main' +include { COUNT_CROSSMAPPED_READS } from '../../../modules/local/count_crossmapped_reads/main' + +include { MULTI_MAPPING_STATS } from '../../../modules/local/multi_mapping_stats/main' +include { STAR_MAPPING_STATS } from '../../../modules/local/star_mapping_stats/main' +include { COLLECT_STATS_STAR_MULTI_MAPPED } from '../../../modules/local/collect_stats_star_multi_mapped/main' +include { PLOT_STAR_MAPPING_STATS } from '../../../modules/local/plot_star_mapping_stats/main' +workflow STAR_STATISTIC { + take: + star_bam + star_log_final + reference_host_name + reference_pathogen_name + total_read_pairs + main: + ch_versions = Channel.empty() + + if(params.mapping_statistics) { + + REMOVE_CROSSMAPPED_READS( + star_bam, + reference_host_name, + reference_pathogen_name + ) + ch_versions = ch_versions.mix(REMOVE_CROSSMAPPED_READS.out.versions.first()) + + EXTRACT_PROCESSED_READS(star_log_final, "star") + ch_versions = ch_versions.mix(EXTRACT_PROCESSED_READS.out.versions.first()) + + COLLECT_PROCESSED_READS_STAR(EXTRACT_PROCESSED_READS.out.collect_results.collect()) + ch_versions = ch_versions.mix(COLLECT_PROCESSED_READS_STAR.out.versions.first()) + + UNIQUE_MAPPING_STATS_STAR( + star_bam, + reference_host_name, + reference_pathogen_name + ) + ch_versions = ch_versions.mix(UNIQUE_MAPPING_STATS_STAR.out.versions.first()) + + COLLECT_STATS_STAR_UNIQUELY_MAPPED(UNIQUE_MAPPING_STATS_STAR.out.txt) + ch_versions = ch_versions.mix(COLLECT_STATS_STAR_UNIQUELY_MAPPED.out.versions.first()) + + COUNT_CROSSMAPPED_READS(REMOVE_CROSSMAPPED_READS.out.count_crossmapped_reads) + ch_versions = ch_versions.mix(COUNT_CROSSMAPPED_READS.out.versions.first()) + + MULTI_MAPPING_STATS( + REMOVE_CROSSMAPPED_READS.out.alignment_multi_mapping_stats, + reference_host_name, + reference_pathogen_name + ) + ch_versions = ch_versions.mix(MULTI_MAPPING_STATS.out.versions.first()) + + COLLECT_STATS_STAR_MULTI_MAPPED(MULTI_MAPPING_STATS.out.txt) + ch_versions = ch_versions.mix(COLLECT_STATS_STAR_MULTI_MAPPED.out.versions.first()) + + STAR_MAPPING_STATS( + total_read_pairs, + COLLECT_PROCESSED_READS_STAR.out.tsv, + COLLECT_STATS_STAR_UNIQUELY_MAPPED.out.tsv, + COLLECT_STATS_STAR_MULTI_MAPPED.out.tsv, + COUNT_CROSSMAPPED_READS.out.txt + ) + ch_versions = ch_versions.mix(STAR_MAPPING_STATS.out.versions.first()) + + PLOT_STAR_MAPPING_STATS(STAR_MAPPING_STATS.out.tsv) + ch_versions = ch_versions.mix(PLOT_STAR_MAPPING_STATS.out.versions.first()) + } + emit: + versions = ch_versions // channel: [ versions.yml ] + star_mapping_stats = STAR_MAPPING_STATS.out.tsv +} diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 00000000..e69de29b diff --git a/tests/subworkflows/local/star/main.nf.test b/tests/subworkflows/local/star/main.nf.test new file mode 100644 index 00000000..8f7c43af --- /dev/null +++ b/tests/subworkflows/local/star/main.nf.test @@ -0,0 +1,35 @@ +nextflow_workflow { + + name "Test Workflow STAR" + script "subworkflows/local/star/main.nf" + workflow "STAR" + + test("Should run without failures") { + + when { + params { + // define parameters here. Example: + // outdir = "tests/results" + } + workflow { + """ + // define inputs of the workflow here. Example: + input[0] = Channel.from('hello','nf-test') + input[1] = Channel.from('hello','nf-test') + input[2] = Channel.from('hello','nf-test') + input[3] = Channel.from('hello','nf-test') + input[4] = Channel.from('hello','nf-test') + input[5] = Channel.from('hello','nf-test') + input[6] = Channel.from('hello','nf-test') + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + +} diff --git a/tests/subworkflows/local/star/main.nf.test.snap b/tests/subworkflows/local/star/main.nf.test.snap new file mode 100644 index 00000000..f31b5a3c --- /dev/null +++ b/tests/subworkflows/local/star/main.nf.test.snap @@ -0,0 +1,15 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + + ], + "versions": [ + + ] + } + ], + "timestamp": "2023-12-10T22:45:13.325422" + } +} \ No newline at end of file diff --git a/tests/subworkflows/local/star_htseq/main.nf.test b/tests/subworkflows/local/star_htseq/main.nf.test new file mode 100644 index 00000000..3407b2a4 --- /dev/null +++ b/tests/subworkflows/local/star_htseq/main.nf.test @@ -0,0 +1,34 @@ +nextflow_workflow { + + name "Test Workflow STAR_HTSEQ" + script "subworkflows/local/star_htseq/main.nf" + workflow "STAR_HTSEQ" + + test("Should run without failures") { + + when { + params { + // define parameters here. Example: + // outdir = "tests/results" + } + workflow { + """ + // define inputs of the workflow here. Example: + input[0] = Channel.from('hello','nf-test') + input[1] = Channel.from('hello','nf-test') + input[2] = Channel.from('hello','nf-test') + input[3] = Channel.from('hello','nf-test') + input[4] = Channel.from('hello','nf-test') + input[5] = Channel.from('hello','nf-test') + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + +} diff --git a/tests/subworkflows/local/star_htseq/main.nf.test.snap b/tests/subworkflows/local/star_htseq/main.nf.test.snap new file mode 100644 index 00000000..e728aab2 --- /dev/null +++ b/tests/subworkflows/local/star_htseq/main.nf.test.snap @@ -0,0 +1,15 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + + ], + "versions": [ + + ] + } + ], + "timestamp": "2023-12-10T20:51:59.09991" + } +} \ No newline at end of file diff --git a/workflows/dualrnaseq.nf b/workflows/dualrnaseq.nf index ab39bf33..921a873f 100644 --- a/workflows/dualrnaseq.nf +++ b/workflows/dualrnaseq.nf @@ -52,7 +52,9 @@ include { INPUT_CHECK } from '../subworkflows/local/input_check' include { PREPARE_REFERENCE_FILES } from '../subworkflows/local/prepare_reference_files' include { SALMON_SELECTIVE_ALIGNMENT } from '../subworkflows/local/salmon_selective_alignment' include { SALMON_ALIGNMENT_BASED } from '../subworkflows/local/salmon_alignment_based' - +include { STAR } from '../subworkflows/local/star' +include { STAR_STATISTIC } from '../subworkflows/local/star_statistic' +include { STAR_HTSEQ } from '../subworkflows/local/star_htseq' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS @@ -65,9 +67,13 @@ include { SALMON_ALIGNMENT_BASED } from '../subworkflows/local/salmon_alignment_ include { FASTQC } from '../modules/nf-core/fastqc/main' include { FASTQC as FASTQC_AFTER_TRIMMING } from '../modules/nf-core/fastqc/main' include { CUTADAPT } from '../modules/nf-core/cutadapt/main' +include { BBMAP_BBDUK } from '../modules/nf-core/bbmap/bbduk/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { COUNT_TOTAL_READS } from '../modules/local/count_total_reads/main' +include { COUNT_TOTAL_READS_PAIRS } from '../modules/local/count_total_read_pairs/main' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -101,12 +107,30 @@ workflow DUALRNASEQ { ch_versions = ch_versions.mix(CUTADAPT.out.versions.first()) } - if (!(params.skip_tools && (params.skip_tools.split(',').contains('fastqc') || params.skip_tools.split(',').contains('cutadapt')))) { + if (!(params.skip_tools && params.skip_tools.split(',').contains('bbduk'))) { + if (params.adapters) { + adapter_database = file(params.adapters, checkIfExists: true) + BBMAP_BBDUK(INPUT_CHECK.out.reads, Channel.value(adapter_database)) + ch_reads = BBMAP_BBDUK.out.reads + ch_versions = ch_versions.mix(BBMAP_BBDUK.out.versions.first()) + } + } + + if (!(params.skip_tools && (params.skip_tools.split(',').contains('fastqc') || params.skip_tools.split(',').contains('cutadapt') || params.skip_tools.split(',').contains('bbduk')))) { FASTQC_AFTER_TRIMMING(ch_reads) ch_versions = ch_versions.mix(FASTQC_AFTER_TRIMMING.out.versions.first()) } - - + if (!(params.skip_tools && (params.skip_tools.split(',').contains('fastqc') || params.skip_tools.split(',').contains('cutadapt') || params.skip_tools.split(',').contains('bbduk')))) { + if (params.mapping_statistics) { + ch_reads + .map { tag, file -> file } + .set {raw_read_count_file} + COUNT_TOTAL_READS(raw_read_count_file) + if (!params.single_end){ + COUNT_TOTAL_READS_PAIRS(COUNT_TOTAL_READS.out.tsv) + } + } + } PREPARE_REFERENCE_FILES( params.fasta_host, @@ -142,7 +166,36 @@ workflow DUALRNASEQ { ch_versions = ch_versions.mix(SALMON_ALIGNMENT_BASED.out.versions) } - + if (params.run_star || params.run_htseq_uniquely_mapped) { + ch_genome_fasta = PREPARE_REFERENCE_FILES.out.genome_fasta + ch_host_gff = PREPARE_REFERENCE_FILES.out.host_gff + + STAR(ch_reads, ch_genome_fasta, ch_host_gff) + ch_versions = ch_versions.mix(STAR.out.versions) + if ( params.run_star ) { + STAR_STATISTIC( + STAR.out.bam, + STAR.out.log_final, + PREPARE_REFERENCE_FILES.out.reference_host_name.collect(), + PREPARE_REFERENCE_FILES.out.reference_pathogen_name.collect(), + COUNT_TOTAL_READS_PAIRS.out.tsv + ) + ch_versions = ch_versions.mix(STAR_STATISTIC.out.versions) + if (params.run_htseq_uniquely_mapped) { + STAR_HTSEQ( + STAR.out.bam, + PREPARE_REFERENCE_FILES.out.quantification_gff_u_m, + PREPARE_REFERENCE_FILES.out.annotations_host_htseq, + PREPARE_REFERENCE_FILES.out.annotations_pathogen_htseq, + PREPARE_REFERENCE_FILES.out.gff_host, + PREPARE_REFERENCE_FILES.out.patoghen_host, + STAR_STATISTIC.out.star_mapping_stats + ) + ch_versions = ch_versions.mix(STAR_HTSEQ.out.versions) + } + } + + } CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') @@ -151,25 +204,25 @@ workflow DUALRNASEQ { // MODULE: MultiQC - workflow_summary = WorkflowDualrnaseq.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) - - methods_description = WorkflowDualrnaseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) - ch_methods_description = Channel.value(methods_description) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - multiqc_report = MULTIQC.out.report.toList() + // workflow_summary = WorkflowDualrnaseq.paramsSummaryMultiqc(workflow, summary_params) + // ch_workflow_summary = Channel.value(workflow_summary) + + // methods_description = WorkflowDualrnaseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + // ch_methods_description = Channel.value(methods_description) + + // ch_multiqc_files = Channel.empty() + // ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + // ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + // ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) + // ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + + // MULTIQC ( + // ch_multiqc_files.collect(), + // ch_multiqc_config.toList(), + // ch_multiqc_custom_config.toList(), + // ch_multiqc_logo.toList() + // ) + // multiqc_report = MULTIQC.out.report.toList() } diff --git a/workflows/old_raw.nf b/workflows/old_raw.nf new file mode 100644 index 00000000..67ade939 --- /dev/null +++ b/workflows/old_raw.nf @@ -0,0 +1,4687 @@ +#!/usr/bin/env nextflow +/* + +========================================== + +nf-core/dualrnaseq + +========================================== + +------------------------------------------ +Authors: B.Mika-Gospodorz and R.Hayward + +Homepage: https://github.com/nf-core/dualrnaseq +------------------------------------------ + + +Brief description: +------------------------------------------ +Extract host and pathogen expression using three methods, with options for various statistical outputs. +1) STAR + HTSeq +2) Salmon selective alignment +3) STAR + Salmon - alignment-based mode + +*/ + + + +def helpMessage() { + log.info nfcoreHeader() + log.info""" + + Usage: + + The typical command for running the pipeline is as follows: + + nextflow run nf-core/dualrnaseq --input '*_R{1,2}.fastq.gz' -profile docker + + Mandatory arguments: + --input [file] Path to input data (must be surrounded with quotes) + -profile [str] Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, test, podman, awsbatch, and more + + References and annotative files can be specified in the configuration file. + Alternatively, the following params can be edited directly. + + Library type and genome files: + --single_end [bool] Specifies that the input is single-end reads (default: false) + --fasta_host [file] Host genome ("folder/file.fa") + --fasta_pathogen [file] Pathogen genome + + Annotation files: + --gff_host [file] Host GFF + --gff_pathogen [file] Pathogen GFF + --gff_host_tRNA [file] Host tRNA (optional) + + The pipeline will automatically generate transcriptome files for both the host and pathogen. + These parameters should only be used when using custom transcriptome files + + Other options: + --outdir [file] The output directory where the results will be saved + --publish_dir_mode, [str] Mode for publishing results in the output directory. Available: symlink, rellink, link, copy, copyNoFollow, move (Default: copy) + --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits + --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful + --max_multiqc_email_size [str] Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + -name [str] Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic + + Transcriptome files: + --read_transcriptome_fasta_host_from_file [bool] Include custom host transcriptome + (Default: false) + --read_transcriptome_fasta_pathogen_from_file [bool] Include custom pathogen transcriptome + (Default: false) + --transcriptome_host [file] Custom host transcriptome + (Default: "") + --transcriptome_pathogen [file] Custom pathogen transcriptome + (Default: "") + + Trimming is performed by either Cutadapt or BBDuk with the following related options + + Cutadapt: + --run_cutadapt [bool] To run cutadapt (Default: false) + --a [str] Adapter sequence for single-end reads or first reads of paired-end data + (Default: "AGATCGGAAGAGCACACGTCTGAACTCCAGTCA") + --A [str] Adapter sequence for second reads of paired-end data + (Default: "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT") + --quality-cutoff [int] Cutoff to remove low-quality ends of reads. (Default: 10) + A single cutoff value is used to trim the 3’ end of reads. + If two comma-separated cutoffs are defined, the first value reprerents 5’ cutoff, + and the second value defines the 3’ cutoff. + --cutadapt_params [str] Set of additional cutadapt parameters + + BBDuk: + --run_bbduk [bool] To run BBDuk (Default: false) + --minlen [int] Reads shorter than this after trimming will be discarded. + Pairs will be discarded if both are shorter. + (Default: 18) + --qtrim [str] To trim read ends to remove bases with quality below trimq. + Possible options:rl (trim both ends), f (neither end), r (right end only), + l (left end only), w (sliding window). + (Default: "r") + --trimq [int] Cutoff to trim regions with average quality BELOW given value. + Option is avaiable if qtrim is set to something other than f. + (Default: 10) + --ktrim [str] To trim reads to remove bases matching reference kmers. + Avaiable options: f (don't trim), r (trim to the right - 3' adapters), + l (trim to the left - 5' adapters). + (Default: "r") + --k [int] Kmer length used for finding contaminants (adapters). Contaminants + shorter than k will not be found. k must be at least 1. + (Default: 17) + --mink [int] Look for shorter kmers at read tips down to this length, + when k-trimming or masking. 0 means disabled. Enabling + this will disable maskmiddle. + (Default: 11) + --hdist [int] Maximum Hamming distance for ref kmers (subs only). + (Default: 1) + --adapters [file] Fasta file with adapter sequences (Default: $projectDir/data/adapters.fa) + --bbduk_params [str] Set of additional BBDuk parameters + + Basic quality control is reported through FastQC, which is run on raw reads and trimmed reads. + + FastQC: + --skip_fastqc [bool] Option to skip running FastQC + --fastqc_params [str] Set of additional fastqc parameters + + The following options are related to the three main methods to extract gene expression: + + Salmon: + --libtype [str] To define the type of sequencing library of your data + (Default:'') + --kmer_length [int] To define the k-mer length (-k parameter in Salmon) + (Default: 21) + --writeUnmappedNames [bool] By default the pipeline does not save names of unmapped reads + (Default: false) + --softclipOverhangs [bool] By default, the pipeline does not allow soft-clipping of reads + (Default: false) + --incompatPrior [int] This is set to 0.0, to ensure that only mappings or alignments that + are compatible with the specified library type are considered by Salmon + (Default: 0.0) + --dumpEq [bool] To save the equivalence classes and their counts, change this option to True + (Default: false) + --writeMappings [bool] If set to True, the pipeline will create a files named mapping.sam + containing mapping information + (Default: false) + --keepDuplicates [bool] Option to remove/collapse identical transcripts during the indexing stage + (Default: false) + --generate_salmon_uniq_ambig [bool] Option to extract all of the unique and ambigious reads after quantification + Works for both Selective alignment and alignment-based modes + (Default: false) + + Salmon selective alignment: + --run_salmon_selective_alignment [bool] Run this mode + (Default: false) + --gene_attribute_gff_to_create_transcriptome_host [str] Host transcriptome - gene attributes + (Default: transcript_id) + --gene_feature_gff_to_create_transcriptome_host [str] Host transcriptome - gene feature + (Default: ["exon", "tRNA"]) + --gene_attribute_gff_to_create_transcriptome_pathogen [str] Pathogen transcriptome - gene attribute + (Default: locus_tag) + --gene_feature_gff_to_create_transcriptome_pathogen [str] Pathogen transcriptome - gene features + (Default: ["gene","sRNA","tRNA","rRNA"] ) + --salmon_sa_params_index [str] Set of additional parameters for creating an index with Salmon Selective Alignment + --salmon_sa_params_mapping [str] Set of additional parameters for mapping with Salmon Selective Alignment + + STAR - general options available for both modes, genome mapping with HTSeq quantification and salmon - alignment-based mode: + --run_star [bool] Run STAR + (Default: false) + --outSAMunmapped [str] By default, the pipeline saves unmapped reads + within the main BAM file. If you want to switch off this option, + set the --outSAMunmapped flag to None + (Default: Within) + --outSAMattributes [str] To specify the attributes of the output BAm file + (Default: Standard) + --outFilterMultimapNmax [int] To specify the maximum number of loci a read is allowed to map to + (Default: 999) + --outFilterType [str] By default, the pipeline keeps reads containing junctions that + passed filtering into the file SJ.out.tab. This option reduces + the number of ”spurious” junctions + (Default: BySJout) + --alignSJoverhangMin [int] The number of minimum overhang for unannotated junctions + (Default: 8) + --alignSJDBoverhangMin [int] The number of minimum overhang for annotated junctions + (Default: 1) + --outFilterMismatchNmax [int] To define a threshold for the number of mismatches to be allowed. + The pipeline uses a large number to switch this filter off + (Default: 999) + --outFilterMismatchNoverReadLmax [int] Here, you can define a threshold for a ratio of mismatches to + read length. The alignment will be considered if the ratio is + less than or equal to this value. For 2x100b, max number of + mismatches is 0.04x200=8 for paired-end reads + (Default: 1) + --alignIntronMin [int] By default, the nf-core dualrnaseq pipeline uses 20 as a + minimum intron length. If the genomic gap is smaller than this + value, it is considered as a deletion + (Default: 20) + --alignIntronMax [int] The maximum intron length + (Default: 1000000) + --alignMatesGapMax [int] The maximum genomic distance between mates is 1,000,000 + (Default: 1000000) + --limitBAMsortRAM [int] Option to limit RAM when sorting BAM file. + If 0, will be set to the genome index size, which can be quite + large when running on a desktop or laptop + (Default: 0) + --winAnchorMultimapNmax [int] By default, the nf-core dualrnaseq pipeline uses 999 as a + maximum number of loci anchors that are allowed to map to + (Default: 999) + --sjdbOverhang [int] To specify the length of the donor/acceptor sequence on each side of the junctions + used in constructing the splice junctions database. + ideally = (mate length - 1) + (Default: 100) + + STAR - additional options available only for genome mapping with HTSeq quantification mode + --outWigType [str] Used to generate signal outputs, such as "wiggle" and "bedGraph" + (Default: None) + --outWigStrand [str] Options are Stranded or Unstranded when defining + the strandedness of wiggle/bedGraph output + (Default: Stranded) + --star_index_params [str] Set of additional parameters for creating an index with STAR + --star_alignment_params [str] Set of additional parameters for alignment with STAR + + STAR - additional options available only for Salmon - alignment-based mode: + --quantTranscriptomeBan [str] The nf-core/dualrnaseq pipeline runs STAR to generate a + transcriptomic alignments. By default, it allows for insertions, + deletions and soft-clips (Singleend option). To prohibit this + behaviour, specify IndelSoftclipSingleend + (Default: Singleend) + --star_salmon_index_params [str] Set of additional parameters for creating an index with STAR in salmon alignment-based mode + --star_salmon_alignment_params [str] Set of additional parameters for alignment with STAR in salmon alignment-based mode + + Salmon - alignment-based mode: + --run_salmon_alignment_based_mode [bool] Option to run Salmn in alignment mode + (Default: false) + -- salmon_alignment_based_params [str] Set of additional parameters for Salmon in alignment-based mode + + HTSeq: + --run_htseq_uniquely_mapped [bool] Option to run HTSeq + (Default: false) + --stranded [char] Is library type stranded (yes/no) + (Default: yes) + --max_reads_in_buffer [int] To define the number of maximum reads allowed to + stay in memory until the mates are found + Has an effect for paired-end reads + (Default: 30000000) + --minaqual [int] To specify a threshold for a minimal MAPQ alignment quality.(Default: 10) + Reads with the MAPQ alignment quality below the given number will be removed + (Default: 10) + --htseq_params [str] Set of additional parameters for HTSeq + --gene_feature_gff_to_quantify_host [str] Host - gene features to quantify from GFF + (Default: ["exon","tRNA"] ) + --gene_feature_gff_to_quantify_pathogen [str] Pathogen - gene features to quantify from GFF + (Default: ["gene", "sRNA", "tRNA", "rRNA"] ) + --host_gff_attribute [str] Host - attribute from GFF + (Default: gene_id ) + --pathogen_gff_attribute [str] Pathogen - attribute from GFF + (Default: locus_tag) + + RNA mapping statistics: + --mapping_statistics [bool] Option to generate mapping statistics. This will create the following: + - Count the total number of reads before and after trimming + - Scatterplots comparing all replicates (separate for both host and pathogen reads) + - Plots of the % of mapped/quantified reads + - Plots of RNA-class statistics (as many types can be identified, + the parameter below --rna_classes_to_replace_host can help to summarise these) + (Default: false) + --rna_classes_to_replace_host [file] Located within the data/ directory, this tab delimited file contains headers which + groups similar types of RNA classes together. This helps to keep the RNA-class + names simplified for plotting purposes. + (Default: $projectDir/data/RNA_classes_to_replace.tsv) + + Report options: + --email [email] Set this parameter to your e-mail address to get a summary e-mail with details of the + run sent to you when the workflow exits + (Default: false) + --email_on_fail [email] Same as --email, except only send mail if the workflow is not successful + (Default: false) + --max_multiqc_email_size [str] Theshold size for MultiQC report to be attached in notification email. + If file generated by pipeline exceeds the threshold, it will not be attached + (Default: 25MB) + --plaintext_email [bool] Set to receive plain-text e-mails instead of HTML formatted + (Default: false) + --monochrome_logs [bool] Set to disable colourful command line output and live life in monochrome + (Default: false) + --multiqc_config [bool] Specify path to a custom MultiQC configuration file. + (Default: false) + + AWSBatch options: + --awsqueue [str] The AWSBatch JobQueue that needs to be set when running on AWSBatch + --awsregion [str] The AWS Region for your AWS Batch job to run on + --awscli [str] Path to the AWS CLI tool + """.stripIndent() +} + + +// Show help message +if (params.help) { + helpMessage() + exit 0 +} + + + +/* +-------------------------------------------------------------------- + +SET UP CONFIGURATIONs AND IDENTIFY USER-SPECIFIED VARIABLES + +-------------------------------------------------------------------- + */ + +//---------- +// Check if genomes exists in the config file +//---------- + +if (params.genomes && params.genome_host && !params.genomes.containsKey(params.genome_host)) { + exit 1, "The provided genome '${params.genome_host}' is not available in the genomes.config file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" +} + +if (params.genomes && params.genome_pathogen && !params.genomes.containsKey(params.genome_pathogen)) { + exit 1, "The provided genome '${params.genome_pathogen}' is not available in the genomes.config file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" +} + + +//---------- +// Reference genomes, annotation files and transcriptomes +//---------- +params.fasta_host = params.genome_host ? params.genomes[ params.genome_host ].fasta_host ?: false : false +if (params.fasta_host) { ch_fasta_host = file(params.fasta_host, checkIfExists: true) } + +params.fasta_pathogen = params.genome_pathogen ? params.genomes[ params.genome_pathogen ].fasta_pathogen ?: false : false +if (params.fasta_pathogen) { ch_fasta_pathogen = file(params.fasta_pathogen, checkIfExists: true) } + +params.gff_host_tRNA = params.genome_host ? params.genomes[ params.genome_host ].gff_host_tRNA ?: false : false +if (params.gff_host_tRNA) { ch_gff_host_tRNA = file(params.gff_host_tRNA, checkIfExists: true) } + +params.gff_host_genome = params.genome_host ? params.genomes[ params.genome_host ].gff_host ?: false : false +if (params.gff_host_genome) { ch_gff_host_genome = file(params.gff_host_genome, checkIfExists: true) } + +params.gff_pathogen = params.genome_pathogen ? params.genomes[ params.genome_pathogen ].gff_pathogen ?: false : false +if (params.gff_pathogen) { ch_gff_pathogen = file(params.gff_pathogen, checkIfExists: true) } + +if(params.read_transcriptome_fasta_host_from_file){ + params.transcriptome_host = params.genome_host ? params.genomes[ params.genome_host ].transcriptome_host ?: false : false + if (params.transcriptome_host) { ch_transcriptome_host = file(params.transcriptome_host, checkIfExists: true) } +} + +if(params.read_transcriptome_fasta_pathogen_from_file){ + params.transcriptome_pathogen = params.genome_pathogen ? params.genomes[ params.genome_pathogen ].transcriptome_pathogen ?: false : false + if (params.transcriptome_pathogen) { ch_transcriptome_pathogen = file(params.transcriptome_pathogen, checkIfExists: true) } +} + + +//---------- +// Trimming - check if only one of the trimming tools is set to "true" +//---------- +if (params.run_cutadapt & params.run_bbduk) { + exit 1, "Trimming: both --run_cutadapt and --run_bbduk are set to true, please use only one of the adapter trimming tools" +} + +//---------- +// BBDuk - fasta file of adapters +//---------- +if(params.run_bbduk) { + if (params.adapters) { adapter_database = file(params.adapters, checkIfExists: true) } +} + + +//---------- +// Salmon library type +//---------- +if (params.run_salmon_selective_alignment | params.run_salmon_alignment_based_mode){ + if (!params.libtype){ + exit 1, "Salmon: Please specify --libtype" +} else if (params.libtype == 'A'){ + // continue +} else if (params.single_end & (params.libtype != 'U' && params.libtype != 'SR' && params.libtype != 'SF')) { + exit 1, "Salmon: Invalid library type --libtype ${params.libtype}! Library types available for single-end reads are:'U', 'SR', 'SF'." +} else if (!params.single_end & (params.libtype != 'IU' && params.libtype != 'ISR' && params.libtype != 'ISF' && params.libtype != 'MU' && params.libtype != 'MSR' && params.libtype != 'MSF' && params.libtype != 'OU' && params.libtype != 'OSR' && params.libtype != 'OSF' )) { + exit 1, "Salmon: Invalid library type --libtype ${params.libtype}! Library types available for paired-end reads are: 'IU', 'ISR', 'ISF', 'MU', 'MSR', 'MSF','OU', 'OSR', 'OSF'." +} +} + + +//---------- +// HTSeq - check if there is an alignment file +//---------- +if (params.run_htseq_uniquely_mapped){ + if (!params.run_star){ + exit 1, "HTSeq: There is no alignment file. Please specify --run_star" + } +} + + +//---------- +// Salmon alignment based mode - check if there is an alignment file +//---------- +if (params.run_salmon_alignment_based_mode){ + if (!params.run_star){ + exit 1, "Salmon: There is no alignment file. Please specify --run_star" + } +} + +//---------- +// Mapping stats +//---------- +if(params.mapping_statistics) { + if (params.rna_classes_to_replace_host) { ch_RNA_classes = file(params.rna_classes_to_replace_host, checkIfExists: true) } +} + + +//---------- +// Has the run name been specified by the user? +// this has the bonus effect of catching both -name and --name +custom_runName = params.name +if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) { + custom_runName = workflow.runName +} + + +// Check AWS batch settings +if (workflow.profile.contains('awsbatch')) { + // AWSBatch sanity checking + if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" + // Check outdir paths to be S3 buckets if running on AWSBatch + // related: https://github.com/nextflow-io/nextflow/issues/813 + if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + // Prevent trace files to be stored on S3 since S3 does not support rolling files. + if (params.tracedir.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." +} + + +//---------- +// Stage config files +ch_multiqc_config = file("$workflow.projectDir/assets/multiqc_config.yaml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty() +ch_output_docs = file("$workflow.projectDir/docs/output.md", checkIfExists: true) +ch_output_docs_images = file("$workflow.projectDir/docs/images/", checkIfExists: true) + + + +/* + * Create a channel for input read files + */ +if (params.input_paths) { + if (params.single_end) { + Channel + .from(params.input_paths) + .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true) ] ] } + .ifEmpty { exit 1, "params.input_paths was empty - no input files supplied" } + .into { ch_read_files_fastqc; trimming_reads; raw_read_count; scatter_plots_set } + } else { + Channel + .from(params.input_paths) + .map { row -> [ row[0], [ file(row[1][0], checkIfExists: true), file(row[1][1], checkIfExists: true) ] ] } + .ifEmpty { exit 1, "params.input_paths was empty - no input files supplied" } + .into { ch_read_files_fastqc; trimming_reads; raw_read_count;scatter_plots_set } + } +} else { + Channel + .fromFilePairs(params.input, size: params.single_end ? 1 : 2) + .ifEmpty { exit 1, "Cannot find any reads matching: ${params.input}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --single_end on the command line." } + .into { ch_read_files_fastqc; trimming_reads; raw_read_count; scatter_plots_set} +} + + + +//---------- +// Channel for host and pathogen fasta files +//---------- +Channel + .value( ch_fasta_pathogen) + .collect() + .set { genome_fasta_pathogen_to_unzip} + +Channel + .value( ch_fasta_host ) + .set { genome_fasta_host_to_unzip} + + +//---------- +// Channel for host GFF and/or tRNA-based files +//---------- +if(params.gff_host_tRNA){ + Channel + .value(ch_gff_host_tRNA) + .set {host_gff_trna_file_to_unzip} + Channel + .value(ch_gff_host_genome) + .set {host_gff_trna_to_unzip} +}else{ + Channel + .value(ch_gff_host_genome) + .set {host_gff_to_unzip} +} + + +//---------- +// Channel for pathogen GFF files +//---------- +Channel + .value(ch_gff_pathogen) + .set {pathogen_gff_to_unzip} + + +//---------- +// Channel for host and pathogen transcriptomes +//---------- +if(params.read_transcriptome_fasta_host_from_file){ +Channel + .value(ch_transcriptome_host) + .into {host_transcriptome_to_combine; transcriptome_host_to_split_q_table_salmon; transcriptome_host_to_split_table_salmon; transcriptome_host_to_split_q_table_salmon_alignment_based; transcriptome_host_to_split_table_salmon_alignment; transcriptome_fasta_host_ref_names} +} + +if(params.read_transcriptome_fasta_pathogen_from_file){ +Channel + .value(ch_transcriptome_pathogen) + .into {pathogen_transcriptome_to_combine; transcriptome_pathogen_to_split_table_salmon; transcriptome_pathogen_to_split_table_salmon_alignment; transcriptome_pathogen_to_split_q_table_salmon; transcriptome_pathogen_to_split_q_table_salmon_alignment_based;transcriptome_fasta_pathogen_ref_names} +} + + +//---------- +// Channel to capture Cutadapt-based params +//---------- +if (params.run_cutadapt){ + if(params.single_end){ + Channel + .value( params.a ) + .set { adapter_sequence_3 } + } else { + Channel + .from (params.a, params.A) + .collect() + .set { adapter_sequence_3 } + } + Channel + .value(params.quality_cutoff) + .set { quality_cutoff} +} + + +//---------- +// Channel to capture BBDuk-based params +//---------- +if (params.run_bbduk){ + Channel + .value( adapter_database ) + .set { adapter_database } +} + + +//---------- +// Channel to capture Salmon-based params +//---------- +if (params.run_salmon_selective_alignment){ + Channel + .value(params.kmer_length) + .set {kmer_length_salmon_index} + +} + + +if (params.run_salmon_selective_alignment | params.run_salmon_alignment_based_mode) { + Channel + .value(params.gene_attribute_gff_to_create_transcriptome_host) + .into {host_gff_attribute_salmon_alignment_tRNA; gene_attribute_gff_to_create_transcriptome_host_salmon; host_atr_collect_data_salmon; combine_annot_quant_pathogen; combine_annot_quant_host; atr_scatter_plot_pathogen; atr_scatter_plot_host; attribute_quant_stats_salmon; host_annotations_RNA_class_stats_pathogen; attribute_host_RNA_class_stats; host_atr_collect_data_salmon_alignment_mode; combine_annot_quant_pathogen_salmon_alignment_based; combine_annot_quant_host_salmon_alignment_based; atr_scatter_plot_pathogen_alignment; atr_scatter_plot_host_alignment; attribute_quant_stats_salmon_alignment;host_annotations_RNA_class_stats_pathogen_alignment; attribute_host_RNA_class_stats_alignment} + + Channel + .value(params.gene_feature_gff_to_create_transcriptome_host) + .collect() + .into { gene_feature_gff_host_salmon_alignment; gene_feature_gff_to_create_transcriptome_host_salmon} + + Channel + .value(params.gene_attribute_gff_to_create_transcriptome_pathogen) + .into {pathogen_gff_attribute_salmon_alignment; gene_attribute_gff_to_create_transcriptome_pathogen_salmon} + + Channel + .value(params.gene_feature_gff_to_create_transcriptome_pathogen) + .collect() + .into {gene_feature_to_quantify_pathogen_salmon_alignment; gene_feature_to_extract_annotations_pathogen; gene_feature_gff_to_create_transcriptome_pathogen_salmon} + + Channel + .value(params.libtype) + .into {libtype_salmon; libtype_salmon_alignment_mode} +} + + +//---------- +// Channel to capture htseq and star-based params +//---------- +if(params.run_htseq_uniquely_mapped | params.run_star){ + + Channel + .value(params.gene_feature_gff_to_quantify_host) + .collect() + .into {gene_feature_to_quantify_host; gene_feature_to_extract_annotations_host_htseq} + + Channel + .value(params.gene_feature_gff_to_quantify_pathogen) + .collect() + .into {gene_feature_to_quantify_pathogen; gene_feature_to_extract_annotations_pathongen_htseq} + + Channel + .value(params.pathogen_gff_attribute) + .into { pathogen_gff_attribute; pathogen_gff_attribute_to_extract_annotations_htseq} + + Channel + .value(params.stranded) + .set { stranded_htseq_unique} + + Channel + .value(params.host_gff_attribute) + .into { host_gff_attribute_to_pathogen; host_gff_attribute_htseq; host_gff_attribute_htseq_combine; host_gff_attribute_to_extract_annotations_htseq; host_gff_attribute_mapping_stats_htseq; host_gff_attribute_RNA_class_pathogen_htseq; host_gff_attribute_RNA_class_host_htseq; combine_annot_quant_pathogen_host_gff_attribute; host_gff_attribute_htseq_TPM; atr_scatter_plot_pathogen_htseq_u_m; atr_scatter_plot_host_htseq_u_m} +} + + +//---------- +// Channel to capture mapping statistics-based params +//---------- +if(params.mapping_statistics) { + +Channel + .value(ch_RNA_classes) + .into { RNA_classes_to_replace; RNA_classes_to_replace_alignment; RNA_classes_to_replace_htseq_uniquely_mapped} +} + + + + +/* +-------------------------------------------------------------------- + +HEADER LOG INFO + +-------------------------------------------------------------------- +*/ + + +log.info nfcoreHeader() +def summary = [:] +if (workflow.revision) summary['Pipeline Release'] = workflow.revision +summary['Run Name'] = custom_runName ?: workflow.runName + +// TODO nf-core: Report custom parameters here +summary['Input'] = params.input +summary['Host Fasta Ref'] = params.fasta_host +summary['Pathogen Fasta Ref'] = params.fasta_pathogen +summary['Host tRNA gff Ref'] = params.gff_host_tRNA +summary['Host genome gff Ref'] = params.gff_host_genome +summary['Pathogen genome gff Ref'] = params.gff_pathogen +summary['Data Type'] = params.single_end ? 'Single-End' : 'Paired-End' +summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" +if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" +summary['Output dir'] = params.outdir +summary['Launch dir'] = workflow.launchDir +summary['Working dir'] = workflow.workDir +summary['Script dir'] = workflow.projectDir +summary['User'] = workflow.userName + +if (workflow.profile.contains('awsbatch')) { + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue + summary['AWS CLI'] = params.awscli +} + +summary['Config Profile'] = workflow.profile +if (params.config_profile_description) summary['Config Profile Description'] = params.config_profile_description +if (params.config_profile_contact) summary['Config Profile Contact'] = params.config_profile_contact +if (params.config_profile_url) summary['Config Profile URL'] = params.config_profile_url +summary['Config Files'] = workflow.configFiles.join(', ') + +if (params.email || params.email_on_fail) { + summary['E-mail Address'] = params.email + summary['E-mail on failure'] = params.email_on_fail + summary['MultiQC maxsize'] = params.max_multiqc_email_size +} + +log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") +log.info "-\033[2m--------------------------------------------------\033[0m-" + +// Check the hostnames against configured profiles +checkHostname() + +Channel.from(summary.collect{ [it.key, it.value] }) + .map { k,v -> "
$k
${v ?: 'N/A'}
" } + .reduce { a, b -> return [a, b].join("\n ") } + .map { x -> """ + id: 'nf-core-dualrnaseq-summary' + description: " - this information is collected when the pipeline is started." + section_name: 'nf-core/dualrnaseq Workflow Summary' + section_href: 'https://github.com/nf-core/dualrnaseq' + plot_type: 'html' + data: | +
+ $x +
+ """.stripIndent() } + .set { ch_workflow_summary } + + + + +/* +-------------------------------------------------------------------- + +PARSE SOFTWARE VERSION NUMBERS + +-------------------------------------------------------------------- +*/ + + +process get_software_versions { + publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode, + saveAs: { filename -> + if (filename.indexOf(".csv") > 0) filename + else null + } + + output: + file 'software_versions_mqc.yaml' into ch_software_versions_yaml + file "software_versions.csv" + + script: + """ + echo $workflow.manifest.version > v_pipeline.txt + echo $workflow.nextflow.version > v_nextflow.txt + python --version > v_python.txt + R --version > v_r.txt + cutadapt --version > v_cutadapt.txt + fastqc --version > v_fastqc.txt + multiqc --version > v_multiqc.txt + STAR --version > v_star.txt + htseq-count . . --version > v_htseq.txt + samtools --version > v_samtools.txt + gffread --version > v_gffread.txt + salmon --version > v_salmon.txt + scrape_software_versions.py &> software_versions_mqc.yaml + """ +} + + +/* +-------------------------------------------------------------------- +Workflow - Processes +-------------------------------------------------------------------- +*/ + + +/* + * STEP 1 - Check if there are technical replicates - to plot scatter plots of technical replicates + */ + +if(params.mapping_statistics) { + + scatter_plots_set + .map { tag, file -> tag } + .set {scatter_plots} + + process check_replicates { + tag "check_replicates" + + label 'process_high' + + input: + val(sample_name) from scatter_plots.collect() + + output: + stdout repl_scatter_plots_salmon_pathogen + stdout repl_scatter_plots_salmon_host + stdout repl_scatter_plots_salmon_alignment_host + stdout repl_scatter_plots_salmon_alignment_pathogen + stdout repl_scatter_plots_htseq_pathogen + stdout repl_scatter_plots_htseq_host + + shell: + ''' + python !{workflow.projectDir}/bin/check_replicates.py -s !{sample_name} 2>&1 + ''' + } +} + + + +/* + * STEP 2 - Prepare reference files + */ + +//Uncompress Pathogen genome +process uncompress_pathogen_fasta_genome { + tag "uncompress_pathogen_genome" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + + label 'process_high' + + input: + each file(f_ext) from genome_fasta_pathogen_to_unzip.collect() + + output: + file "${base_name_file}.fasta" into genome_fasta_pathogen_to_combine + file "${base_name_file}.fasta" into genome_fasta_pathogen_ref_names + file "${base_name_file}.fasta" into genome_fasta_pathogen_to_transcriptome + + shell: + ext_file = f_ext.getExtension() + base_name_file = f_ext.getBaseName() + if (ext_file == "fasta" | ext_file == "fa"){ + ''' + cp -n !{f_ext} !{base_name_file}.fasta + ''' + }else if(ext_file == "zip"){ + old_base_name_file = base_name_file + base_name_file = old_base_name_file.replaceAll(/.fasta|.fa/,"") + ''' + gunzip -f -S .zip !{f_ext} + cp -n !{old_base_name_file} !{base_name_file}.fasta + ''' + }else if(ext_file == "gz"){ + old_base_name_file = base_name_file + base_name_file = old_base_name_file.replaceAll(/.fasta|.fa/,"") + ''' + gunzip -f !{f_ext} + cp -n !{old_base_name_file} !{base_name_file}.fasta + ''' + }else { + ''' + echo "Your pathogen genome files appear to have the wrong extension. \n Currently, the pipeline only supports .fasta or .fa, or compressed files with .zip or .gz extensions." + ''' + } +} + + +//Uncompress Host genome +process uncompress_host_fasta_genome { + tag "uncompress_host_genome" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + + label 'process_high' + + input: + file(f_ext) from genome_fasta_host_to_unzip + + output: + file "${base_name_file}.fasta" into genome_fasta_host_to_combine + file "${base_name_file}.fasta" into genome_fasta_host_to_decoys + file "${base_name_file}.fasta" into genome_fasta_host_ref_names + file "${base_name_file}.fasta" into genome_fasta_host_to_transcriptome + file "${base_name_file}.fasta" into genome_fasta_host_to_transcriptome_tRNA + + shell: + //Tests to see if the input files are compressed. + //At this stage, only accepts fasta or .fa files, or .gz or .zip genome files. + ext_file = f_ext.getExtension() + base_name_file = f_ext.getBaseName() + if (ext_file == "fasta" | ext_file == "fa"){ + ''' + cp -n !{f_ext} !{base_name_file}.fasta + ''' + }else if(ext_file == "zip"){ + old_base_name_file = base_name_file + base_name_file = old_base_name_file.replaceAll(/.fasta|.fa/,"") + ''' + gunzip -f -S .zip !{f_ext} + cp -n !{old_base_name_file} !{base_name_file}.fasta + ''' + }else if(ext_file == "gz"){ + old_base_name_file = base_name_file + base_name_file = old_base_name_file.replaceAll(/.fasta|.fa/,"") + ''' + gunzip -f !{f_ext} + cp -n !{old_base_name_file} !{base_name_file}.fasta + ''' + }else { + ''' + echo "Your host genome files appear to have the wrong extension. \n Currently, the pipeline only supports .fasta or .fa, or compressed files with .zip or .gz extensions." + ''' + } +} + + + +//Uncompress Pathogen GFF +process uncompress_pathogen_gff { + tag "uncompress_pathogen_GFF" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + + label 'process_high' + + input: + file(f_ext) from pathogen_gff_to_unzip + + output: + file "${base_name_file}.gff3" into gff_feature_quant_pathogen_salmon_alignment + file "${base_name_file}.gff3" into gff_pathogen_create_transcriptome + file "${base_name_file}.gff3" into gff_feature_quant_pathogen_htseq + file "${base_name_file}.gff3" into extract_annotations_pathogen_gff_htseq + + shell: + //Tests to see if the input files are compressed. + //At this stage, only accepts .gff or gff3, or .gz or .zip annotation files. + ext_file = f_ext.getExtension() + base_name_file = f_ext.getBaseName() + + if (ext_file == "gff" | ext_file == "gff3"){ + ''' + cp -n !{f_ext} !{base_name_file}.gff3 + ''' + }else if(ext_file == "zip"){ + ''' + gunzip -f -S .zip !{f_ext} + cp -n !{base_name_file} !{base_name_file}.gff3 + ''' + }else if(ext_file == "gz"){ + //if gff or gff3, need to save as .gff3 + old_base_name_file = base_name_file + base_name_file = old_base_name_file.replaceAll(/.gff|.gff3/,"") + ''' + gunzip -f !{f_ext} + cp -n !{old_base_name_file} !{base_name_file}.gff3 + ''' + }else { + ''' + echo "Your pathogen GFF file appears to be in the wrong format or has the wrong extension. \n Currently, the pipeline only supports .gff or .gff3, or compressed files with .zip or .gz extensions." + ''' + } +} + + + + +//Uncompress host GFF (no tRNA) +process uncompress_host_gff { + tag "uncompress_host_GFF" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + + label 'process_high' + + input: + file(f_ext) from host_gff_to_unzip + + output: + file "${base_name_file}.gff3" into gff_host_genome_star_salmon_change_atr + file "${base_name_file}.gff3" into gff_host_create_transcriptome + file "${base_name_file}.gff3" into gff_host_genome_htseq + file "${base_name_file}.gff3" into extract_annotations_host_gff_htseq + file "${base_name_file}.gff3" into gff_host_star_alignment_gff + file "${base_name_file}.gff3" into gff_host_star_htseq_alignment_gff + file "${base_name_file}.gff3" into genome_gff_star_index + + shell: + //Tests to see if the input files are compressed. + //At this stage, only accepts .gff or gff3, or .gz or .zip annotation files. + ext_file = f_ext.getExtension() + base_name_file = f_ext.getBaseName() + + if (ext_file == "gff" | ext_file == "gff3"){ + ''' + cp -n !{f_ext} !{base_name_file}.gff3 + ''' + }else if(ext_file == "zip"){ + ''' + gunzip -f -S .zip !{f_ext} + cp -n !{base_name_file} !{base_name_file}.gff3 + ''' + }else if(ext_file == "gz"){ + //if gff or gff3, need to save as .gff3 + old_base_name_file = base_name_file + base_name_file = old_base_name_file.replaceAll(/.gff|.gff3/,"") + ''' + gunzip -f !{f_ext} + cp -n !{old_base_name_file} !{base_name_file}.gff3 + ''' + }else { + ''' + echo "Your host GFF file appears to be in the wrong format or has the wrong extension. \n Currently, the pipeline only supports .gff or .gff3, or compressed files with .zip or .gz extensions." + ''' + } +} + + + + +if(params.gff_host_tRNA){ + //Uncompress host GFF (with tRNA) + process uncompress_host_gff_trna { + tag "uncompress_host_GFF_trna" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + + label 'process_high' + + input: + file(f_ext) from host_gff_trna_to_unzip + + output: + file "${base_name_file}.gff3" into gff_host_genome_star_salmon_change_atr + file "${base_name_file}.gff3" into gff_host_create_transcriptome + file "${base_name_file}.gff3" into combine_gff_host_genome_htseq + file "${base_name_file}.gff3" into gff_host_star_alignment_gff + file "${base_name_file}.gff3" into gff_host_star_htseq_alignment_gff + file "${base_name_file}.gff3" into genome_gff_star_index + + shell: + //Tests to see if the input files are compressed. + //At this stage, only accepts .gff or gff3, or .gz or .zip annotation files. + ext_file = f_ext.getExtension() + base_name_file = f_ext.getBaseName() + + if (ext_file == "gff" | ext_file == "gff3"){ + ''' + cp -n !{f_ext} !{base_name_file}.gff3 + ''' + }else if(ext_file == "zip"){ + ''' + gunzip -f -S .zip !{f_ext} + cp -n !{base_name_file} !{base_name_file}.gff3 + ''' + }else if(ext_file == "gz"){ + //if gff or gff3, need to save as .gff3 + old_base_name_file = base_name_file + base_name_file = old_base_name_file.replaceAll(/.gff|.gff3/,"") + ''' + gunzip -f !{f_ext} + cp -n !{old_base_name_file} !{base_name_file}.gff3 + ''' + }else { + ''' + echo "Your host GFF file appears to be in the wrong format or has the wrong extension. \n Currently, the pipeline only supports .gff or .gff3, or compressed files with .zip or .gz extensions." + ''' + } + } + + + //Uncompress host GFF (with tRNA) + process uncompress_host_gff_trna_file { + tag "uncompress_host_GFF_trna_file" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + + label 'process_high' + + input: + file(f_ext) from host_gff_trna_file_to_unzip + + output: + file "${base_name_file}.gff3" into change_attrubute_gff_host_tRNA_salmon_alignment + file "${base_name_file}.gff3" into gff_host_create_transcriptome_tRNA + file "${base_name_file}.gff3" into combine_gff_host_tRNA_htseq + + shell: + //Tests to see if the input files are compressed. + //At this stage, only accepts .gff or gff3, or .gz or .zip annotation files. + ext_file = f_ext.getExtension() + base_name_file = f_ext.getBaseName() + + if (ext_file == "gff" | ext_file == "gff3"){ + ''' + cp -n !{f_ext} !{base_name_file}.gff3 + ''' + }else if(ext_file == "zip"){ + ''' + gunzip -f -S .zip !{f_ext} + cp -n !{base_name_file} !{base_name_file}.gff3 + ''' + }else if(ext_file == "gz"){ + //if gff or gff3, need to save as .gff3 + old_base_name_file = base_name_file + base_name_file = old_base_name_file.replaceAll(/.gff|.gff3/,"") + ''' + gunzip -f !{f_ext} + cp -n !{old_base_name_file} !{base_name_file}.gff3 + ''' + }else { + ''' + echo "Your host GFF tRNA file appears to be in the wrong format or has the wrong extension. \n Currently, the pipeline only supports .gff or .gff3, or compressed files with .zip or .gz extensions." + ''' + } + } + +} // end of "if(params.gff_host_tRNA){" + + + + +/* + * combine pathogen and host genome fasta files + */ + +process combine_pathogen_host_fasta_genome { + tag "combine_genome_fa_files" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(host_fa) from genome_fasta_host_to_combine + file(pathogen_fa) from genome_fasta_pathogen_to_combine.collect() + + output: + file "host_pathogen.fasta" into host_pathogen_fasta_index + file "host_pathogen.fasta" into host_pathogen_fasta_star_index + file "host_pathogen.fasta" into genome_fasta_file_host_pathogen_to_decoy_transcriptome + + script: + """ + cat $pathogen_fa $host_fa > host_pathogen.fasta + """ +} + + +if(params.run_htseq_uniquely_mapped) { + + /* + * combine host genome and tRNA gff files + */ + + if(params.gff_host_tRNA){ + process combine_host_genome_tRNA_gff_files_htseq { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "comb_host_genome_tRNA_gff" + + label 'process_high' + + input: + file(host_gff_genome) from combine_gff_host_genome_htseq + file(host_gff_tRNA) from combine_gff_host_tRNA_htseq + + output: + file "${outfile_name}" into gff_host_genome_htseq + file "${outfile_name}" into extract_annotations_host_gff_htseq + + script: + outfile_name = host_gff_genome[0].toString().replaceAll(/.gff3|.gff/,"_with_tRNA.gff3") + """ + cat $host_gff_genome $host_gff_tRNA > ${outfile_name} + """ + } + } + + + /* + * Replace gene features (3rd column of gff) with 'quant' in host gff + */ + + process replace_gene_feature_gff_host_htseq { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "repl_gene_feature_gff_host" + + label 'process_high' + + input: + file(gff) from gff_host_genome_htseq + val(features) from gene_feature_to_quantify_host + + output: + file "${outfile_name}" into combine_gff_host + file "${outfile_name}" into gff_host_to_TPM + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_quant_feature.gff3") + """ + $workflow.projectDir/bin/replace_feature_gff.sh $gff ${outfile_name} $features + """ + } + + + /* + * Replace gene features (3rd column of gff) with 'quant' in pathogen gff + */ + + process replace_gene_feature_gff_pathogen_htseq { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "repl_gene_feature_gff_pathogen" + + label 'process_high' + + input: + file(gff) from gff_feature_quant_pathogen_htseq + val(features) from gene_feature_to_quantify_pathogen + + output: + file "${outfile_name}" into to_replace_gff_attribute + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_quant_feature.gff3") + """ + $workflow.projectDir/bin/replace_feature_gff.sh $gff ${outfile_name} $features + """ + } + + + /* + * Replace pathogen gene attribute in 9th column of pathogen gff with host attribute + */ + + process replace_attribute_pathogen_gff_htseq { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "repl_attribute_pathogen_gff" + + label 'process_high' + + input: + file(gff) from to_replace_gff_attribute + val(host_attribute) from host_gff_attribute_to_pathogen + val(pathogen_attribute) from pathogen_gff_attribute + + output: + file "${outfile_name}" into combine_gff_pathogen + file "${outfile_name}" into gff_pathogen_to_TPM + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_new_attribute.gff3") + """ + $workflow.projectDir/bin/replace_attribute_gff.sh $gff ${outfile_name} $host_attribute $pathogen_attribute + """ + } + + + /* + * combine pathogen and host genome gff files + */ + + process combine_pathogen_host_gff_files_htseq { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "comb_pathogen_host_gff_files" + + label 'process_high' + + input: + file(host_gff) from combine_gff_host + file(pathogen_gff_genome) from combine_gff_pathogen + + output: + file "host_pathogen_htseq.gff" into quantification_gff_u_m + + script: + """ + cat $pathogen_gff_genome $host_gff > host_pathogen_htseq.gff + """ + } + + + /* + * extract annotations from pathogen gff files - for RNA class statistics and to merge with quantification results + */ + + process extract_annotations_pathogen_htseq { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "extract_annotations_pathogen" + + label 'process_high' + + input: + file gff from extract_annotations_pathogen_gff_htseq + val(features) from gene_feature_to_extract_annotations_pathongen_htseq + val(pathogen_attribute) from pathogen_gff_attribute_to_extract_annotations_htseq + + output: + file "${outfile_name}*_htseq.tsv" into pathogen_annotations_RNA_class_stats_htseq + file "${outfile_name}*_htseq.tsv" into annotation_pathogen_combine_quant_htseq_u_m + file "${outfile_name}*_htseq.tsv" into annotation_pathogen_split_quant_htseq + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"") + """ + python $workflow.projectDir/bin/extract_annotations_from_gff.py -gff $gff -f $features -a $pathogen_attribute -org pathogen -q_tool htseq -o ${outfile_name} + """ + } + + + /* + * extract annotations from host gff files - for RNA class statistics and to merge with quantification results + */ + + process extract_annotations_host_htseq { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "extract_annotations_host" + + label 'process_high' + + input: + file gff from extract_annotations_host_gff_htseq + val(features) from gene_feature_to_extract_annotations_host_htseq + val(host_attribute) from host_gff_attribute_to_extract_annotations_htseq + + output: + file "${outfile_name}*_htseq.tsv" into host_annotations_RNA_class_stats_htseq + file "${outfile_name}*_htseq.tsv" into annotation_host_combine_quant_htseq + file "${outfile_name}*_htseq.tsv" into annotation_host_split_quant_htseq + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"") + """ + python $workflow.projectDir/bin/extract_annotations_from_gff.py -gff $gff -f $features -a $host_attribute -org host -q_tool htseq -o ${outfile_name} + """ + } +} + + + +if(params.run_star | params.run_salmon_alignment_based_mode) { + + if(params.mapping_statistics) { + + /* + * extract reference names from host genome fasta files - for mapping stats + */ + + process extract_reference_names_host_star { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "extract_ref_names_host_star" + + label 'process_high' + + input: + file(host_fa) from genome_fasta_host_ref_names + + output: + file "reference_host_names.txt" into reference_host_names_uniquelymapped + file "reference_host_names.txt" into reference_host_names_crossmapped_find + file "reference_host_names.txt" into reference_host_names_multimapped + + script: + """ + $workflow.projectDir/bin/extract_reference_names_from_fasta_files.sh reference_host_names.txt $host_fa + """ + } + + + + /* + * extract reference names from pathogen genome fasta files - for mapping stats + */ + + process extract_reference_names_pathogen_star { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "extract_ref_names_pathgn_star" + + label 'process_high' + + input: + file(pathogen_fa) from genome_fasta_pathogen_ref_names.collect() + + output: + file "reference_pathogen_names.txt" into reference_pathogen_names_uniquelymapped + file "reference_pathogen_names.txt" into reference_pathogen_crossmapped_find + file "reference_pathogen_names.txt" into reference_pathogen_names_multimapped + + script: + """ + $workflow.projectDir/bin/extract_reference_names_from_fasta_files.sh reference_pathogen_names.txt $pathogen_fa + """ + } + } +} + + + +if(params.run_salmon_selective_alignment | params.run_salmon_alignment_based_mode) { + + + /* + * Replace 'Parent' gene attribute in 9th column of host gff with 'parent' + */ + + process replace_attribute_host_genome_gff_star_salmon { + tag "repl_attribute_host_genome_gff" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(gff) from gff_host_genome_star_salmon_change_atr + + output: + file "${outfile_name}" into combine_gff_host_genome_star_salmon + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_parent_attribute.gff3") + + """ + $workflow.projectDir/bin/replace_attribute_gff.sh $gff ${outfile_name} parent Parent + """ + } + + + + + if(params.gff_host_tRNA){ + + /* + * Replace gene attribute in 9th column of host tRNA gff with 'parent' + */ + + process replace_attribute_host_tRNA_gff_star_salmon { + tag "repl_attribute_host_tRNA_gff" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(gff) from change_attrubute_gff_host_tRNA_salmon_alignment + val(host_attribute) from host_gff_attribute_salmon_alignment_tRNA + + output: + file "${outfile_name}" into combine_host_gffs + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_parent_attribute.gff3") + """ + $workflow.projectDir/bin/replace_attribute_gff.sh $gff ${outfile_name} parent $host_attribute + """ + } + + /* + * Combine tRNA gff file with genome host gff + */ + + process combine_host_genome_tRNA_gff_star_salmon { + tag "comb_host_genome_tRNA_gff" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(host_gff_genome) from combine_gff_host_genome_star_salmon + file(host_gff_tRNA) from combine_host_gffs + + output: + file "${outfile_name}" into gff_host_tRNA_genome_salmon_alignment + + script: + outfile_name = host_gff_genome[0].toString().replaceAll(/.gff3|.gff/,"_with_tRNA_STAR_salmon.gff3") + """ + cat $host_gff_genome $host_gff_tRNA > ${outfile_name} + """ + } + } + + gff_host_genome_salmon_alignment = params.gff_host_tRNA ? gff_host_tRNA_genome_salmon_alignment : combine_gff_host_genome_star_salmon + + + /* + * Replace gene features (3rd column of gff) with 'quant' in host gff + */ + + process replace_gene_feature_gff_host_salmon { + tag "repl_gene_feature_gff_host" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(gff) from gff_host_genome_salmon_alignment + val(features) from gene_feature_gff_host_salmon_alignment + + output: + file "${outfile_name}" into combine_gff_host_salmon_alignment + file "${outfile_name}" into extract_annotations_host_gff_salmon + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_quant_feature_salmon_alignment.gff3") + """ + $workflow.projectDir/bin/replace_feature_gff.sh $gff ${outfile_name} $features + """ + } + + + /* + * Replace gene attribute in 9th column of pathogen gff with 'parent' + */ + + process replace_attribute_pathogen_gff_star_salmon { + tag "repl_attribute_pathogen_gff" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(gff) from gff_feature_quant_pathogen_salmon_alignment + val(pathogen_attribute) from pathogen_gff_attribute_salmon_alignment + + output: + file "${outfile_name}" into to_replace_gff_feature_salmon_alignment + file "${outfile_name}" into extract_annotations_pathogen_gff_salmon + + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_parent_attribute.gff3") + """ + $workflow.projectDir/bin/replace_attribute_gff.sh $gff ${outfile_name} parent $pathogen_attribute + """ + } + + + /* + * Extract annotations from pathogen gff file - for RNA class statistics and to merge with quantification results + */ + + process extract_annotations_pathogen_salmon { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "extract_gff_annots_pathogen" + + label 'process_high' + + input: + file gff from extract_annotations_pathogen_gff_salmon + val(features) from gene_feature_to_extract_annotations_pathogen + + output: + file "${outfile_name}*_salmon.tsv" into pathogen_annotations_RNA_class_stats + file "${outfile_name}*_salmon.tsv" into pathogen_annotations_RNA_class_stats_salmon_alignment + file "${outfile_name}*_salmon.tsv" into annotation_pathogen_combine_quant + file "${outfile_name}*_salmon.tsv" into annotation_pathogen_combine_quant_salmon_alignment_based + + script: + outfile_name = "pathogen_gff_annotations" + """ + python $workflow.projectDir/bin/extract_annotations_from_gff.py -gff $gff -f $features -a parent -org pathogen -q_tool salmon -o ${outfile_name} + """ + } + + + /* + * Extract annotations from host gff file - for RNA class statistics and to merge with quantification results + */ + + process extract_annotations_host_salmon { + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + tag "extract_gff_annots_host" + + label 'process_high' + + input: + file gff from extract_annotations_host_gff_salmon + + output: + file "${outfile_name}*_salmon.tsv" into host_annotations_RNA_class_stats + file "${outfile_name}*_salmon.tsv" into host_annotations_RNA_class_stats_salmon_alignment + file "${outfile_name}*_salmon.tsv" into tximport_annotations + file "${outfile_name}*_salmon.tsv" into host_annotations_uniq_ambig + file "${outfile_name}*_salmon.tsv" into tximport_annotations_salmon_alignment + file "${outfile_name}*_salmon.tsv" into host_annotations_uniq_ambig_AB + file "${outfile_name}*_salmon.tsv" into annotation_host_combine_quant + file "${outfile_name}*_salmon.tsv" into annotation_host_combine_quant_salmon_alignment_based + file "${outfile_name}*_salmon.tsv" into annotation_host_combine_quant_gene_level_salmon + file "${outfile_name}*_salmon.tsv" into annotation_host_combine_quant_gene_level_salmon_alignment + + script: + outfile_name = "host_gff_annotations" + """ + python $workflow.projectDir/bin/extract_annotations_from_gff.py -gff $gff -f quant -a parent -org host -q_tool salmon -o ${outfile_name} + """ + } + + + + if(!params.read_transcriptome_fasta_host_from_file){ + + /* + * create host transcriptome fasta file + */ + + process create_transcriptome_fasta_host { + tag "create_transcripts_host" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(gff) from gff_host_create_transcriptome + file(host_fa) from genome_fasta_host_to_transcriptome + + output: + file "${outfile_name}" into host_transcriptome + file "${outfile_name}" into transcriptome_host_to_split_table_salmon_without_tRNA + file "${outfile_name}" into transcriptome_host_to_split_table_salmon_alignment_without_tRNA + file "${outfile_name}" into transcriptome_host_to_split_q_table_salmon_without_tRNA + file "${outfile_name}" into transcriptome_host_to_split_q_table_salmon_alignment_based_without_tRNA + file "${outfile_name}" into host_transcriptome_to_combine_host + file "${outfile_name}" into transcriptome_fasta_host_ref_names_without_tRNA + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_transcriptome.fasta") + """ + gffread -w $outfile_name -g $host_fa $gff + """ + } + + + if(params.gff_host_tRNA){ + + /* + * Create host tRNA transcriptome fasta file + */ + + process create_transcriptome_fasta_host_tRNA { + tag "create_transcripts_tRNA_host" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(gff) from gff_host_create_transcriptome_tRNA + file(host_fa) from genome_fasta_host_to_transcriptome_tRNA + val(features) from gene_feature_gff_to_create_transcriptome_host_salmon + val(attribute) from gene_attribute_gff_to_create_transcriptome_host_salmon + + output: + file "${outfile_name}" into host_transcriptome_to_combine_tRNA + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_transcriptome.fasta") + """ + python $workflow.projectDir/bin/gff_to_fasta_transcriptome.py -fasta $host_fa -gff $gff -f $features -a $attribute -o $outfile_name + """ + } + + /* + * Combine host genome transcriptome with tRNA transcriptome + */ + + process combine_host_fasta_transcriptomes { + tag "comb_host_fa_tRNA_transcripts" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(host_tr_fa) from host_transcriptome_to_combine_host + file(host_tRNA_tr_fa) from host_transcriptome_to_combine_tRNA + + output: + file "host_transcriptome.fasta" into host_transcriptome_genome_tRNA + file "host_transcriptome.fasta" into transcriptome_host_to_split_table_salmon_with_tRNA + file "host_transcriptome.fasta" into transcriptome_host_to_split_table_salmon_alignment_with_tRNA + file "host_transcriptome.fasta" into transcriptome_host_to_split_q_table_salmon_with_tRNA + file "host_transcriptome.fasta" into transcriptome_host_to_split_q_table_salmon_alignment_based_with_tRNA + file "host_transcriptome.fasta" into transcriptome_fasta_host_ref_names_with_tRNA + + script: + """ + cat $host_tr_fa $host_tRNA_tr_fa > host_transcriptome.fasta + """ + } + } + + host_transcriptome_to_combine = params.gff_host_tRNA ? host_transcriptome_genome_tRNA : host_transcriptome + transcriptome_host_to_split_q_table_salmon = params.gff_host_tRNA ? transcriptome_host_to_split_q_table_salmon_with_tRNA : transcriptome_host_to_split_q_table_salmon_without_tRNA + transcriptome_host_to_split_table_salmon = params.gff_host_tRNA ? transcriptome_host_to_split_table_salmon_with_tRNA : transcriptome_host_to_split_table_salmon_without_tRNA + transcriptome_host_to_split_q_table_salmon_alignment_based = params.gff_host_tRNA ? transcriptome_host_to_split_q_table_salmon_alignment_based_with_tRNA : transcriptome_host_to_split_q_table_salmon_alignment_based_without_tRNA + transcriptome_host_to_split_table_salmon_alignment = params.gff_host_tRNA ? transcriptome_host_to_split_table_salmon_alignment_with_tRNA : transcriptome_host_to_split_table_salmon_alignment_without_tRNA + transcriptome_fasta_host_ref_names = params.gff_host_tRNA ? transcriptome_fasta_host_ref_names_with_tRNA : transcriptome_fasta_host_ref_names_without_tRNA +} + + + + if(!params.read_transcriptome_fasta_pathogen_from_file){ + + /* + * create pathogen transcriptome fasta file + */ + + process create_transcriptome_fasta_pathogen { + tag "create_transcripts_fa_pathogen" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'main' + label 'process_high' + + input: + file(gff) from gff_pathogen_create_transcriptome + file(pathogen_fa) from genome_fasta_pathogen_to_transcriptome.collect() + val(features) from gene_feature_gff_to_create_transcriptome_pathogen_salmon + val(attribute) from gene_attribute_gff_to_create_transcriptome_pathogen_salmon + + output: + file "${outfile_name}" into pathogen_transcriptome_to_combine + file "${outfile_name}" into transcriptome_pathogen_to_split_table_salmon + file "${outfile_name}" into transcriptome_pathogen_to_split_table_salmon_alignment + file "${outfile_name}" into transcriptome_pathogen_to_split_q_table_salmon + file "${outfile_name}" into transcriptome_pathogen_to_split_q_table_salmon_alignment_based + file "${outfile_name}" into transcriptome_fasta_pathogen_ref_names + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_transcriptome.fasta") + """ + python $workflow.projectDir/bin/gff_to_fasta_transcriptome.py -fasta $pathogen_fa -gff $gff -f $features -a $attribute -o $outfile_name + """ + } + } + + + /* + * combine pathogen and host transcriptome fasta files + */ + + + process combine_pathogen_host_fasta_transcriptome { + tag "comb_pathogen_host_transcripts" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(host_tr_fa) from host_transcriptome_to_combine + file(pathogen_tr_fa) from pathogen_transcriptome_to_combine + + output: + file "host_pathogen_transcriptome.fasta" into transcriptome_fasta_file_host_pathogen_to_decoy_transcriptome + file "host_pathogen_transcriptome.fasta" into transcriptome_salmon_alignment_based_mode + + script: + """ + cat $pathogen_tr_fa $host_tr_fa > host_pathogen_transcriptome.fasta + """ + } + +} + + +if(params.run_salmon_alignment_based_mode){ + + /* + * Replace gene features (3rd column of gff) with 'quant' in pathogen gff + */ + + process replace_gene_feature_gff_pathogen_salmon { + tag "repl_gene_feature_gff_pathogen" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(gff) from to_replace_gff_feature_salmon_alignment + val(features) from gene_feature_to_quantify_pathogen_salmon_alignment + + output: + file "${outfile_name}" into combine_gff_pathogen_salmon_alignment + + script: + outfile_name = gff[0].toString().replaceAll(/.gff3|.gff/,"_quant_feature_salmon_alignment.gff3") + """ + $workflow.projectDir/bin/replace_feature_gff.sh $gff ${outfile_name} $features + """ + } + + + /* + * Combine pathogen gff with host gff + */ + + + process combine_pathogen_host_gff_files_salmon { + tag "combine_pathogen_host_gff" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(host_gff) from combine_gff_host_salmon_alignment + file(pathogen_gff_genome) from combine_gff_pathogen_salmon_alignment + + output: + file "host_pathogen_star_alignment_mode.gff" into gff_host_pathogen_star_salmon_alignment_gff + + script: + """ + cat $pathogen_gff_genome $host_gff > host_pathogen_star_alignment_mode.gff + """ + } +} + + +/* + * STEP 3 - FastQC + */ + +if (!params.skip_fastqc) { + process fastqc { + tag "$name" + label 'process_medium' + + publishDir "${params.outdir}/fastqc", mode: params.publish_dir_mode + //saveAs: { filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"} + storeDir "${params.outdir}/fastqc" + + input: + set val(name), file(reads) from ch_read_files_fastqc + + output: + file "*_fastqc.{zip,html}" into ch_fastqc_results + + script: + fastqc_params = params.fastqc_params + """ + fastqc --quiet --threads $task.cpus --noextract $reads $fastqc_params + """ + } +}else{ + Channel.empty() + .set {ch_fastqc_results} +} + + +/* + * STEP 4 - Trimming + */ + + +if (params.run_cutadapt | params.run_bbduk) { + + /* + * run Cutadapt + */ + + if (params.run_cutadapt) { + process trimming { + tag "$name_reads" + publishDir "${params.outdir}/trimming_cutadapt", mode: params.publish_dir_mode + storeDir "${params.outdir}/trimming_cutadapt" + + label 'process_high' + + input: + set val(name), file(reads) from trimming_reads + val adapter_seq_3 from adapter_sequence_3 + val q_value from quality_cutoff + + output: + set val(name_sample), file("${name_sample}{_1,_2,}_trimmed.fastq.gz") into trimming_results_star_htseq, trimming_results_to_salmon, trimming_results_to_qc, trimming_results_star_salmon + + script: + cutadapt_params = params.cutadapt_params + if (params.single_end){ + name_reads = reads.toString().replaceAll(/:/,"_") + name_reads = name_reads.replaceAll(/-/,"_") + name_out = name_reads.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"_trimmed.fastq.gz") + name_sample = name_reads.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"") + + """ + cutadapt -j ${task.cpus} -q $q_value -a $adapter_seq_3 -m 1 -o ${name_out} $reads $cutadapt_params + """ + } else{ + name_reads = name.replaceAll(/:/,"_") + name_reads = name_reads.replaceAll(/-/,"_") + name_sample = name_reads.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"") + name_1 = reads[0][0].toString().replaceAll(/:/,"_") + name_1 = name_1.replaceAll(/-/,"_") + name_1 = name_1.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"_trimmed.fastq.gz") + name_2 = reads[1][0].toString().replaceAll(/:/,"_") + name_2 = name_2.replaceAll(/-/,"_") + name_2 = name_2.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"_trimmed.fastq.gz") + """ + cutadapt -j ${task.cpus} -q $q_value -a ${adapter_seq_3[0]} -A ${adapter_seq_3[1]} -o ${name_1} -p ${name_2} -m 1 ${reads[0]} ${reads[1]} $cutadapt_params + """ + } + } + + } + + /* + * run BBDuk + */ + + if (params.run_bbduk) { + process bbduk { + tag "$name_reads" + publishDir "${params.outdir}/trimming_bbduk", mode: params.publish_dir_mode + storeDir "${params.outdir}/trimming_bbduk" + + label 'process_high' + + input: + set val(name), file(reads) from trimming_reads + file adapters from adapter_database + + output: + set val(name_sample), file("${name_sample}{_1,_2,}_trimmed.fastq.gz") into trimming_results_star_htseq, trimming_results_to_salmon, trimming_results_to_qc, trimming_results_star_salmon + file "*.log" + + script: + minlen = params.minlen + qtrim = params.qtrim + trimq = params.trimq + ktrim = params.ktrim + k = params.k + mink = params.mink + hdist = params.hdist + bbduk_params = params.bbduk_params + if (params.single_end){ + name_reads = reads.toString().replaceAll(/:/,"_") + name_reads = name_reads.replaceAll(/-/,"_") + name_out = name_reads.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"_trimmed.fastq.gz") + name_sample = name_reads.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"") + fileoutput = name_sample + '.log' + """ + bbduk.sh -Xmx1g in=$reads out=${name_out} ref=$adapters minlen=$minlen qtrim=$qtrim trimq=$trimq ktrim=$ktrim k=$k mink=$mink hdist=$hdist &> $fileoutput $bbduk_params + """ + } else{ + name_reads = name.replaceAll(/:/,"_") + name_reads = name_reads.replaceAll(/-/,"_") + name_sample = name_reads.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"") + name_1 = reads[0][0].toString().replaceAll(/:/,"_") + name_1 = name_1.replaceAll(/-/,"_") + name_1 = name_1.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"_trimmed.fastq.gz") + name_2 = reads[1][0].toString().replaceAll(/:/,"_") + name_2 = name_2.replaceAll(/-/,"_") + name_2 = name_2.replaceAll(/.fastq.gz|.fq.gz|.fastq|.fq/,"_trimmed.fastq.gz") + fileoutput = name_sample + '.log' + """ + bbduk.sh -Xmx1g in1=${reads[0]} in2=${reads[1]} out1=${name_1} out2=${name_2} ref=$adapters minlen=$minlen qtrim=$qtrim trimq=$trimq ktrim=$ktrim k=$k mink=$mink hdist=$hdist $bbduk_params tpe tbo &> $fileoutput + """ + } + } + + } +}else{ + trimming_reads + .into {trimming_results_to_qc; trimming_results_star_htseq; trimming_results_to_salmon; trimming_results_star_salmon} +} + + + +/* + * STEP 5 -FastQC after trimming + */ + + + +if (params.run_cutadapt | params.run_bbduk & !params.skip_fastqc) { + process fastqc_after_trimming { + tag "$sample_name" + publishDir "${params.outdir}/fastqc_after_trimming", mode: params.publish_dir_mode + //saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"} + storeDir "${params.outdir}/fastqc_after_trimming" + + label 'process_medium' + + input: + set val(name),file(reads) from trimming_results_to_qc + + output: + file "*_trimmed_fastqc.{zip,html}" into ch_fastqc_trimmed_results + + script: + sample_name = name.replaceFirst(/.fastq.gz|.fq.gz|.fastq|.fq/, "") + fastqc_params = params.fastqc_params + """ + fastqc --threads ${task.cpus} --quiet --noextract $reads $fastqc_params + """ + } +}else{ + Channel.empty() + .set {ch_fastqc_trimmed_results} +} + + + +if(params.run_cutadapt | params.run_bbduk & params.mapping_statistics) { + + raw_read_count + .map { tag, file -> file } + .set {raw_read_count_file} + /* + * count total number of raw single-end reads + */ + process count_total_reads { + tag "count_total_reads" + publishDir "${params.outdir}/mapping_statistics", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics" + + label 'process_high' + + input: + file(fastq) from raw_read_count_file.collect() + output: + file "total_raw_reads_fastq.tsv" into to_collect_total_reads + + script: + """ + $workflow.projectDir/bin/count_total_reads.sh $fastq >> total_raw_reads_fastq.tsv + """ + } + + if (!params.single_end){ + + /* + * count total number of paired-end reads + */ + process count_total_read_pairs { + tag "count_total_reads" + publishDir "${params.outdir}/mapping_statistics", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics" + + label 'process_high' + + input: + file(tsv) from to_collect_total_reads.collect() + output: + file "total_raw_read_pairs_fastq.tsv" into collect_total_reads_raw_salmon + file "total_raw_read_pairs_fastq.tsv" into collect_total_reads_raw_salmon_alignment + file "total_raw_read_pairs_fastq.tsv" into collect_total_reads_raw_star + file "total_raw_read_pairs_fastq.tsv" into collect_total_reads_raw_star_for_salmon + + script: + """ + $workflow.projectDir/bin/collect_total_raw_read_pairs.py -i $tsv + """ + } + }else{ + to_collect_total_reads + .into {collect_total_reads_raw_salmon; collect_total_reads_raw_salmon_alignment; collect_total_reads_raw_star; collect_total_reads_raw_star_for_salmon} + } +}else{ + Channel.empty() + .into {collect_total_reads_raw_salmon; collect_total_reads_raw_salmon_alignment; collect_total_reads_raw_star; collect_total_reads_raw_star_for_salmon} +} + + + +/* + * STEP 6 - Salmon Selective Alignment + */ + + +if(params.run_salmon_selective_alignment) { + + /* + * create genome as decoy sequence + */ + + process create_decoy_transcriptome_file { + tag "create_decoy_transcripts_file" + publishDir "${params.outdir}/references", mode: params.publish_dir_mode + storeDir "${params.outdir}/references" + + label 'process_high' + + input: + file(host_fa) from genome_fasta_host_to_decoys + //file(host_tr_fa) from host_transcriptome_to_combine + file(host_pathogen_genome_fasta) from genome_fasta_file_host_pathogen_to_decoy_transcriptome + file(host_pathogen_transcriptome_fasta) from transcriptome_fasta_file_host_pathogen_to_decoy_transcriptome + + output: + file "gentrome.fasta" into salmon_index_gentrome + file "decoys.txt" into salmon_index_decoys + + shell: + ''' + grep ">" !{host_fa} | cut -d " " -f 1 > decoys.txt + sed -i -e 's/>//g' decoys.txt + cat !{host_pathogen_transcriptome_fasta} !{host_fa} > gentrome.fasta + ''' + } + + + /* +  * Salmon - build an index +  */ + + process salmon_index { + tag "salmon_index" + storeDir "${params.outdir}/salmon" + + label 'process_high' + + input: + file(gentrome) from salmon_index_gentrome + file(decoys) from salmon_index_decoys + val(kmer_length) from kmer_length_salmon_index + + output: + file "transcripts_index" into salmon_index + + script: + salmon_sa_params_index = params.salmon_sa_params_index + keepDuplicates = params.keepDuplicates ? "--keepDuplicates" : '' + """ + salmon index -t $gentrome -i transcripts_index --decoys $decoys -k $kmer_length -p ${task.cpus} $keepDuplicates $salmon_sa_params_index + """ + } + + + /* +  * Salmon - quantification +  */ + + + process salmon_quantification { + storeDir "${params.outdir}/salmon" + tag "${sample}" + + label 'process_high' + + input: + file(index) from salmon_index.collect() + set val(sample), file(reads) from trimming_results_to_salmon + val(libtype) from libtype_salmon + + output: + set val(sample_name), file("${sample_name}") into split_table + set val(sample_name), file("${sample_name}") into split_table_uniq_ambig + file("${sample_name}") into salmon_files_to_combine + file("${sample_name}") into multiqc_salmon_quant + set val(sample_name), file("${sample_name}") into collect_processed_read_counts + + script: + UnmappedNames = params.writeUnmappedNames ? "--writeUnmappedNames" : '' + softclip = params.softclipOverhangs ? "--softclipOverhangs" : '' + incompatPrior = params.incompatPrior + dumpEq = params.dumpEq ? "--dumpEq" : '' + salmon_sa_params_mapping = params.salmon_sa_params_mapping + if (params.single_end){ + sample_name = sample.replaceFirst(/.fastq.gz|.fq.gz|.fastq|.fq/, "") + writeMappings = params.writeMappings ? "--writeMappings=$sample_name/mapping.sam" : '' + """ + salmon quant -p ${task.cpus} -i $index -l $libtype -r $reads $softclip --incompatPrior $incompatPrior $UnmappedNames --validateMappings $dumpEq $writeMappings -o $sample_name $salmon_sa_params_mapping + """ + } else{ + sample_name = sample.replaceFirst(/.fastq.gz|.fq.gz|.fastq|.fq/, "") + writeMappings = params.writeMappings ? "--writeMappings=$sample_name/mapping.sam" : '' + """ + salmon quant -p ${task.cpus} -i $index -l $libtype -1 ${reads[0]} -2 ${reads[1]} $softclip --incompatPrior $incompatPrior $UnmappedNames --validateMappings $dumpEq $writeMappings -o $sample_name $salmon_sa_params_mapping + """ + } + } + + + + /* +  * Salmon - split quantification tables into host and pathogen results +  */ + + + process split_table_salmon_each { + publishDir "${params.outdir}/salmon/${sample_name}", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon/${sample_name}" + tag "split_quantification ${sample_name}" + + label 'process_high' + + input: + set val(sample_name), file ("salmon/*") from split_table + file transcriptome_pathogen from transcriptome_pathogen_to_split_q_table_salmon + file transcriptome_host from transcriptome_host_to_split_q_table_salmon + + output: + set val(sample_name), file("host_quant.sf") into salmon_host_tximport + + script: + """ + $workflow.projectDir/bin/split_quant_tables_salmon.sh $transcriptome_pathogen $transcriptome_host salmon/*/quant.sf "quant.sf" + """ + } + + + if(params.generate_salmon_uniq_ambig) { + + /* + * Extract and combine the ambig and unique counts + */ + + process extract_ambig_uniq_transcripts_genes { + publishDir "${params.outdir}/salmon/${sample_name}/aux_info", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon/${sample_name}/aux_info" + tag "extract_ambig_uniq_transcripts_genes ${sample_name}" + + label 'process_high' + + input: + set val(sample_name), file("salmon/*") from split_table_uniq_ambig + file (annotations) from host_annotations_uniq_ambig + + + output: + file "${sample_name}_host_quant_ambig_uniq.sf" + file "${sample_name}_pathogen_quant_ambig_uniq.sf" + file "${sample_name}_host_quant_ambig_uniq_gene_level.sf" + set val(sample_name), file("${sample_name}_host_quant_ambig_uniq.sf") into host_files_comb + set val(sample_name), file("${sample_name}_pathogen_quant_ambig_uniq.sf") into path_files_comb + + script: + """ + $workflow.projectDir/bin/salmon_extract_ambig_uniq_transcripts_genes.R salmon/*/quant.sf salmon/*/aux_info/ambig_info.tsv $sample_name $annotations + """ + } + + /* + * Combine the host ambig and unique counts + */ + process host_comb_ambig_uniq { + publishDir "${params.outdir}/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon" + tag "host_comb_ambig_uniq" + + label 'process_high' + + input: + file("salmon/*") from host_files_comb.collect() + + output: + file "host_quant_combined_ambig_uniq.tsv" + + script: + """ + $workflow.projectDir/bin/salmon_host_comb_ambig_uniq.R salmon/*/aux_info/*_host_quant_ambig_uniq.sf + """ + } + + /* + * Combine the pathogen ambig and unique counts + */ + process pathogen_comb_ambig_uniq { + publishDir "${params.outdir}/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon" + tag "pathogen_comb_ambig_uniq" + + label 'process_high' + + input: + file("salmon/*") from path_files_comb.collect() + + output: + file "pathogen_quant_combined_ambig_uniq.tsv" + + script: + """ + $workflow.projectDir/bin/salmon_pathogen_comb_ambig_uniq.R salmon/*/aux_info/*_pathogen_quant_ambig_uniq.sf + """ + } + } + + + + /* +  * Salmon - combine quantification results +  */ + + + process combine_quantification_tables_salmon { + publishDir "${params.outdir}/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon" + tag "combine_quantification_salmon" + label 'process_high' + + input: + file input_quantification from salmon_files_to_combine.collect() + val gene_attribute from host_atr_collect_data_salmon + + output: + file "combined_quant.tsv" into split_table_salmon + + script: + """ + python $workflow.projectDir/bin/collect_quantification_data.py -i $input_quantification -q salmon -a $gene_attribute -org both + """ + } + + + + /* +  * Salmon - split quantification tables into host and pathogen results +  */ + + + process split_quantification_tables_salmon { + publishDir "${params.outdir}/salmon", mode: params.publish_dir_mode + tag "split_quant_table_salmon" + + label 'process_high' + + input: + file quant_table from split_table_salmon + file transcriptome_pathogen from transcriptome_pathogen_to_split_table_salmon + file transcriptome_host from transcriptome_host_to_split_table_salmon + + output: + file 'host_quant_salmon.tsv' into host_quantification_mapping_stats_salmon + file 'pathogen_quant_salmon.tsv' into pathogen_quantification_mapping_stats_salmon + file 'host_quant_salmon.tsv' into host_quantification_RNA_stats_salmon + file 'pathogen_quant_salmon.tsv' into pathogen_quantification_RNA_stats_salmon + file 'host_quant_salmon.tsv' into quant_host_add_annotations + file 'pathogen_quant_salmon.tsv' into quant_pathogen_add_annotations + file 'host_quant_salmon.tsv' into quant_scatter_plot_host + file 'pathogen_quant_salmon.tsv' into quant_scatter_plot_pathogen + env pathonen_tab into pathonen_tab into scatterplots_pathogen_salmon + env host_tab into host_tab into scatterplots_host_salmon + + script: + """ + $workflow.projectDir/bin/split_quant_tables_salmon.sh $transcriptome_pathogen $transcriptome_host $quant_table "quant_salmon.tsv" + pathonen_tab=\$(if [ \$(cat pathogen_quant_salmon.tsv | wc -l) -gt 1 ]; then echo "true"; else echo "false"; fi) + host_tab=\$(if [ \$(cat host_quant_salmon.tsv | wc -l) -gt 1 ]; then echo "true"; else echo "false"; fi) + """ + } + + + /* +  * Salmon - combine pathogen annotations extracted from gff with quantification tables +  */ + + process combine_annotations_quant_pathogen { + publishDir "${params.outdir}/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon" + tag "comb_annots_quant_pathgn_salmon" + + label 'process_high' + + input: + file quantification_table from quant_pathogen_add_annotations + file annotation_table from annotation_pathogen_combine_quant + val attribute from combine_annot_quant_pathogen + + output: + file "pathogen_combined_quant_annotations.tsv" + + script: + """ + $workflow.projectDir/bin/combine_quant_annotations.py -q $quantification_table -annotations $annotation_table -a $attribute -org pathogen + """ + } + + + /* +  * Salmon - combine host annotations extracted from gff with quantification tables +  */ + + process combine_annotations_quant_host_salmon { + publishDir "${params.outdir}/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon" + tag "comb_annots_quant_host_salmon" + + label 'process_high' + + input: + file quantification_table from quant_host_add_annotations + file annotation_table from annotation_host_combine_quant + val attribute from combine_annot_quant_host + + output: + file "host_combined_quant_annotations.tsv" + + script: + """ + $workflow.projectDir/bin/combine_quant_annotations.py -q $quantification_table -annotations $annotation_table -a $attribute -org host + """ + } + + + + /* +  * Tximport - host +  */ + + process tximport_host { + publishDir "${params.outdir}/salmon/${sample_name}", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon/${sample_name}" + tag "tximport_host" + + label 'process_high' + + input: + set val(sample_name), file("salmon/${sample_name}/*") from salmon_host_tximport + file (annotations) from tximport_annotations + + output: + file "${sample_name}_host_quant_gene_level.sf" into salmon_files_to_combine_gene_level + + script: + """ + $workflow.projectDir/bin/tximport.R salmon $annotations $sample_name + """ + } + + + /* +  * Salmon - Combine host gene level quantification results estimated with Tximport +  */ + + process combine_host_quant_gene_level_salmon { + publishDir "${params.outdir}/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon" + tag "comb_host_quant_genes_salmon" + + label 'process_high' + + input: + file input_quantification from salmon_files_to_combine_gene_level.collect() + + output: + file "host_combined_gene_level.tsv" into quant_gene_level_host_add_annotations_salmon + + script: + """ + python $workflow.projectDir/bin/collect_quantification_data.py -i $input_quantification -q salmon -a gene_id -org host_gene_level + """ + } + + + /* +  * Salmon - combine host annotations extracted from gff with gene-level quantification estimates +  */ + + process combine_annotations_quant_gene_level_salmon { + publishDir "${params.outdir}/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon" + tag "comb_annots_gene_host_salmon" + + label 'process_high' + + input: + file quantification_table from quant_gene_level_host_add_annotations_salmon + file annotation_table from annotation_host_combine_quant_gene_level_salmon + + output: + file "host_combined_quant_gene_level_annotations.tsv" + + script: + """ + $workflow.projectDir/bin/combine_annotations_salmon_gene_level.py -q $quantification_table -annotations $annotation_table -a gene_id -org host + """ + } + + + + if(params.mapping_statistics) { + + /* +  * Salmon - plot scatter plots of technical replicates for pathogen +  */ + + process scatter_plot_pathogen_salmon { + publishDir "${params.outdir}/mapping_statistics/salmon/scatter_plots", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon/scatter_plots" + tag "scatter_plot_salmon_pathogen" + + label 'process_high' + + input: + file quant_table from quant_scatter_plot_pathogen + val attribute from atr_scatter_plot_pathogen + val replicates from repl_scatter_plots_salmon_pathogen + val pathogen_table_non_empty from scatterplots_pathogen_salmon + + output: + file ('*.pdf') + + when: + replicates.toBoolean() + pathogen_table_non_empty.toBoolean() + + script: + """ + python $workflow.projectDir/bin/scatter_plots.py -q $quant_table -a $attribute -org pathogen + """ + } + + + /* +  * Salmon - plot scatter plots of technical replicates for host +  */ + + process scatter_plot_host_salmon { + publishDir "${params.outdir}/mapping_statistics/salmon/scatter_plots", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon/scatter_plots" + tag "scatter_plot_salmon_host" + + label 'process_high' + + input: + file quant_table from quant_scatter_plot_host + val attribute from atr_scatter_plot_host + val replicates from repl_scatter_plots_salmon_host + val host_table_non_empty from scatterplots_host_salmon + + output: + file ('*.pdf') + + when: + replicates.toBoolean() + host_table_non_empty.toBoolean() + + script: + """ + python $workflow.projectDir/bin/scatter_plots.py -q $quant_table -a $attribute -org host + """ + } + + + /* +  * Salmon - Extract processed reads from log file +  */ + + process extract_processed_reads { + publishDir "${params.outdir}/mapping_statistics/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon" + tag "extract_processed_reads" + + label 'process_high' + + input: + set val(sample_name), file ("salmon/*") from collect_processed_read_counts + + output: + file "${sample_name}.txt" into collect_results + + script: + """ + $workflow.projectDir/bin/extract_processed_reads.sh salmon/*/aux_info/meta_info.json $sample_name salmon + """ + } + + + /* +  * Salmon - Collect processed reads from all samples +  */ + + process collect_processed_reads { + publishDir "${params.outdir}/mapping_statistics/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon" + tag "collect_processed_reads" + + label 'process_high' + + input: + file process_reads from collect_results.collect() + + output: + file "processed_reads_salmon.tsv" into mapping_stats_total_reads + + script: + """ + cat $process_reads > processed_reads_salmon.tsv + """ + } + + + /* +  * Salmon - Collect mapping statistics +  */ + + process salmon_quantification_stats { + publishDir "${params.outdir}/mapping_statistics/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon" + tag "quantification_stats_salmon" + + label 'process_high' + + input: + file quant_table_host from host_quantification_mapping_stats_salmon + file quant_table_pathogen from pathogen_quantification_mapping_stats_salmon + val attribute from attribute_quant_stats_salmon + file total_processed_reads from mapping_stats_total_reads + file total_raw_reads from collect_total_reads_raw_salmon.ifEmpty('.') + + output: + file ('salmon_host_pathogen_total_reads.tsv') into salmon_mapped_stats_to_plot + + script: + """ + python $workflow.projectDir/bin/mapping_stats.py -q_p $quant_table_pathogen -q_h $quant_table_host -total_processed $total_processed_reads -total_raw $total_raw_reads -a $attribute -t salmon -o salmon_host_pathogen_total_reads.tsv + """ + } + + /* +  * Salmon - Plot mapping statistics +  */ + + process plot_salmon_mapping_stats_host_pathogen { + tag "$name2" + publishDir "${params.outdir}/mapping_statistics/salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon" + + label 'process_high' + + input: + file(stats) from salmon_mapped_stats_to_plot + + output: + file "*.tsv" + file "*.pdf" + + script: + """ + python $workflow.projectDir/bin/plot_mapping_statistics_salmon.py -i $stats + """ + } + + + + /* +  * Salmon - Collect pathogen RNA class statistics +  */ + + + process RNA_class_statistics_salmon_pathogen { + publishDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_pathogen", mode: params.publish_dir_mode + tag "rna_class_stats_pathgn_salmon" + + label 'process_high' + + input: + file quant_table from pathogen_quantification_RNA_stats_salmon + val attribute from host_annotations_RNA_class_stats_pathogen + file gene_annotations from pathogen_annotations_RNA_class_stats + + output: + file "pathogen_RNA_classes_percentage_*.tsv" into plot_RNA_stats_pathogen + file "pathogen_RNA_classes_percentage_*.tsv" into plot_RNA_stats_pathogen_combined + file "pathogen_RNA_classes_sum_counts_*.tsv" + stdout plot_RNA_stats_pathogen_boolean + stdout plot_RNA_stats_pathogen_combined_boolean + + shell: + ''' + python !{workflow.projectDir}/bin/RNA_class_content.py -q !{quant_table} -a !{attribute} -annotations !{gene_annotations} -q_tool salmon -org pathogen 2>&1 + ''' + } + + + /* +  * Salmon - Collect host RNA class statistics +  */ + + process RNA_class_statistics_salmon_host { + publishDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_host", mode: params.publish_dir_mode + tag "rna_class_stats_host_salmon" + + label 'process_high' + + input: + file quant_table from host_quantification_RNA_stats_salmon + val attribute from attribute_host_RNA_class_stats + file gene_annotations from host_annotations_RNA_class_stats + file rna_classes_to_replace from RNA_classes_to_replace + + output: + file "host_RNA_classes_percentage_*.tsv" into plot_RNA_stats_host + file "host_RNA_classes_percentage_*.tsv" into plot_RNA_stats_host_combined + file "host_RNA_classes_sum_counts_*.tsv" + file "host_gene_types_groups_*" + stdout plot_RNA_stats_host_combined_boolean + stdout plot_RNA_stats_host_boolean + + shell: + ''' + python !{workflow.projectDir}/bin/RNA_class_content.py -q !{quant_table} -a !{attribute} -annotations !{gene_annotations} -rna !{rna_classes_to_replace} -q_tool salmon -org host 2>&1 + ''' + } + + + /* +  * Salmon - Plot pathogen RNA class statistics for each sample separately +  */ + + process plot_RNA_class_salmon_pathogen_each { + publishDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_pathogen", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_pathogen" + tag "plot_RNA_stats_pathogen_salmon" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_pathogen + val plot_rna from plot_RNA_stats_pathogen_boolean + + output: + file "*.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_each.py -i $stats_table + """ + } + + + /* +  * Salmon - Plot pathogen RNA class statistics for all samples +  */ + + process plot_RNA_class_salmon_pathogen_combined { + publishDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_pathogen", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_pathogen" + tag "plot_RNA_stats_comb_pathogen" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_pathogen_combined + val plot_rna from plot_RNA_stats_pathogen_combined_boolean + + output: + file "RNA_class_stats_combined_pathogen.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_combined.py -i $stats_table -org pathogen + """ + } + + /* +  * Salmon - Plot host RNA class statistics for each sample separately +  */ + + process plot_RNA_class_salmon_host_each { + publishDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_host", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_host" + tag "plot_RNA_stats_host_salmon" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_host + val plot_rna from plot_RNA_stats_host_boolean + + output: + file "*.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_each.py -i $stats_table + """ + } + + /* +  * Salmon - Plot host RNA class statistics for all samples +  */ + + process plot_RNA_class_salmon_host_combined { + publishDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_host", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon/RNA_classes_host" + tag "plot_RNA_stats_comb_host" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_host_combined + val plot_rna from plot_RNA_stats_host_combined_boolean + + output: + file "RNA_class_stats_combined_host.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_combined.py -i $stats_table -org host + """ + } + } +}else{ + Channel.empty() + .set {multiqc_salmon_quant} +} + + +/* + * STEP 7 - Salmon alignment_based_mode + */ + + +if (params.run_salmon_alignment_based_mode){ + + /* +  * STAR - index +  */ + + process STARindex_salmon_alignment { + publishDir "${params.outdir}/STAR_for_salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/STAR_for_salmon" + tag "STAR_index" + + label 'process_high' + + input: + file(fasta) from host_pathogen_fasta_index + file(gff) from genome_gff_star_index + + output: + file "index/*" into star_index_transcriptome_alignment + + script: + sjdbOverhang = params.sjdbOverhang + star_salmon_index_params = params.star_salmon_index_params + """ + mkdir index + STAR --runThreadN ${task.cpus} --runMode genomeGenerate --genomeDir index/ --genomeFastaFiles $fasta --sjdbGTFfile $gff --sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent --sjdbOverhang $sjdbOverhang $star_salmon_index_params + """ + } + + + /* +  * STAR - alignment +  */ + + process ALIGNMENTstar_for_salmon { + tag "$sample_name" + publishDir "${params.outdir}/STAR_for_salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/STAR_for_salmon" + + label 'process_medium' + + input: + set val(sample_name),file(reads) from trimming_results_star_salmon + file(gff) from gff_host_pathogen_star_salmon_alignment_gff.collect() + file(index) from star_index_transcriptome_alignment.collect() + + output: + file "${sample_name}/*" into multiqc_star_for_salmon_alignment + set val(sample_name), file("${sample_name}/${sample_name}Aligned.toTranscriptome.out.bam") into salmon_quantify_alignment_based_mode + file "${sample_name}/*" + set val(sample_name), file("${sample_name}/${sample_name}Log.final.out") into collect_processed_read_counts_STAR_for_salmon + + script: + outSAMunmapped = params.outSAMunmapped + outSAMattributes = params.outSAMattributes + quantTranscriptomeBan = params.quantTranscriptomeBan + outFilterMultimapNmax = params.outFilterMultimapNmax + outFilterType = params.outFilterType + alignSJoverhangMin = params.alignSJoverhangMin + alignSJDBoverhangMin = params.alignSJDBoverhangMin + outFilterMismatchNmax = params.outFilterMismatchNmax + outFilterMismatchNoverReadLmax = params.outFilterMismatchNoverReadLmax + alignIntronMin = params.alignIntronMin + alignIntronMax = params.alignIntronMax + alignMatesGapMax = params.alignMatesGapMax + limitBAMsortRAM = params.limitBAMsortRAM + readFilesCommand = reads[0].toString().endsWith('.gz') ? "--readFilesCommand zcat" : '' + winAnchorMultimapNmax = params.winAnchorMultimapNmax + star_salmon_alignment_params = params.star_salmon_alignment_params + + if (params.single_end){ + """ + mkdir $sample_name + STAR --runThreadN ${task.cpus} --genomeDir . --sjdbGTFfile $gff $readFilesCommand --readFilesIn $reads --outSAMtype BAM Unsorted --outSAMunmapped $outSAMunmapped --outSAMattributes $outSAMattributes --outFileNamePrefix $sample_name/$sample_name --sjdbGTFfeatureExon quant --sjdbGTFtagExonParentTranscript parent --quantMode TranscriptomeSAM --quantTranscriptomeBan $quantTranscriptomeBan --outFilterMultimapNmax $outFilterMultimapNmax --outFilterType $outFilterType --limitBAMsortRAM $limitBAMsortRAM --alignSJoverhangMin $alignSJoverhangMin --alignSJDBoverhangMin $alignSJDBoverhangMin --outFilterMismatchNmax $outFilterMismatchNmax --outFilterMismatchNoverReadLmax $outFilterMismatchNoverReadLmax --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --alignMatesGapMax $alignMatesGapMax --winAnchorMultimapNmax $winAnchorMultimapNmax $star_salmon_alignment_params + """ + } else { + """ + mkdir $sample_name + STAR --runThreadN ${task.cpus} --genomeDir . --sjdbGTFfile $gff $readFilesCommand --readFilesIn ${reads[0]} ${reads[1]} --outSAMtype BAM Unsorted --outSAMunmapped $outSAMunmapped --outSAMattributes $outSAMattributes --outFileNamePrefix $sample_name/$sample_name --sjdbGTFfeatureExon quant --sjdbGTFtagExonParentTranscript parent --quantMode TranscriptomeSAM --quantTranscriptomeBan $quantTranscriptomeBan --outFilterMultimapNmax $outFilterMultimapNmax --outFilterType $outFilterType --limitBAMsortRAM $limitBAMsortRAM --alignSJoverhangMin $alignSJoverhangMin --alignSJDBoverhangMin $alignSJDBoverhangMin --outFilterMismatchNmax $outFilterMismatchNmax --outFilterMismatchNoverReadLmax $outFilterMismatchNoverReadLmax --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --alignMatesGapMax $alignMatesGapMax --winAnchorMultimapNmax $winAnchorMultimapNmax $star_salmon_alignment_params + """ + } + } + + + /* +  * Salmon alignment-based mode - quantification +  */ + + process salmon_quantification_alignment_based_mode { + storeDir "${params.outdir}/salmon_alignment_mode" + tag "${sample}" + + label 'process_high' + + input: + file(transcriptome) from transcriptome_salmon_alignment_based_mode.collect() + set val(sample), file(bam_file) from salmon_quantify_alignment_based_mode + val(libtype) from libtype_salmon_alignment_mode + + output: + set val(sample_name), file("${sample_name}") into split_table_alignment_based + set val(sample_name), file("${sample_name}") into split_table_uniq_ambig_ab + file("${sample_name}") into salmon_files_to_combine_alignment_mode + file("${sample_name}") into multiqc_salmon_alignment_quant + set val(sample_name), file("${sample_name}") into collect_processed_read_counts_alignment_based + + script: + incompatPrior = params.incompatPrior + salmon_alignment_based_params = params.salmon_alignment_based_params + sample_name = sample.replaceFirst(/.fastq.gz|.fq.gz|.fastq|.fq/, "") + """ + salmon quant -p ${task.cpus} -t $transcriptome -l $libtype -a $bam_file --incompatPrior $incompatPrior -o $sample_name $salmon_alignment_based_params + """ + } + + + /* +  * Salmon alignment-based mode - split quantification tables into host and pathogen results +  */ + + process split_table_salmon_each_salmon_alignment_mode { + publishDir "${params.outdir}/salmon_alignment_mode/${sample_name}", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode/${sample_name}" + tag "split_quant_tbl_for_sal_modes" + + label 'process_high' + + input: + set val(sample_name), file ("salmon/*") from split_table_alignment_based + //set val(sample_name), file ("salmon/*") from split_table_alignment_based_uniq_ambig + file transcriptome_pathogen from transcriptome_pathogen_to_split_q_table_salmon_alignment_based + file transcriptome_host from transcriptome_host_to_split_q_table_salmon_alignment_based + + output: + set val(sample_name), file("host_quant.sf") into salmon_alignment_host_tximport + set val(sample_name), file("pathogen_quant.sf") + + script: + """ + $workflow.projectDir/bin/split_quant_tables_salmon.sh $transcriptome_pathogen $transcriptome_host salmon/*/quant.sf "quant.sf" + """ + } + + + + + if(params.generate_salmon_uniq_ambig) { + + /* + * Extract and combine the ambig and unique counts + */ + process extract_ambig_uniq_transcripts_genes_alignment_based { + publishDir "${params.outdir}/salmon_alignment_mode/${sample_name}/aux_info", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode/${sample_name}/aux_info" + tag "extract_ambig_uniq_transcripts_genes_AB ${sample_name}" + + label 'process_high' + + input: + set val(sample_name), file("salmon/*") from split_table_uniq_ambig_ab + file (annotations) from host_annotations_uniq_ambig_AB + + + output: + file "${sample_name}_host_quant_ambig_uniq.sf" + file "${sample_name}_pathogen_quant_ambig_uniq.sf" + file "${sample_name}_host_quant_ambig_uniq_gene_level.sf" + set val(sample_name), file("${sample_name}_host_quant_ambig_uniq.sf") into host_files_comb_uniq_ambig_AB + set val(sample_name), file("${sample_name}_pathogen_quant_ambig_uniq.sf") into path_files_comb_uniq_ambig_AB + + script: + """ + $workflow.projectDir/bin/salmon_extract_ambig_uniq_transcripts_genes.R salmon/*/quant.sf salmon/*/aux_info/ambig_info.tsv $sample_name $annotations + """ + } + + /* + * Combine the host ambig and unique counts + */ + process host_comb_ambig_uniq_alignment_based { + publishDir "${params.outdir}/salmon_alignment_mode", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode" + tag "host_comb_ambig_uniq_AB" + + label 'process_high' + + input: + file("salmon/*") from host_files_comb_uniq_ambig_AB.collect() + + output: + file "host_quant_combined_ambig_uniq.tsv" + + script: + """ + $workflow.projectDir/bin/salmon_host_comb_ambig_uniq.R salmon/*/aux_info/*_host_quant_ambig_uniq.sf + """ + } + + /* + * Combine the pathogen ambig and unique counts + */ + process pathogen_comb_ambig_uniq_alignment_based { + publishDir "${params.outdir}/salmon_alignment_mode", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode" + tag "pathogen_comb_ambig_uniq_AB" + + label 'process_high' + + input: + file("salmon/*") from path_files_comb_uniq_ambig_AB.collect() + + output: + file "pathogen_quant_combined_ambig_uniq.tsv" + + script: + """ + $workflow.projectDir/bin/salmon_pathogen_comb_ambig_uniq.R salmon/*/aux_info/*_pathogen_quant_ambig_uniq.sf + """ + } + } + + + /* +  * Tximport - host +  */ + + process tximport_host_salmon_alignment { + publishDir "${params.outdir}/salmon_alignment_mode/${sample_name}", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode/${sample_name}" + tag "tximport_host" + + label 'process_high' + + input: + set val(sample_name), file("salmon/${sample_name}/*") from salmon_alignment_host_tximport + file (annotations) from tximport_annotations_salmon_alignment + + output: + file "${sample_name}_host_quant_gene_level.sf" into salmon_files_to_combine_gene_level_alignment + + script: + """ + $workflow.projectDir/bin/tximport.R salmon $annotations $sample_name + """ + } + + + /* +  * Salmon alignment-based mode - Combine host gene level quantification results estimated with Tximport +  */ + + process combine_host_quant_gene_level_salmon_alignment { + publishDir "${params.outdir}/salmon_alignment_mode", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode" + tag "comb_host_quant_genes_sal_alig" + + label 'process_high' + + input: + file input_quantification from salmon_files_to_combine_gene_level_alignment.collect() + + output: + file "host_combined_gene_level.tsv" into quant_gene_level_host_add_annotations_salmon_alignment + + script: + """ + python $workflow.projectDir/bin/collect_quantification_data.py -i $input_quantification -q salmon -a gene_id -org host_gene_level + """ + } + + + /* +  * Salmon alignment-based mode - Combine quantification results +  */ + + process combine_quantification_tables_salmon_alignment_mode { + publishDir "${params.outdir}/salmon_alignment_mode", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode" + tag "combine_quantification_salmon" + + label 'process_high' + + input: + file input_quantification from salmon_files_to_combine_alignment_mode.collect() + val gene_attribute from host_atr_collect_data_salmon_alignment_mode + + output: + file "combined_quant.tsv" into split_table_salmon_salmon_alignment + + script: + """ + python $workflow.projectDir/bin/collect_quantification_data.py -i $input_quantification -q salmon -a $gene_attribute -org both + """ + } + + + /* +  * Salmon alignment-based mode - Split quantification tables into host and pathogen results +  */ + + + process split_quantification_tables_salmon_salmon_alignment_mode { + publishDir "${params.outdir}/salmon_alignment_mode", mode: params.publish_dir_mode + tag "split_quantification" + + label 'process_high' + + input: + file quant_table from split_table_salmon_salmon_alignment + file transcriptome_pathogen from transcriptome_pathogen_to_split_table_salmon_alignment + file transcriptome_host from transcriptome_host_to_split_table_salmon_alignment + + output: + file 'host_quant_salmon.tsv' into host_quantification_mapping_stats_salmon_alignment_based + file 'pathogen_quant_salmon.tsv' into pathogen_quantification_mapping_stats_salmon_alignment_based + file 'host_quant_salmon.tsv' into host_quantification_RNA_stats_salmon_alignment_based + file 'pathogen_quant_salmon.tsv' into pathogen_quantification_RNA_stats_salmon_alignment_based + file 'host_quant_salmon.tsv' into quant_host_add_annotations_salmon_alignment_based + file 'pathogen_quant_salmon.tsv' into quant_pathogen_add_annotations_alignment_based + file 'host_quant_salmon.tsv' into quant_scatter_plot_host_salmon_alignment_based + file 'pathogen_quant_salmon.tsv' into quant_scatter_plot_pathogen_salmon_alignment_based + env pathonen_tab into pathonen_tab into scatterplots_pathogen_salmon_alignment_based + env host_tab into host_tab into scatterplots_host_salmon_alignment_based + + script: + """ + $workflow.projectDir/bin/split_quant_tables_salmon.sh $transcriptome_pathogen $transcriptome_host $quant_table "quant_salmon.tsv" + pathonen_tab=\$(if [ \$(cat pathogen_quant_salmon.tsv | wc -l) -gt 1 ]; then echo "true"; else echo "false"; fi) + host_tab=\$(if [ \$(cat host_quant_salmon.tsv | wc -l) -gt 1 ]; then echo "true"; else echo "false"; fi) + """ + } + + + /* +  * Salmon alignment-based mode - Combine pathogen annotations with quantification results +  */ + + process combine_annotations_quant_pathogen_salmon_alignment_mode { + publishDir "${params.outdir}/salmon_alignment_mode", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode" + tag "comb_annots_quant_pathgn_salmn" + + label 'process_high' + + input: + file quantification_table from quant_pathogen_add_annotations_alignment_based + file annotation_table from annotation_pathogen_combine_quant_salmon_alignment_based + val attribute from combine_annot_quant_pathogen_salmon_alignment_based + + output: + file "pathogen_combined_quant_annotations.tsv" + + script: + """ + $workflow.projectDir/bin/combine_quant_annotations.py -q $quantification_table -annotations $annotation_table -a $attribute -org pathogen + """ + } + + /* +  * Salmon alignment-based mode - Combine host annotations with quantification results +  */ + + process combine_annotations_quant_host_salmon_alignment_mode { + publishDir "${params.outdir}/salmon_alignment_mode", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode" + tag "comb_annots_quant_host_salmn" + + label 'process_high' + + input: + file quantification_table from quant_host_add_annotations_salmon_alignment_based + file annotation_table from annotation_host_combine_quant_salmon_alignment_based + val attribute from combine_annot_quant_host_salmon_alignment_based + + output: + file "host_combined_quant_annotations.tsv" + + script: + """ + $workflow.projectDir/bin/combine_quant_annotations.py -q $quantification_table -annotations $annotation_table -a $attribute -org host + """ + } + + /* +  * Salmon alignment-based mode - Combine host annotations with gene-level estimates +  */ + + process combine_annotations_quant_gene_level_salmon_alignment_mode { + publishDir "${params.outdir}/salmon_alignment_mode", mode: params.publish_dir_mode + storeDir "${params.outdir}/salmon_alignment_mode" + tag "comb_annots_gene_host_salmn" + + label 'process_high' + + input: + file quantification_table from quant_gene_level_host_add_annotations_salmon_alignment + file annotation_table from annotation_host_combine_quant_gene_level_salmon_alignment + + output: + file "host_combined_quant_gene_level_annotations.tsv" + + script: + """ + $workflow.projectDir/bin/combine_annotations_salmon_gene_level.py -q $quantification_table -annotations $annotation_table -a gene_id -org host + """ + } + + + if(params.mapping_statistics) { + + /* +  * Extract processed reads from STAR log file +  */ + + process extract_processed_reads_STAR_for_salmon { + publishDir "${params.outdir}/mapping_statistics/STAR_for_salmon/processed_reads", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR_for_salmon/processed_reads" + tag "extract_processed_reads_STAR" + + label 'process_high' + + input: + set val(sample_name), file (Log_final_out) from collect_processed_read_counts_STAR_for_salmon + + output: + file "${sample_name}.txt" into collect_results_star_for_salmon + + script: + """ + $workflow.projectDir/bin/extract_processed_reads.sh $Log_final_out $sample_name star + """ + } + + /* +  * Collect STAR processed reads +  */ + + process collect_processed_reads_STAR_for_salmon { + publishDir "${params.outdir}/mapping_statistics/STAR_for_salmon", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR_for_salmon" + tag "collect_processed_reads_STAR" + + label 'process_high' + + input: + file process_reads from collect_results_star_for_salmon.collect() + + output: + file "processed_reads_star.tsv" into mapping_stats_total_processed_reads_alignment_for_salmon + + script: + """ + cat $process_reads > processed_reads_star.tsv + """ + } + + + /* +  * Salmon alignment-based mode - plot scatter plots of technical replicates for pathogen +  */ + + process scatter_plot_pathogen_salmon_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based/scatter_plots", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based/scatter_plots" + tag "scatter_plots_salmon_pathogen" + + label 'process_high' + + input: + file quant_table from quant_scatter_plot_pathogen_salmon_alignment_based + val attribute from atr_scatter_plot_pathogen_alignment + val replicates from repl_scatter_plots_salmon_alignment_pathogen + val pathogen_table_non_empty from scatterplots_pathogen_salmon_alignment_based + + output: + file ('*.pdf') + + when: + replicates.toBoolean() + pathogen_table_non_empty.toBoolean() + + script: + """ + python $workflow.projectDir/bin/scatter_plots.py -q $quant_table -a $attribute -org pathogen + """ + } + + + /* +  * Salmon alignment-based mode - plot scatter plots of technical replicates for host +  */ + + process scatter_plot_host_salmon_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based/scatter_plots", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based/scatter_plots" + tag "scatter_plots_salmon_host" + + label 'process_high' + + input: + file quant_table from quant_scatter_plot_host_salmon_alignment_based + val attribute from atr_scatter_plot_host_alignment + val replicates from repl_scatter_plots_salmon_alignment_host + val host_table_non_empty from scatterplots_host_salmon_alignment_based + + output: + file ('*.pdf') + + when: + replicates.toBoolean() + host_table_non_empty.toBoolean() + + script: + """ + python $workflow.projectDir/bin/scatter_plots.py -q $quant_table -a $attribute -org host + """ + } + + + /* +  * Salmon alignment-based mode - Extract processed reads from log file +  */ + + process extract_processed_reads_salmon_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based" + tag "extract_processed_reads" + + label 'process_high' + + input: + set val(sample_name), file ("salmon_alignment_mode/*") from collect_processed_read_counts_alignment_based + + output: + file "${sample_name}.txt" into collect_results_alignment_based + + script: + """ + $workflow.projectDir/bin/extract_processed_reads.sh salmon_alignment_mode/*/aux_info/meta_info.json $sample_name salmon_alignment + """ + } + + + /* +  * Salmon alignment-based mode - Collect processed reads +  */ + + process collect_processed_reads_salmon_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based" + tag "collect_processed_reads" + + label 'process_high' + + input: + file process_reads from collect_results_alignment_based.collect() + + output: + file "processed_reads_salmon_alignment.tsv" into mapping_stats_total_reads_alignment + script: + """ + cat $process_reads > processed_reads_salmon_alignment.tsv + """ + } + + + /* +  * Salmon alignment-based mode - Collect mapping stats +  */ + + process salmon_quantification_stats_salmon_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based" + tag "quantification_stats_salmon" + + label 'process_high' + + input: + file quant_table_host from host_quantification_mapping_stats_salmon_alignment_based + file quant_table_pathogen from pathogen_quantification_mapping_stats_salmon_alignment_based + val attribute from attribute_quant_stats_salmon_alignment + file total_processed_reads from mapping_stats_total_reads_alignment + file total_processed_reads_star from mapping_stats_total_processed_reads_alignment_for_salmon + file total_raw_reads from collect_total_reads_raw_salmon_alignment.ifEmpty('.') + + output: + file ('salmon_alignment_host_pathogen_total_reads.tsv') into salmon_mapped_stats_to_plot_alignment + + script: + """ + python $workflow.projectDir/bin/mapping_stats.py -q_p $quant_table_pathogen -q_h $quant_table_host -total_processed $total_processed_reads -total_raw $total_raw_reads -a $attribute --star_processed $total_processed_reads_star -t salmon_alignment -o salmon_alignment_host_pathogen_total_reads.tsv + """ + } + + + /* +  * Salmon alignment-based mode - Plot mapping stats +  */ + + + process plot_salmon_mapping_stats_host_pathogen_salmon_alignment_based { + tag "$name2" + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based" + + label 'process_high' + + input: + file(stats) from salmon_mapped_stats_to_plot_alignment + + output: + file "*.tsv" + file "*.pdf" + + script: + """ + python $workflow.projectDir/bin/plot_mapping_statistics_salmon_alignment.py -i $stats + """ + } + + + /* +  * Salmon alignment-based mode - Collect pathogen RNA class mapping stats +  */ + + process RNA_class_statistics_salmon_pathogen_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_pathogen", mode: params.publish_dir_mode + tag "rna_class_stats_pathogen" + + label 'process_high' + + input: + file quant_table from pathogen_quantification_RNA_stats_salmon_alignment_based + val attribute from host_annotations_RNA_class_stats_pathogen_alignment + file gene_annotations from pathogen_annotations_RNA_class_stats_salmon_alignment + + output: + file "pathogen_RNA_classes_percentage_*.tsv" into plot_RNA_stats_pathogen_alignment + file "pathogen_RNA_classes_percentage_*.tsv" into plot_RNA_stats_pathogen_combined_alignment + file "pathogen_RNA_classes_sum_counts_*.tsv" + stdout plot_RNA_stats_pathogen_alignment_boolean + stdout plot_RNA_stats_pathogen_combined_alignment_boolean + + shell: + ''' + python !{workflow.projectDir}/bin/RNA_class_content.py -q !{quant_table} -a !{attribute} -annotations !{gene_annotations} -q_tool salmon -org pathogen 2>&1 + ''' + } + + /* +  * Salmon alignment-based mode - Collect host RNA class mapping stats +  */ + + process RNA_class_statistics_salmon_host_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_host", mode: params.publish_dir_mode + tag "rna_class_stats_host" + + label 'process_high' + + input: + file quant_table from host_quantification_RNA_stats_salmon_alignment_based + val attribute from attribute_host_RNA_class_stats_alignment + file gene_annotations from host_annotations_RNA_class_stats_salmon_alignment + file rna_classes_to_replace from RNA_classes_to_replace_alignment + + output: + file "host_RNA_classes_percentage_*.tsv" into plot_RNA_stats_host_alignment + file "host_RNA_classes_percentage_*.tsv" into plot_RNA_stats_host_combined_alignment + file "host_RNA_classes_sum_counts_*.tsv" + file "host_gene_types_groups_*" + stdout plot_RNA_stats_host_alignment_boolean + stdout plot_RNA_stats_host_combined_alignment_boolean + + shell: + ''' + python !{workflow.projectDir}/bin/RNA_class_content.py -q !{quant_table} -a !{attribute} -annotations !{gene_annotations} -rna !{rna_classes_to_replace} -q_tool salmon -org host 2>&1 + ''' + } + + + /* +  * Salmon alignment-based mode - Plot pathogen RNA class mapping stats for each sample separately +  */ + + process plot_RNA_class_salmon_pathogen_each_alignment_based{ + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_pathogen", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_pathogen" + tag "plot_rna_class_stats_path_sal" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_pathogen_alignment + val plot_rna from plot_RNA_stats_pathogen_alignment_boolean + + output: + file "*.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_each.py -i $stats_table + """ + } + + /* +  * Salmon alignment-based mode - Plot host RNA class mapping stats for each sample separately +  */ + + process plot_RNA_class_salmon_host_each_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_host", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_host" + tag "plot_rna_class_stats_host_sal" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_host_alignment + val plot_rna from plot_RNA_stats_host_alignment_boolean + + output: + file "*.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_each.py -i $stats_table + """ + } + + + /* +  * Salmon alignment-based mode - Plot pathogen RNA class mapping stats for all samples +  */ + + process plot_RNA_class_salmon_pathogen_combined_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_pathogen", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_pathogen" + tag "plot_rna_class_stats_path_all" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_pathogen_combined_alignment + val plot_rna from plot_RNA_stats_pathogen_combined_alignment_boolean + + output: + file "RNA_class_stats_combined_pathogen.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_combined.py -i $stats_table -org pathogen + """ + } + + /* +  * Salmon alignment-based mode - Plot host RNA class mapping stats for all samples +  */ + + process plot_RNA_class_salmon_host_combined_alignment_based { + publishDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_host", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/salmon_alignment_based/RNA_classes_host" + tag "plot_rna_class_stats_host_all" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_host_combined_alignment + val plot_rna from plot_RNA_stats_host_combined_alignment_boolean + + output: + file "RNA_class_stats_combined_host.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_combined.py -i $stats_table -org host + """ + } + } +}else{ + Channel.empty() + .into {multiqc_star_for_salmon_alignment; multiqc_salmon_alignment_quant} +} + + + +/* + * STEP 8 - STAR + */ + + +if(params.run_star) { + + /* +  * STAR - build an index +  */ + + process STARindex { + publishDir "${params.outdir}/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/STAR" + tag "build_star_index" + + + label 'process_high' + + input: + file(fasta) from host_pathogen_fasta_star_index + file(gff) from gff_host_star_alignment_gff + + output: + file "index/*" into star_index_genome_alignment + + script: + sjdbOverhang = params.sjdbOverhang + star_index_params = params.star_index_params + """ + mkdir index + STAR --runThreadN ${task.cpus} --runMode genomeGenerate + --genomeDir index/ --genomeFastaFiles $fasta --sjdbGTFfile $gff + --sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent + --sjdbOverhang $sjdbOverhang $star_index_params + """ + } + + /* +  * STAR - alignment +  */ + + process ALIGNMENTstar { + tag "${sample_name}" + publishDir "${params.outdir}/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/STAR" + + label 'process_high' + + input: + set val(sample_name),file(reads) from trimming_results_star_htseq + file(gff) from gff_host_star_htseq_alignment_gff.collect() + file(index) from star_index_genome_alignment.collect() + + output: + set val(sample_name), file("${sample_name}/${sample_name}Aligned*.out.bam") into star_aligned_u_m + set val(sample_name), file("${sample_name}/${sample_name}Aligned*.out.bam") into alignment_unique_mapping_stats + set val(sample_name), file("${sample_name}/${sample_name}Aligned*.out.bam") into alignment_crossmapped_extract + file "${sample_name}/*" into multiqc_star_alignment + set val(sample_name), file("${sample_name}/${sample_name}Log.final.out") into collect_processed_read_counts_STAR + + script: + outSAMunmapped = params.outSAMunmapped + outSAMattributes = params.outSAMattributes + outFilterMultimapNmax = params.outFilterMultimapNmax + outFilterType = params.outFilterType + alignSJoverhangMin = params.alignSJoverhangMin + alignSJDBoverhangMin = params.alignSJDBoverhangMin + outFilterMismatchNmax = params.outFilterMismatchNmax + outFilterMismatchNoverReadLmax = params.outFilterMismatchNoverReadLmax + alignIntronMin = params.alignIntronMin + alignIntronMax = params.alignIntronMax + alignMatesGapMax = params.alignMatesGapMax + outWigType = params.outWigType + outWigStrand = params.outWigStrand + limitBAMsortRAM = params.limitBAMsortRAM + readFilesCommand = reads[0].toString().endsWith('.gz') ? "--readFilesCommand zcat" : '' + winAnchorMultimapNmax = params.winAnchorMultimapNmax + star_alignment_params = params.star_alignment_params + + if (params.single_end){ + """ + mkdir $sample_name + STAR --runThreadN ${task.cpus} --genomeDir . --sjdbGTFfile $gff $readFilesCommand + --readFilesIn $reads --outSAMtype BAM SortedByCoordinate + --outSAMunmapped $outSAMunmapped --outSAMattributes $outSAMattributes + --outWigType $outWigType --outWigStrand $outWigStrand --outFileNamePrefix $sample_name/$sample_name --sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent --outFilterMultimapNmax $outFilterMultimapNmax --outFilterType $outFilterType --limitBAMsortRAM $limitBAMsortRAM --alignSJoverhangMin $alignSJoverhangMin --alignSJDBoverhangMin $alignSJDBoverhangMin --outFilterMismatchNmax $outFilterMismatchNmax --outFilterMismatchNoverReadLmax $outFilterMismatchNoverReadLmax --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax --alignMatesGapMax $alignMatesGapMax --winAnchorMultimapNmax $winAnchorMultimapNmax $star_alignment_params + """ + } else { + """ + mkdir $sample_name + STAR --runThreadN ${task.cpus} --genomeDir . --sjdbGTFfile + $gff $readFilesCommand --readFilesIn ${reads[0]} ${reads[1]} + --outSAMtype BAM SortedByCoordinate --outSAMunmapped $outSAMunmapped + --outSAMattributes $outSAMattributes --outWigType $outWigType + --outWigStrand $outWigStrand --outFileNamePrefix $sample_name/$sample_name + --sjdbGTFfeatureExon exon --sjdbGTFtagExonParentTranscript Parent + --outFilterMultimapNmax $outFilterMultimapNmax --outFilterType $outFilterType + --limitBAMsortRAM $limitBAMsortRAM --alignSJoverhangMin $alignSJoverhangMin + --alignSJDBoverhangMin $alignSJDBoverhangMin --outFilterMismatchNmax $outFilterMismatchNmax + --outFilterMismatchNoverReadLmax $outFilterMismatchNoverReadLmax + --alignIntronMin $alignIntronMin --alignIntronMax $alignIntronMax + --alignMatesGapMax $alignMatesGapMax --winAnchorMultimapNmax $winAnchorMultimapNmax $star_alignment_params + """ + } + } + + + + if(params.mapping_statistics) { + + /* +  * STAR - Remove cross mapped reads from bam file + */ + + process remove_crossmapped_reads { + tag "$sample_name" + publishDir "${params.outdir}/STAR/multimapped_reads", mode: params.publish_dir_mode + storeDir "${params.outdir}/STAR/multimapped_reads" + + label 'process_high' + + input: + set val(sample_name), file(alignment) from alignment_crossmapped_extract + file(host_reference) from reference_host_names_crossmapped_find.collect() + file(pathogen_reference) from reference_pathogen_crossmapped_find.collect() + + + output: + set val(sample_name), file("${bam_file_without_crossmapped}") into alignment_multi_mapping_stats + file "${cross_mapped_reads}" into count_crossmapped_reads + + script: + bam_file_without_crossmapped = sample_name + "_no_crossmapped.bam" + cross_mapped_reads = sample_name + "_cross_mapped_reads.txt" + + if (params.single_end){ + """ + $workflow.projectDir/bin/remove_crossmapped_reads_BAM.sh $alignment $workflow.projectDir/bin $host_reference $pathogen_reference $cross_mapped_reads $bam_file_without_crossmapped + """ + } else { + """ + $workflow.projectDir/bin/remove_crossmapped_read_pairs_BAM.sh $alignment $workflow.projectDir/bin $host_reference $pathogen_reference $cross_mapped_reads $bam_file_without_crossmapped + """ + } + } + + + /* +  * STAR - Extract processed reads from log file + */ + + process extract_processed_reads_STAR { + publishDir "${params.outdir}/mapping_statistics/STAR/processed_reads", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR/processed_reads" + tag "extract_processed_reads_STAR" + + label 'process_high' + + input: + set val(sample_name), file (Log_final_out) from collect_processed_read_counts_STAR + + output: + file "${sample_name}.txt" into collect_results_star + + script: + """ + $workflow.projectDir/bin/extract_processed_reads.sh $Log_final_out $sample_name star + """ + } + + + /* +  * STAR - Collect STAR processed reads + */ + + process collect_processed_reads_STAR { + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + tag "collect_processed_reads_STAR" + + label 'process_high' + + input: + file process_reads from collect_results_star.collect() + + output: + file "processed_reads_star.tsv" into mapping_stats_total_processed_reads_alignment + file "processed_reads_star.tsv" into mapping_stats_htseq_total_processed_reads_alignment + + script: + """ + cat $process_reads > processed_reads_star.tsv + """ + } + + + /* +  * STAR - Count uniquely mapped reads + */ + + process unique_mapping_stats_STAR { + tag "$sample_name" + publishDir "${params.outdir}/mapping_statistics/STAR/uniquely_mapped", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR/uniquely_mapped" + + label 'process_high' + + input: + set val(sample_name), file(alignment) from alignment_unique_mapping_stats + file(host_reference_names) from reference_host_names_uniquelymapped.collect() + file(pathogen_reference_names) from reference_pathogen_names_uniquelymapped.collect() + + output: + file("${name}") into STAR_mapping_stats_unique + + shell: + name = sample_name + '_uniquely_mapped.txt' + if (params.single_end){ + ''' + !{workflow.projectDir}/bin/count_uniquely_mapped_reads.sh !{alignment} !{host_reference_names} !{pathogen_reference_names} !{sample_name} !{name} + ''' + } else { + ''' + !{workflow.projectDir}/bin/count_uniquely_mapped_read_pairs.sh !{alignment} !{host_reference_names} !{pathogen_reference_names} !{sample_name} !{name} + ''' + } + } + + /* +  * STAR - Collect uniquely mapped reads + */ + + process collect_stats_STAR_uniquely_mapped { + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + tag "collect_uniq_mapped_reads_STAR" + + label 'process_high' + + input: + file stats from STAR_mapping_stats_unique.collect() + + output: + file "uniquely_mapped_reads_star.tsv" into mapping_stats_uniquely_mapped_star + + script: + """ + python $workflow.projectDir/bin/combine_tables.py -i $stats -o uniquely_mapped_reads_star.tsv -s uniquely_mapped_reads + """ + } + + + /* +  * STAR - Count cross mapped reads + */ + + process count_crossmapped_reads { + tag "count_crossmapped_reads" + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + + label 'process_high' + + input: + file(cross_mapped_reads) from count_crossmapped_reads.collect() + + output: + file "cross_mapped_reads_sum.txt" into STAR_mapping_stats_cross_mapped + + script: + """ + $workflow.projectDir/bin/count_cross_mapped_reads.sh $cross_mapped_reads + """ + } + + + /* +  * STAR - Count multi-mapped reads (multi_mapped reads without cross_mapped reads ) +  */ + + process multi_mapping_stats { + tag "$sample_name" + publishDir "${params.outdir}/mapping_statistics/STAR/multi_mapped", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR/multi_mapped" + + label 'process_high' + + input: + set val(sample_name),file(alignment) from alignment_multi_mapping_stats + file(host_reference_names) from reference_host_names_multimapped.collect() + file(pathogen_reference_names) from reference_pathogen_names_multimapped.collect() + + output: + file("${name}") into STAR_mapping_stats_multi + + shell: + name = sample_name + '_multi_mapped.txt' + if (params.single_end){ + ''' + !{workflow.projectDir}/bin/count_multi_mapped_reads.sh !{alignment} !{host_reference_names} !{pathogen_reference_names} !{sample_name} !{name} + ''' + } else { + ''' + !{workflow.projectDir}/bin/count_multi_mapped_read_pairs.sh !{alignment} !{host_reference_names} !{pathogen_reference_names} !{sample_name} !{name} + ''' + } + } + + + /* +  * STAR - Collect multi-mapped reads +  */ + + process collect_stats_STAR_multi_mapped { + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + tag "collect_multi_mapped_reads_STAR" + + label 'process_high' + + input: + file stats from STAR_mapping_stats_multi.collect() + + output: + file "multi_mapped_reads_star.tsv" into mapping_stats_multi_mapped_star + + script: + """ + python $workflow.projectDir/bin/combine_tables.py -i $stats -o multi_mapped_reads_star.tsv -s multi_mapped_reads + """ + } + + + /* +  * STAR - Collect mapping stats +  */ + + process star_mapping_stats { + storeDir "${params.outdir}/mapping_statistics/STAR" + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + tag "star_mapping_stats" + + label 'process_high' + + input: + file total_raw_reads from collect_total_reads_raw_star.ifEmpty('.') + file total_processed_reads from mapping_stats_total_processed_reads_alignment + file uniquely_mapped_reads from mapping_stats_uniquely_mapped_star + file multi_mapped_reads from mapping_stats_multi_mapped_star + file cross_mapped_reads from STAR_mapping_stats_cross_mapped + + output: + file ('star_mapping_stats.tsv') into star_mapped_stats_to_plot + file ('star_mapping_stats.tsv') into mapping_stats_star_htseq_stats + + script: + """ + python $workflow.projectDir/bin/mapping_stats.py -total_raw $total_raw_reads -total_processed $total_processed_reads -m_u $uniquely_mapped_reads -m_m $multi_mapped_reads -c_m $cross_mapped_reads -t star -o star_mapping_stats.tsv + """ + } + + /* +  * STAR - Plot mapping stats +  */ + + process plot_star_mapping_stats { + tag "plot_star_mapping_stats" + publishDir "${params.outdir}/mapping_statistics/STAR", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/STAR" + + label 'process_high' + + input: + file(stats) from star_mapped_stats_to_plot + + output: + file "*.tsv" + file "*.pdf" + + script: + """ + python $workflow.projectDir/bin/plot_mapping_stats_star.py -i $stats + """ + } + } +}else{ + Channel.empty() + .set {multiqc_star_alignment} +} + + + +/* + * STEP 9 - HTSeq + */ + + +if(params.run_htseq_uniquely_mapped){ + + /* +  * HTSeq - quantification +  */ + + process HTseq_unique_mapping { + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/HTSeq" + tag "$sample_name" + + label 'process_high' + + input: + file(gff) from quantification_gff_u_m.collect() + set val(sample_name), file(st) from star_aligned_u_m + val(host_attribute) from host_gff_attribute_htseq + val(stranded) from stranded_htseq_unique + + + output: + file ("$name_file2") into htseq_files_to_combine + file ("$name_file2") into multiqc_htseq_unique + + script: + name_file2 = sample_name + "_count_u_m.txt" + host_attr = host_attribute + max_reads_in_buffer = params.max_reads_in_buffer + minaqual = params.minaqual + htseq_params = params.htseq_params + """ + htseq-count -n $task.cpus -t quant -f bam -r pos $st $gff -i $host_attr -s $stranded --max-reads-in-buffer=$max_reads_in_buffer -a $minaqual $htseq_params > $name_file2 + sed -i '1{h;s/.*/'"$sample_name"'/;G}' "$name_file2" + """ + } + + + /* +  * HTSeq - combine quantification results +  */ + + htseq_files_to_combine + .collect() + .set { htseq_result_quantification } + + process combine_quantification_tables_htseq_uniquely_mapped { + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/HTSeq" + tag "comb_quants_htseq_uniq_mapped" + + label 'process_high' + + input: + file input_quantification from htseq_result_quantification + val(host_attribute) from host_gff_attribute_htseq_combine + + output: + file "quantification_results_*.tsv" into htseq_result_quantification_TPM + + script: + """ + python $workflow.projectDir/bin/collect_quantification_data.py -i $input_quantification -q htseq -a $host_attribute + """ + } + + + /* +  * HTSeq - Calculate TPM values +  */ + + process htseq_uniquely_mapped_TPM { + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/HTSeq" + tag "htseq_uniquely_mapped_TPM" + + label 'process_high' + + input: + file input_quantification from htseq_result_quantification_TPM + val(host_attribute) from host_gff_attribute_htseq_TPM + file gff_host from gff_host_to_TPM + file gff_pathogen from gff_pathogen_to_TPM + + output: + file "quantification_results_uniquely_mapped_NumReads_TPM.tsv" into split_table_htseq_host + file "quantification_results_uniquely_mapped_NumReads_TPM.tsv" into split_table_htseq_pathogen + + script: + """ + $workflow.projectDir/bin/calculate_TPM_HTSeq.R $input_quantification $host_attribute $gff_pathogen $gff_host + """ + } + + + /* +  * HTSeq - Split quantification tables into host and pathogen results +  */ + + + process split_quantification_tables_htseq_uniquely_mapped { + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + tag "split_quants_uniq_mapped_host" + + label 'process_high' + + input: + file quant_table from split_table_htseq_host + file host_annotations from annotation_host_split_quant_htseq + file pathogen_annotations from annotation_pathogen_split_quant_htseq + + output: + file 'host_quantification_uniquely_mapped_htseq.tsv' into host_quantification_stats_htseq + file 'host_quantification_uniquely_mapped_htseq.tsv' into host_quantification_stats_htseq_total + file 'host_quantification_uniquely_mapped_htseq.tsv' into host_htseq_quantification_RNA_stats + file 'host_quantification_uniquely_mapped_htseq.tsv' into quant_host_add_annotations_htseq_u_m + file 'host_quantification_uniquely_mapped_htseq.tsv' into quant_scatter_plot_host_htseq_u_m + file 'pathogen_quantification_uniquely_mapped_htseq.tsv' into pathogen_quantification_stats_htseq + file 'pathogen_quantification_uniquely_mapped_htseq.tsv' into pathogen_quantification_stats_htseq_total + file 'pathogen_quantification_uniquely_mapped_htseq.tsv' into pathogen_htseq_quantification_RNA_stats + file 'pathogen_quantification_uniquely_mapped_htseq.tsv' into quant_pathogen_add_annotations_htseq_u_m + file 'pathogen_quantification_uniquely_mapped_htseq.tsv' into quant_scatter_plot_pathogen_htseq_u_m + env pathonen_tab into pathonen_tab into scatterplots_pathogen_htseq + env host_tab into host_tab into scatterplots_host_htseq + + script: + """ + $workflow.projectDir/bin/split_quant_tables.sh $quant_table $host_annotations $pathogen_annotations quantification_uniquely_mapped_htseq.tsv + pathonen_tab=\$(if [ \$(cat pathogen_quantification_uniquely_mapped_htseq.tsv | wc -l) -gt 1 ]; then echo "true"; else echo "false"; fi) + host_tab=\$(if [ \$(cat host_quantification_uniquely_mapped_htseq.tsv | wc -l) -gt 1 ]; then echo "true"; else echo "false"; fi) + """ + } + + + /* +  * HTSeq - Combine pathogen annotations with quantification results +  */ + + process combine_annotations_quant_pathogen_uniquely_mapped_host { + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/HTSeq" + tag "comb_annots_quant_pathogen" + + label 'process_high' + + input: + file quantification_table from quant_pathogen_add_annotations_htseq_u_m + file annotation_table from annotation_pathogen_combine_quant_htseq_u_m + val attribute from combine_annot_quant_pathogen_host_gff_attribute + + output: + file "pathogen_combined_quant_annotations.tsv" + + script: + """ + $workflow.projectDir/bin/combine_quant_annotations.py -q $quantification_table -annotations $annotation_table -a $attribute -org pathogen + """ + } + + + /* +  * HTSeq - Combine host annotations with quantification results +  */ + + process combine_annotations_quant_host_uniquely_mapped_host { + publishDir "${params.outdir}/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/HTSeq" + tag "comb_annots_quant_host" + + label 'process_high' + + input: + file quantification_table from quant_host_add_annotations_htseq_u_m + file annotation_table from annotation_host_combine_quant_htseq + val attribute from combine_annot_quant_pathogen_host_gff_attribute + + output: + file "host_combined_quant_annotations.tsv" + + script: + """ + $workflow.projectDir/bin/combine_quant_annotations.py -q $quantification_table -annotations $annotation_table -a $attribute -org host + """ + } + + + if(params.mapping_statistics) { + + /* +   * HTSeq - Plot scatter plots of technical replicates for pathogen +   */ + + process scatter_plot_pathogen_htseq { + publishDir "${params.outdir}/mapping_statistics/HTSeq/scatter_plots", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq/scatter_plots" + tag "scatter_plot_pathogen_htseq" + + label 'process_high' + + input: + file quant_table from quant_scatter_plot_pathogen_htseq_u_m + val attribute from atr_scatter_plot_pathogen_htseq_u_m + val replicates from repl_scatter_plots_htseq_pathogen + val pathogen_table_non_empty from scatterplots_pathogen_htseq + + output: + file ('*.pdf') + + when: + replicates.toBoolean() + pathogen_table_non_empty.toBoolean() + + script: + """ + python $workflow.projectDir/bin/scatter_plots.py -q $quant_table -a $attribute -org pathogen + """ + } + + + /* +   * HTSeq - Plot scatter plots of technical replicates for host +   */ + + + process scatter_plot_host_htseq { + publishDir "${params.outdir}/mapping_statistics/HTSeq/scatter_plots", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq/scatter_plots" + tag "scatter_plot_host_htseq" + + label 'process_high' + + input: + file quant_table from quant_scatter_plot_host_htseq_u_m + val attribute from atr_scatter_plot_host_htseq_u_m + val replicates from repl_scatter_plots_htseq_host + val host_table_non_empty from scatterplots_host_htseq + + output: + file ('*.pdf') + + when: + replicates.toBoolean() + host_table_non_empty.toBoolean() + + script: + """ + python $workflow.projectDir/bin/scatter_plots.py -q $quant_table -a $attribute -org host + """ + } + + + /* +   * HTSeq - Collect mapping and quantification stats +   */ + + + process htseq_quantification_stats_uniquely_mapped { + storeDir "${params.outdir}/mapping_statistics/HTSeq" + publishDir "${params.outdir}/mapping_statistics/HTSeq", mode: params.publish_dir_mode + tag "quantification_stats_htseq" + + label 'process_high' + + input: + file quant_table_host from host_quantification_stats_htseq_total + file quant_table_pathogen from pathogen_quantification_stats_htseq_total + val attribute from host_gff_attribute_mapping_stats_htseq + file star_stats from mapping_stats_star_htseq_stats + + output: + file ('htseq_uniquely_mapped_reads_stats.tsv') into htseq_mapped_stats_to_plot + + script: + """ + python $workflow.projectDir/bin/mapping_stats.py -q_p $quant_table_pathogen -q_h $quant_table_host -a $attribute -star $star_stats -t htseq -o htseq_uniquely_mapped_reads_stats.tsv + """ + } + + + /* +   * HTSeq - Plot mapping and stats +   */ + + process plot_mapping_stats_host_pathogen_htseq_uniquely_mapped{ + tag "$name2" + publishDir "${params.outdir}/mapping_statistics/HTSeq", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq" + + label 'process_high' + + input: + file(stats) from htseq_mapped_stats_to_plot + + output: + file "*.tsv" + file "*.pdf" + + script: + """ + python $workflow.projectDir/bin/plot_mapping_stats_htseq.py -i $stats + """ + } + + + /* +   * HTSeq - Collect pathogen RNA class mapping stats +   */ + + process RNA_class_statistics_htseq_uniquely_mapped_pathogen { + publishDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_pathogen", mode: params.publish_dir_mode + tag "rna_class_stats_htseq_pathogen" + + label 'process_high' + + input: + file quant_table from pathogen_htseq_quantification_RNA_stats + val attribute from host_gff_attribute_RNA_class_pathogen_htseq + file gene_annotations from pathogen_annotations_RNA_class_stats_htseq + + + output: + file "pathogen_RNA_classes_percentage_*.tsv" into plot_RNA_stats_pathogen_htseq_u_m + file "pathogen_RNA_classes_percentage_*.tsv" into plot_RNA_stats_pathogen_combined_htseq_u_m + file "pathogen_RNA_classes_sum_counts_*.tsv" + stdout plot_RNA_stats_pathogen_htseq_u_m_boolean + stdout plot_RNA_stats_pathogen_combined_htseq_u_m_boolean + + shell: + ''' + python !{workflow.projectDir}/bin/RNA_class_content.py -q !{quant_table} -a !{attribute} -annotations !{gene_annotations} -q_tool htseq -org pathogen 2>&1 + ''' + } + + + /* +   * HTSeq - Collect host RNA class mapping stats +   */ + + process RNA_class_statistics_htseq_uniquely_mapped_host { + publishDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_host", mode: params.publish_dir_mode + tag "rna_class_stats_htseq_host" + + label 'process_high' + + input: + file quant_table from host_htseq_quantification_RNA_stats + val attribute from host_gff_attribute_RNA_class_host_htseq + file gene_annotations from host_annotations_RNA_class_stats_htseq + file rna_classes_to_replace from RNA_classes_to_replace_htseq_uniquely_mapped + + output: + file "host_RNA_classes_percentage_*.tsv" into plot_RNA_stats_host_htseq_u_m + file "host_RNA_classes_percentage_*.tsv" into plot_RNA_stats_host_combined_htseq_u_m + file "host_RNA_classes_sum_counts_*.tsv" + file "host_gene_types_groups_*" + stdout plot_RNA_stats_host_htseq_u_m_boolean + stdout plot_RNA_stats_host_combined_htseq_u_m_boolean + + shell: + ''' + python !{workflow.projectDir}/bin/RNA_class_content.py -q !{quant_table} -a !{attribute} -annotations !{gene_annotations} -rna !{rna_classes_to_replace} -q_tool htseq -org host 2>&1 + ''' + } + + + /* +   * HTSeq - Plot pathogen RNA class mapping stats for each sample separately +   */ + + process plot_RNA_class_htseq_uniquely_mapped_pathogen_each{ + publishDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_pathogen", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_pathogen" + tag "plot_rna_stats_htseq_pathogen" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_pathogen_htseq_u_m + val plot_rna from plot_RNA_stats_pathogen_htseq_u_m_boolean + + output: + file "*.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_each.py -i $stats_table + """ + } + + + /* +   * HTSeq - Plot host RNA class mapping stats for each sample separately +   */ + + process plot_RNA_class_htseq_uniquely_mapped_host_each { + publishDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_host", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_host" + tag "plot_rna_stats_htseq_host" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_host_htseq_u_m + val plot_rna from plot_RNA_stats_host_htseq_u_m_boolean + + output: + file "*.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_each.py -i $stats_table + """ + } + + + /* +   * HTSeq - Plot pathogen RNA class mapping stats for all samples +   */ + + process plot_RNA_class_htseq_uniquely_mapped_pathogen_combined { + publishDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_pathogen", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_pathogen" + tag "plt_rna_stats_htseq_pathgn_all" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_pathogen_combined_htseq_u_m + val plot_rna from plot_RNA_stats_pathogen_combined_htseq_u_m_boolean + + output: + file "RNA_class_stats_combined_pathogen.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_combined.py -i $stats_table -org pathogen + """ + } + + /* +   * HTSeq - Plot host RNA class mapping stats for all samples +   */ + + process plot_RNA_class_htseq_uniquely_host_combined { + publishDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_host", mode: params.publish_dir_mode + storeDir "${params.outdir}/mapping_statistics/HTSeq/RNA_classes_host" + tag "plt_rna_stats_htseq_host_all" + + label 'process_high' + + input: + file stats_table from plot_RNA_stats_host_combined_htseq_u_m + val plot_rna from plot_RNA_stats_host_combined_htseq_u_m_boolean + + output: + file "RNA_class_stats_combined_host.pdf" + + when: + plot_rna.toBoolean() + + script: + """ + python $workflow.projectDir/bin/plot_RNA_class_stats_combined.py -i $stats_table -org host + """ + } + +} +}else{ + Channel.empty() + .set {multiqc_htseq_unique} +} + + + + +process multiqc { + publishDir "${params.outdir}/MultiQC", mode: params.publish_dir_mode + + input: + file (multiqc_config) from ch_multiqc_config + file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([]) + file ('fastqc/*') from ch_fastqc_results.last().collect().ifEmpty([]) + file ('fastqc_after_trimming/*') from ch_fastqc_trimmed_results.last().collect().ifEmpty([]) + file ('salmon/*') from multiqc_salmon_quant.collect().ifEmpty([]) + file ('salmon_alignment_mode/*') from multiqc_salmon_alignment_quant.collect().ifEmpty([]) + file ('STAR/*') from multiqc_star_alignment.collect().ifEmpty([]) + file ('STAR_for_salmon/*') from multiqc_star_for_salmon_alignment.collect().ifEmpty([]) + file ('uniquely_mapped/*') from multiqc_htseq_unique.collect().ifEmpty([]) + file ('software_versions/*') from ch_software_versions_yaml.collect() + file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") + + output: + file "*multiqc_report.html" into ch_multiqc_report + file "*_data" + file "multiqc_plots" + + script: + rtitle = custom_runName ? "--title \"$custom_runName\"" : '' + rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' + """ + multiqc -d --export -f $rtitle $rfilename $custom_config_file . + """ +} + + +/* + * STEP 11 - Output Description HTML + */ + + +process output_documentation { + publishDir "${params.outdir}/pipeline_info", mode: params.publish_dir_mode + + input: + file output_docs from ch_output_docs + + output: + file "results_description.html" +// file images from ch_output_docs_images + + script: + """ + markdown_to_html.py $output_docs -o results_description.html + """ +} + + + + + + +/* + * Completion e-mail notification + */ +workflow.onComplete { + + // Set up the e-mail variables + def subject = "[nf-core/dualrnaseq] Successful: $workflow.runName" + if (!workflow.success) { + subject = "[nf-core/dualrnaseq] FAILED: $workflow.runName" + } + def email_fields = [:] + email_fields['version'] = workflow.manifest.version + email_fields['runName'] = custom_runName ?: workflow.runName + email_fields['success'] = workflow.success + email_fields['dateComplete'] = workflow.complete + email_fields['duration'] = workflow.duration + email_fields['exitStatus'] = workflow.exitStatus + email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + email_fields['errorReport'] = (workflow.errorReport ?: 'None') + email_fields['commandLine'] = workflow.commandLine + email_fields['projectDir'] = workflow.projectDir + email_fields['summary'] = summary + email_fields['summary']['Date Started'] = workflow.start + email_fields['summary']['Date Completed'] = workflow.complete + email_fields['summary']['Pipeline script file path'] = workflow.scriptFile + email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId + if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository + if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId + if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision + email_fields['summary']['Nextflow Version'] = workflow.nextflow.version + email_fields['summary']['Nextflow Build'] = workflow.nextflow.build + email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + // On success try attach the multiqc report + def mqc_report = null + try { + if (workflow.success) { + mqc_report = ch_multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList) { + log.warn "[nf-core/dualrnaseq] Found multiple reports from process 'multiqc', will use only one" + mqc_report = mqc_report[0] + } + } + } catch (all) { + log.warn "[nf-core/dualrnaseq] Could not attach MultiQC report to summary email" + } + + // Check if we are only sending emails on failure + email_address = params.email + if (!params.email && params.email_on_fail && !workflow.success) { + email_address = params.email_on_fail + } + + // Render the TXT template + def engine = new groovy.text.GStringTemplateEngine() + def tf = new File("$projectDir/assets/email_template.txt") + def txt_template = engine.createTemplate(tf).make(email_fields) + def email_txt = txt_template.toString() + + // Render the HTML template + def hf = new File("$projectDir/assets/email_template.html") + def html_template = engine.createTemplate(hf).make(email_fields) + def email_html = html_template.toString() + + // Render the sendmail template + def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ] + def sf = new File("$projectDir/assets/sendmail_template.txt") + def sendmail_template = engine.createTemplate(sf).make(smail_fields) + def sendmail_html = sendmail_template.toString() + + // Send the HTML e-mail + if (email_address) { + try { + if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } + // Try to send HTML e-mail using sendmail + [ 'sendmail', '-t' ].execute() << sendmail_html + log.info "[nf-core/dualrnaseq] Sent summary e-mail to $email_address (sendmail)" + } catch (all) { + // Catch failures and try with plaintext + def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] + if ( mqc_report.size() <= params.max_multiqc_email_size.toBytes() ) { + mail_cmd += [ '-A', mqc_report ] + } + mail_cmd.execute() << email_html + log.info "[nf-core/dualrnaseq] Sent summary e-mail to $email_address (mail)" + } + } + + // Write summary e-mail HTML to a file + def output_d = new File("${params.outdir}/pipeline_info/") + if (!output_d.exists()) { + output_d.mkdirs() + } + def output_hf = new File(output_d, "pipeline_report.html") + output_hf.withWriter { w -> w << email_html } + def output_tf = new File(output_d, "pipeline_report.txt") + output_tf.withWriter { w -> w << email_txt } + + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_red = params.monochrome_logs ? '' : "\033[0;31m"; + c_reset = params.monochrome_logs ? '' : "\033[0m"; + + if (workflow.stats.ignoredCount > 0 && workflow.success) { + log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-" + log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-" + log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-" + } + + if (workflow.success) { + log.info "-${c_purple}[nf-core/dualrnaseq]${c_green} Pipeline completed successfully${c_reset}-" + } else { + checkHostname() + log.info "-${c_purple}[nf-core/dualrnaseq]${c_red} Pipeline completed with errors${c_reset}-" + } + +} + + +def nfcoreHeader() { + // Log colors ANSI codes + c_black = params.monochrome_logs ? '' : "\033[0;30m"; + c_blue = params.monochrome_logs ? '' : "\033[0;34m"; + c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; + c_dim = params.monochrome_logs ? '' : "\033[2m"; + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_reset = params.monochrome_logs ? '' : "\033[0m"; + c_white = params.monochrome_logs ? '' : "\033[0;37m"; + c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; + + return """ -${c_dim}--------------------------------------------------${c_reset}- + ${c_green},--.${c_black}/${c_green},-.${c_reset} + ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} + ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} + ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} + ${c_green}`._,._,\'${c_reset} + ${c_purple} nf-core/dualrnaseq v${workflow.manifest.version}${c_reset} + -${c_dim}--------------------------------------------------${c_reset}- + """.stripIndent() +} + +def checkHostname() { + def c_reset = params.monochrome_logs ? '' : "\033[0m" + def c_white = params.monochrome_logs ? '' : "\033[0;37m" + def c_red = params.monochrome_logs ? '' : "\033[1;91m" + def c_yellow_bold = params.monochrome_logs ? '' : "\033[1;93m" + if (params.hostnames) { + def hostname = "hostname".execute().text.trim() + params.hostnames.each { prof, hnames -> + hnames.each { hname -> + if (hostname.contains(hname) && !workflow.profile.contains(prof)) { + log.error "====================================================\n" + + " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + + " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + + " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + + "============================================================" + } + } + } + } +} \ No newline at end of file