working on star subworkflow

nf-core · blazejszczerba · Dec 10, 2023 · Dec 13, 2023 · Dec 23, 2023 · Dec 23, 2023
commit 364451c181d5192dc93b6639a725003e0c0f95ff
diff --git a/bin/combine_tables.py b/bin/combine_tables.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Dec  6 09:26:36 2019
+
+@author: B.Mika-Gospodorz
+
+
+Input files: list of .txt files with mapping statistics for different samples generated with count_multi_mapped_reads.sh, count_uniquely_mapped_read_pairs, count_uniquely_mapped_read_pairs.sh or count_multi_mapped_read_pairs.sh
+Output file: .tsv file with combined statistics from all samples
+Description: Used to collect mapping statistics from all samples
+"""
+
+
+import argparse
+
+import pandas as pd
+
+
+# function to combine quantification results
+def combine_stat_tables(input_files, suffix):
+    combined_tables = pd.DataFrame()  # initialize data frame
+    for input_file in input_files:  # iterate over input files
+        # read file
+        table_read = pd.read_csv(input_file, sep=" ", header=None, index_col=1)
+        # extract sample name
+        sample_name = str(table_read[0][0])
+        table_read = table_read.drop(0, 1)
+        table_read.columns = [sample_name]
+        # add new table to combined_tables data frame
+        combined_tables = pd.concat([combined_tables, table_read], axis=1)
+    # transpose table to have 'host' and 'pathogen' labels as column names
+    combined_tables = combined_tables.transpose()
+    combined_tables.columns = ["host_" + suffix, "pathogen_" + suffix]
+    return combined_tables
+
+
+parser = argparse.ArgumentParser(description="""Combine mapping statistis from all samples""")
+parser.add_argument(
+    "-i",
+    "--input_files",
+    metavar="<input_files>",
+    nargs="+",
+    default="*.txt",
+    help="Path to mapping statistic results ",
+)
+parser.add_argument("-o", "--output_dir", metavar="<output>", help="output dir", default=".")
+parser.add_argument("-s", "--suffix", metavar="<suffix>", help="uniquely_mapped/multi_mapped", default=".")
+args = parser.parse_args()
+
+# combine results
+combineTables = combine_stat_tables(args.input_files, args.suffix)
+
+# save results
+combineTables.to_csv(args.output_dir, sep="\t")
diff --git a/bin/count_cross_mapped_reads.sh b/bin/count_cross_mapped_reads.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+#-------------------------
+#
+# Description: 
+# Script to count cross mapped reads in *_cross_mapped_reads.txt file created by remove_crossmapped_reads_BAM.sh
+#
+# Created by B. Mika-Gospodorz
+# Date: 27th May 2020
+#
+# Input files:  list of *_cross_mapped_reads.txt files
+# Output: cross_mapped_reads_sum.txt
+#
+#-------------------------
+
+
+
+for i in "$@" # iterate over .txt files
+do
+lines=$(wc -l $i | awk '{print $1}')  # count number of lines
+echo -n -e "$i\t$lines\n" >> cross_mapped_reads_sum.txt  # appends to cross_mapped_reads_sum.txt
+done
+
+
diff --git a/bin/count_multi_mapped_read_pairs.sh b/bin/count_multi_mapped_read_pairs.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#-------------------------
+#
+# Description: 
+# Script to count multi mapped reads (paired-end reads) from .bam file
+#
+# Created by B. Mika-Gospodorz
+# Date: 17th July 2020
+#
+# Input files:  $1 bam file 
+# 		$2 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh
+# 		$3 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh
+# 		$4 sample name
+# 		$5 output file name *_multi_mapped.txt
+# Output: *_multi_mapped.txt file with list of multi-mapped reads
+#
+#-------------------------
+
+
+bam_file=$1
+host_reference_names=$2
+pathogen_reference_names=$3
+sample_name=$4
+out_name=$5
+
+
+# count reads that mapped onto multiple positions of host genome (-f 66 to extract read mapped in proper pair and first in pair) 
+host=$(samtools view -f 66 -h $bam_file | grep -f $host_reference_names | fgrep -wv NH:i:1 | fgrep -w HI:i:1 | echo "$sample_name host `wc -l`")
+# count reads that mapped onto multiple positions of pathogen genome 
+pathogen=$(samtools view -f 66 -h $bam_file | grep -f $pathogen_reference_names | fgrep -wv NH:i:1 | fgrep -w HI:i:1 | echo "$sample_name pathogen `wc -l`")
+# combine information about host and pathogen multi-mapped reads
+counts="${host}\n${pathogen}" 
+# save into file
+echo -e $counts > $out_name
+
+
diff --git a/bin/count_multi_mapped_reads.sh b/bin/count_multi_mapped_reads.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+#-------------------------
+#
+# Description: 
+# Script to count multi mapped reads in .bam file
+#
+# Created by B. Mika-Gospodorz
+# Date: 27th May 2020
+#
+# Input files:  $1 bam file 
+# 		$2 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh
+# 		$3 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh
+# 		$4 sample name
+# 		$5 output file name
+# Output: *_multi_mapped.txt file with list of multi-mapped reads
+#
+#-------------------------
+
+
+bam_file=$1
+host_reference_names=$2
+pathogen_reference_names=$3
+sample_name=$4
+out_name=$5
+
+# count reads that mapped onto multiple positions of host genome (-F 4 to extract mapped reads) 
+host=$(samtools view -F 4 -h $bam_file | grep -f $host_reference_names | fgrep -wv NH:i:1 | fgrep -w HI:i:1 | echo "$sample_name host `wc -l`")
+# count reads that mapped onto multiple positions of pathogen genome 
+pathogen=$(samtools view -F 4 -h $bam_file | grep -f $pathogen_reference_names | fgrep -wv NH:i:1 | fgrep -w HI:i:1 | echo "$sample_name pathogen `wc -l`")
+# combine information about host and pathogen multi-mapped reads
+counts="${host}\n${pathogen}" 
+# save into file
+echo -e $counts > $out_name
+
+
+
diff --git a/bin/count_uniquely_mapped_read_pairs.sh b/bin/count_uniquely_mapped_read_pairs.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+#-------------------------
+#
+# Description: 
+# Script to count multi mapped reads (paired-end reads) from .bam file
+#
+# Created by B. Mika-Gospodorz
+# Date: 17th July 2020
+#
+# Input files:  $1 bam file 
+# 		$2 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh
+# 		$3 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh
+# 		$4 sample name
+# 		$5 output file name *_multi_mapped.txt
+# Output: *_uniquely_mapped.txt file with list of uniquely mapped reads
+#-------------------------
+
+bam_file=$1
+host_reference_names=$2
+pathogen_reference_names=$3
+sample_name=$4
+out_name=$5
+
+# count reads that mapped uniquely onto host genome (-f 66 to extract read mapped in proper pair and first in pair) 
+host=$(samtools view -f 66 -h $bam_file | grep -f $host_reference_names | fgrep -w NH:i:1 | echo "$sample_name host `wc -l`")
+# count reads that mapped uniquely onto pathogen genome 
+pathogen=$(samtools view -f 66 -h $bam_file | grep -f $pathogen_reference_names | fgrep -w NH:i:1 | echo "$sample_name pathogen `wc -l`")
+# combine information about host and pathogen uniquely mapped reads
+counts="${host}\n${pathogen}" 
+# save into file
+echo -e $counts > $out_name
diff --git a/bin/count_uniquely_mapped_reads.sh b/bin/count_uniquely_mapped_reads.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+#-------------------------
+#
+# Description: 
+# Script to count multi mapped reads in .bam file
+#
+# Created by B. Mika-Gospodorz
+# Date: 27th May 2020
+#
+# Input files:  $1 bam file 
+# 		$2 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh
+# 		$3 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh
+# 		$4 sample name
+# 		$5 output file name
+# Output: *_uniquely_mapped.txt file with list of uniquely mapped reads
+#
+#-------------------------
+
+bam_file=$1
+host_reference_names=$2
+pathogen_reference_names=$3
+sample_name=$4
+out_name=$5
+
+# count reads that mapped uniquely onto host genome (-F 4 to extract mapped reads) 
+host=$(samtools view -F 4 -h $bam_file | grep -f $host_reference_names | fgrep -w NH:i:1 | echo "$sample_name host `wc -l`")
+# count reads that mapped uniquely onto pathogen genome 
+pathogen=$(samtools view -F 4 -h $bam_file | grep -f $pathogen_reference_names | fgrep -w NH:i:1 | echo "$sample_name pathogen `wc -l`")
+# combine information about host and pathogen uniquely mapped reads
+counts="${host}\n${pathogen}" 
+# save into file
+echo -e $counts > $out_name
+
+
diff --git a/bin/extract_crossmapped_reads.py b/bin/extract_crossmapped_reads.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep  2 14:37:04 2019
+
+@author: B.Mika-Gospodorz
+
+Input files: stdin with bam file with multi-mapped reads, reference_host_names.txt and reference_pathogen_names.txt files that contain references extracted with extract_reference_names_from_fasta_files.sh
+Output: txt file with cross-mapped reads
+Description: Used to identify and extract reads that mapped onto both host and pathogen genomes (cross-mapped reads). The script is executed by remove_crossmapped_reads_BAM.sh and remove_crossmapped_read_paires_BAM.sh scripts.
+"""
+
+import argparse
+import sys
+
+import pysam
+
+
+# function to identify references given read mapped to
+def check_reference_organisms(read_reference_name, host_reference_names, pathogen_reference_names):
+    reference = ""
+    if (
+        read_reference_name in host_reference_names
+    ):  # if reference of read is in the list of host references, set host as reference
+        reference = "host"
+    elif (
+        read_reference_name in pathogen_reference_names
+    ):  # if reference of read is in the list of pathogen references, set pathogen as reference
+        reference = "pathogen"
+    else:
+        print(("There is no " + read_reference_name + " in the reference name set"))
+    return reference
+
+
+# function to add read and its reference to dictionary
+def add_read(multimapped_reads, read_name, reference_name):
+    if read_name in multimapped_reads:  # if read is in the dict, and reference is not defined, append the reference
+        if reference_name not in multimapped_reads[read_name]:
+            multimapped_reads[(read_name)].append((reference_name))
+    else:  # else create new key (read) and set reference as value
+        multimapped_reads[read_name] = [reference_name]
+
+
+# function to find and save cross-mapped reads
+def find_and_save_cross_mapped_reads(multimapped_reads_with_reference, output_file_name):
+    # extract reads with more than 1 reference name defined (cross-mapped reads)
+    crossmapeed_reads = [
+        read_name
+        for read_name, reference_name in list(multimapped_reads_with_reference.items())
+        if len(reference_name) > 1
+    ]
+    # save cross-mapped reads
+    with open(output_file_name, "w") as f:
+        for cross_mapped_read in crossmapeed_reads:
+            f.write(str(cross_mapped_read) + "\n")
+
+
+# function to identify and save cross-mapped reads
+def read_reads_from_samfile(sam_file_name, host_reference_names, pathogen_reference_names, output_file_name):
+    # read bam file
+    samfile = pysam.AlignmentFile(sam_file_name, "rb")
+    # initialize dictionary of multi-mapped reads
+    multimapped_reads = dict()
+    for read in samfile:  # iterate over reads from bam file
+        # find reference the read mapped to
+        reference_organisms = check_reference_organisms(
+            read.reference_name, host_reference_names, pathogen_reference_names
+        )
+        # add read and reference to multimapped_reads dict.
+        add_read(multimapped_reads, read.query_name, reference_organisms)
+    # find and save cross-mapped reads
+    find_and_save_cross_mapped_reads(multimapped_reads, output_file_name)
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-o", "--output", default="cross_mapped_reads.txt", metavar="output_file_name", help="output file name"
+)
+parser.add_argument(
+    "-h_ref", "--host_reference_names", metavar="<host_reference_name>", help="Path to reference_host_names.txt file"
+)
+parser.add_argument(
+    "-p_ref",
+    "--pathogen_reference_names",
+    metavar="<pathogen_reference_name>",
+    help="Path to reference_pathogen_names.txt file",
+)
+args = parser.parse_args()
+
+# create list of host and pathogen chromosome/plasmid names
+host_reference_names = [line.rstrip() for line in open(args.host_reference_names)]
+pathogen_reference_names = [line.rstrip() for line in open(args.pathogen_reference_names)]
+
+
+# identify and extract cross-mapped reads
+read_reads_from_samfile(sys.stdin, host_reference_names, pathogen_reference_names, args.output)
diff --git a/bin/extract_reference_names_from_fasta_files.sh b/bin/extract_reference_names_from_fasta_files.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+#-------------------------
+#
+# Description: 
+# Script to extract chromosome and plasmid names from genome fasta files
+#
+# Created by B. Mika-Gospodorz
+# Date: 5th September 2019
+#
+# Input files:  $1 name of output file, eg. reference_host_names.txt
+# 		$@ list of fasta files
+# Output: file with a given name defined in the first argument that contains names of chromosomes or plasmids defined in the fasta files
+#
+#-------------------------
+
+# assign output file name to variable 
+output_name=$1
+shift
+
+for file in "$@" #iterate over fasta files
+do   
+   grep ">" ${file} | awk -F" " '{print $1}' | sed 's/>//' >> ${output_name}  # extract name followed '>' character and append to output file 
+done
diff --git a/bin/remove_crossmapped_read_pairs_BAM.sh b/bin/remove_crossmapped_read_pairs_BAM.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+#-------------------------
+#
+# Description: 
+# Script to remove cross-mapped reads from bam file 
+#
+# Created by B. Mika-Gospodorz
+# Date: 17th July 2020
+#
+# Input files:  $1 bam file 
+#		$2 directory where the extract_crossmapped_reads.py script is stored
+# 		$3 reference_host_names.txt file that contains host references extracted with extract_reference_names_from_fasta_files.sh
+# 		$4 reference_pathogen_names.txt file that contains pathogen references extracted with extract_reference_names_from_fasta_files.sh
+# 		$5 output file name for list of cross-mapped reads, e.g. *_cross_mapped_reads.txt
+# 		$6 output file name for bam file with removed cross-mapped reads, e.g. *_no_crossmapped.bam
+# Output: *_cross_mapped_reads.txt, *_no_crossmapped.bam
+#
+#-------------------------
+
+
+alignment=$1
+extract_crossmapped_reads_script_path=$2
+host_reference=$3
+pathogen_reference=$4
+out_name=$5
+
+# extract multi-mapped reads from bam file (-f 0x2 - read mapped in proper pair) and run extract_crossmapped_reads.py to extract reads that mapped onto both host and pathogen genomes simultaneously
+samtools view -f 0x2 -h $alignment | fgrep -vw NH:i:1 | python $extract_crossmapped_reads_script_path/extract_crossmapped_reads.py -h_ref $host_reference -p_ref $pathogen_reference -o $out_name
+
+cross_mapped_fragments=$out_name
+out_bam_name=$6
+
+# remove cross-mapped reads from bam file and create a new one
+samtools view -h $alignment | fgrep -wvf $cross_mapped_fragments | samtools view -bS -o $out_bam_name -