diff --git a/.gitignore b/.gitignore index 9bea433..ee614e8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,7 @@ .DS_Store +lib/TBtools_specifications.pm +MANUAL.md.backup +opt/GenomeAnalysisTK.jar +README.md.backup +Thumbs.db diff --git a/MANUAL.md b/MANUAL.md index 59342b3..d971eb5 100644 --- a/MANUAL.md +++ b/MANUAL.md @@ -92,12 +92,6 @@ MTBseq can then be installed with: `conda install -c bioconda mtbseq` -Due to license restrictions, even bioconda cannot install the dependency GenomeAnalysisTK 3.8 directly. After installation of MTBseq to fully install the GATK, you must download a licensed copy of the GenomeAnalysisTK 3.8 from the Broad Institute ([GATK Version 3.8](https://software.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=3.8-0-ge9d806836)), and call - -`gatk3-register /path/to/GenomeAnalysisTK[-$PKG_VERSION.tar.bz2|.jar]` - - which will copy GATK into your conda environment. - **SOURCE** @@ -286,9 +280,7 @@ functional modules and supplying parameters used in the analysis. --basecalib This OPTION specifies a file for base quality recalibration. The list must be in VCF format and should contain known SNPs. Give the full path to the file. The required structure of the file can be seen here: /MTBseq_source/var/res/MTB_Base_Calibration_List.vcf - --all_vars This OPTION is used in the modules TBvariants, TBstats, TBjoin, and TBstrains. By default, the OPTION is not active. Setting this OPTION will skip all filtering steps and report the calculated information for all positions in the input file. - - --snp_vars This OPTION is used in TBvariants, TBstats, TBjoin, and TBstrains. By default, the OPTION is not active. Setting this OPTION will add an additional filter that excludes all variants except SNPs. + --snp_vars This OPTION is used in TBvariants and TBjoin. By default, the OPTION is not active. Setting this OPTION will add an additional filter that excludes all variants except SNPs. TBvariants could be run without the option and the filter can only be activated for TBjoin afterwards. --lowfreq_vars This OPTION is used in TBvariants, TBstats, TBjoin, and TBstrains. By default, the OPTION is not active. Setting this OPTION has major implications on how the mapping data for each position is processed. By default, the majority allele is called and taken for further calculations. If the --lowfreq_vars OPTION is set, MTBseq will consider the majority allele distinct from wild type, if such an allele is present. This means that only in this detection mode, MTBseq will report variants present only in subpopulations, i.e. low frequency mutations. Of course, OPTIONS --mincovf, --mincovr, --minphred20, and --minfreq need to be set accordingly. Please be aware that output generated in this detection mode should not be used for phylogenetic analysis. @@ -311,7 +303,7 @@ functional modules and supplying parameters used in the analysis. --quiet This OPTION turns off the display logging function and will report the logging only in a file, called "MTBseq_[DATE]_[USER].log". - --threads This OPTION is used in TBbwa, TBmerge, TBrefine, TBpile and TBlist. By default, the OPTION is set to 1. The OPTION sets the maximum number of CPUs to use within the pipeline. You can use more than one core in order to execute the pipeline faster. 8 is the current maximum. + --threads This OPTION is used in TBbwa, TBmerge, TBrefine, TBpile and TBlist. By default, the OPTION is set to 1. The OPTION sets the maximum number of CPUs to use within the pipeline. You can use more than one core in order to execute the pipeline faster. --help This OPTION will show you all available OPTIONs and corresponding VALUEs used by MTBseq. diff --git a/MTBseq b/MTBseq index ecd7ca8..a8c360d 100644 --- a/MTBseq +++ b/MTBseq @@ -20,7 +20,7 @@ use TBgroups; use TBtools; -my $VERSION = "1.0.4"; +my $VERSION = "1.1.0"; # get current working directory and time. my $W_dir = getcwd(); @@ -320,7 +320,7 @@ my @amend_files_new; my @group_files; my @samples; my $sample_number; -my $output_mode = $all_vars . $snp_vars . $lowfreq_vars; + if(-f $samples) { open(IN,"$samples") || die print $logprint "\t",timer(),"\tCan't find $samples file! MTBseq.pl line: ", __LINE__ ," \n"; while(my $line = ) { @@ -536,14 +536,14 @@ if($step eq 'TBvariants') { opendir(POSDIR,"$POS_OUT") || die print $logprint "\t",timer(),"\tCan\'t open directory $POS_OUT: MTBseq.pl line: ", __LINE__ ," \n"; opendir(CALLDIR,"$CALL_OUT") || die print $logprint "\t",timer(),"\tCan\'t open directory $CALL_OUT: MTBseq.pl line: ", __LINE__ ," \n"; @pos_files = grep { $_ =~ /^\w.*\.gatk_position_table\.tab$/ && -f "$POS_OUT/$_" } readdir(POSDIR); -@var_files = grep { $_ =~ /^\w.*\.gatk_position_variants_cf$micovf\_cr$micovr\_fr$mifreq\_ph$miphred20\_outmode$output_mode\.tab$/ && -f "$CALL_OUT/$_" } readdir(CALLDIR); +@var_files = grep { $_ =~ /^\w.*\.gatk_position_variants_cf$micovf\_cr$micovr\_fr$mifreq\_ph$miphred20\_outmode[01]$snp_vars$lowfreq_vars\.tab$/ && -f "$CALL_OUT/$_" } readdir(CALLDIR); closedir(POSDIR); closedir(CALLDIR); if(scalar(@pos_files) == 0) { print $logprint "\n\t",timer(),"\tNo files to call variants! Check content of $POS_OUT!\n"; exit 1; } -%check_up = map { (my $id = $_) =~ s/\.gatk_position_variants_cf$micovf\_cr$micovr\_fr$mifreq\_ph$miphred20\_outmode$output_mode\.tab$//; $id => $id; } @var_files; +%check_up = map { (my $id = $_) =~ s/\.gatk_position_variants_cf$micovf\_cr$micovr\_fr$mifreq\_ph$miphred20\_outmode[01]$snp_vars$lowfreq_vars\.tab$//; $id => $id; } @var_files; for(my $i = 0; $i < scalar(@pos_files); $i++) { my $tmp = $pos_files[$i]; $tmp =~ s/\.gatk_position_table\.tab//; @@ -560,7 +560,7 @@ foreach my $pos_file (sort { $a cmp $b } @pos_files_new) { print $logprint "\t",timer(),"\t$pos_file\n"; } print $logprint "\n\t",timer(),"\tStart variant calling...\n"; -tbvariants($logprint,$VAR_dir,$POS_OUT,$CALL_OUT,$ref,$refg,$micovf,$micovr,$miphred20,$mifreq,$resi_list_master,$int_regions,$all_vars,$snp_vars,$lowfreq_vars,@pos_files_new); +tbvariants($logprint,$VAR_dir,$POS_OUT,$CALL_OUT,$ref,$refg,$micovf,$micovr,$miphred20,$mifreq,$resi_list_master,$int_regions,$snp_vars,$lowfreq_vars,@pos_files_new); print $logprint "\t",timer(),"\tFinished variant calling!\n"; @pos_files = (); @var_files = (); @@ -600,7 +600,7 @@ foreach my $bam (sort { $a cmp $b } @bam_files_new) { print $logprint "\t",timer(),"\t$bam\n"; } print $logprint "\n\t",timer(),"\tStart statistics calculation...\n"; -tbstats($logprint,$W_dir,$VAR_dir,$SAMTOOLS_dir,$SAMTOOLS_call,$BAM_OUT,$POS_OUT,$STATS_OUT,$ref,$refg,$micovf,$micovr,$miphred20,$mifreq,$all_vars,$snp_vars,$lowfreq_vars,$date_string,@bam_files_new); +tbstats($logprint,$W_dir,$VAR_dir,$SAMTOOLS_dir,$SAMTOOLS_call,$BAM_OUT,$POS_OUT,$STATS_OUT,$ref,$refg,$micovf,$micovr,$miphred20,$mifreq,$snp_vars,$lowfreq_vars,$date_string,@bam_files_new); print $logprint "\t",timer(),"\tFinished statistics calculation!\n"; @bam_files = (); @pos_files = (); @@ -626,7 +626,7 @@ foreach my $pos_file (sort { $a cmp $b } @pos_files) { print $logprint "\t",timer(),"\t$pos_file\n"; } print $logprint "\n\t",timer(),"\tStart strain call...\n"; -tbstrains($logprint,$VAR_dir,$POS_OUT,$STRAIN_OUT,$ref,$refg,$micovf,$micovr,$mifreq,$miphred20,$date_string,$all_vars,$snp_vars,$lowfreq_vars,@pos_files); +tbstrains($logprint,$VAR_dir,$POS_OUT,$STRAIN_OUT,$ref,$refg,$micovf,$micovr,$mifreq,$miphred20,$date_string,$lowfreq_vars,@pos_files); print $logprint "\t",timer(),"\tFinished strain call!\n"; @pos_files = (); if($continue == 0 && $step ne 'TBfull') { exit 1; } @@ -653,7 +653,13 @@ while() { close(IN); opendir(CALLDIR,"$CALL_OUT") || die print $logprint "\t",timer(),"\tCan\'t open directory $CALL_OUT: MTBseq.pl line: ", __LINE__ ," \n"; opendir(JOINDIR,"$JOIN_OUT") || die print $logprint "\t",timer(),"\tCan\'t open directory $JOIN_OUT: MTBseq.pl line: ", __LINE__ ," \n"; -@var_files = grep { $_ =~ /^\w.*\.gatk_position_variants_cf$micovf\_cr$micovf\_fr$mifreq\_ph$miphred20\_outmode$output_mode\.tab$/ && -f "$CALL_OUT/$_" } readdir(CALLDIR); +if($snp_vars == 1){ +@var_files = grep { $_ =~ /^\w.*\.gatk_position_variants_cf$micovf\_cr$micovf\_fr$mifreq\_ph$miphred20\_outmode[01][01]$lowfreq_vars\.tab$/ && -f "$CALL_OUT/$_" } readdir(CALLDIR); +} +else { +@var_files = grep { $_ =~ /^\w.*\.gatk_position_variants_cf$micovf\_cr$micovf\_fr$mifreq\_ph$miphred20\_outmode[01]$snp_vars$lowfreq_vars\.tab$/ && -f "$CALL_OUT/$_" } readdir(CALLDIR); +} + @join_files = grep { $_ =~ /$group_name\_joint_cf$micovf\_cr$micovr\_fr$mifreq\_ph$miphred20\_samples$sample_number\.tab$/ && -f "$JOIN_OUT/$_" } readdir(JOINDIR); closedir(CALLDIR); closedir(JOINDIR); @@ -682,7 +688,7 @@ foreach my $var_file (sort { $a cmp $b } @var_files_new) { print $logprint "\t",timer(),"\t$var_file\n"; } print $logprint "\n\t",timer(),"\tStart creating joint variant table...\n"; -tbjoin($logprint,$VAR_dir,$POS_OUT,$CALL_OUT,$JOIN_OUT,$group_name,$ref,$refg,$micovf,$micovr,$miphred20,$mifreq,$all_vars,$snp_vars,$lowfreq_vars,@var_files_new); +tbjoin($logprint,$VAR_dir,$POS_OUT,$CALL_OUT,$JOIN_OUT,$group_name,$ref,$refg,$micovf,$micovr,$miphred20,$mifreq,$snp_vars,$lowfreq_vars,@var_files_new); print $logprint "\t",timer(),"\tFinished creating joint variant table!\n"; @var_files = (); @join_files = (); diff --git a/README.md b/README.md index 1c72e06..d3418e2 100644 --- a/README.md +++ b/README.md @@ -23,25 +23,6 @@ Install [Conda](https://conda.io/docs/) or [Miniconda](https://conda.io/minicond conda install -c bioconda mtbseq ``` - -### NOT NECESSARY ANYMORE - GATK will be installed through conda now! - -Due to license restrictions, even bioconda cannot install the dependency GenomeAnalysisTK 3.8 directly. -After installation of MTBseq to fully install the GATK, you must download a licensed copy of the GenomeAnalysisTK 3.8 -from the Broad Institute (https://console.cloud.google.com/storage/browser/gatk-software/package-archive/gatk). - -For direct download from the commandline: - -``` -wget https://storage.googleapis.com/gatk-software/package-archive/gatk/GenomeAnalysisTK-3.8-0-ge9d806836.tar.bz2 -``` - -To register call: -``` -gatk3-register /path/to/GenomeAnalysisTK[-$PKG_VERSION.tar.bz2|.jar] -``` -, which will copy GATK into your conda environment. - ## Source Please see the [MANUAL.md](https://github.com/ngs-fzb/MTBseq_source/blob/master/MANUAL.md) for installation from source. diff --git a/lib/TBjoin.pm b/lib/TBjoin.pm index 3907ef3..061cf09 100644 --- a/lib/TBjoin.pm +++ b/lib/TBjoin.pm @@ -9,7 +9,7 @@ use TBtools; use Exporter; use vars qw($VERSION @ISA @EXPORT); -$VERSION = 1.0.0; +$VERSION = 1.1.0; @ISA = qw(Exporter); @EXPORT = qw(tbjoin); @@ -27,8 +27,8 @@ sub tbjoin { my $micovr = shift; my $miphred20 = shift; my $mifreq = shift; - my $all_vars = shift; - $all_vars = 1; + #my $all_vars = shift; #deactivated the option + my $all_vars = 1; my $snp_vars = shift; my $lowfreq_vars = shift; my @var_files = @_; @@ -60,7 +60,7 @@ sub tbjoin { foreach my $file(sort { $a cmp $b } @var_files) { $file =~ /^(.*)\.gatk_position_variants_.*\.tab$/; my $id = $1; - parse_variants($logprint,$CALL_OUT,$file,$id,$var_positions,$strain,$micovf,$micovr,$mifreq,$miphred20); + parse_variants($logprint,$CALL_OUT,$file,$id,$var_positions,$strain,$micovf,$micovr,$mifreq,$miphred20,$snp_vars); push(@ids, $id); } print $logprint "\t",timer(),"\tFinished parsing variant files!\n"; diff --git a/lib/TBstats.pm b/lib/TBstats.pm index 7796ed5..0eff6bf 100644 --- a/lib/TBstats.pm +++ b/lib/TBstats.pm @@ -9,7 +9,7 @@ use TBtools; use Exporter; use vars qw($VERSION @ISA @EXPORT); -$VERSION = 1.0.0; +$VERSION = 1.1.0; @ISA = qw(Exporter); @EXPORT = qw(tbstats); @@ -29,7 +29,7 @@ sub tbstats { my $micovr = shift; my $miphred20 = shift; my $mifreq = shift; - my $all_vars = shift; + my $all_vars = 0; #deactivated the option my $snp_vars = shift; my $lowfreq_vars = shift; my $date_string = shift; diff --git a/lib/TBstrains.pm b/lib/TBstrains.pm index 366cdcf..2495ada 100644 --- a/lib/TBstrains.pm +++ b/lib/TBstrains.pm @@ -9,7 +9,7 @@ use TBtools; use Exporter; use vars qw($VERSION @ISA @EXPORT); -$VERSION = 1.0.0; +$VERSION = 1.1.0; @ISA = qw(Exporter); @EXPORT = qw(tbstrains); @@ -26,9 +26,9 @@ sub tbstrains { my $mifreq = shift; my $miphred20 = shift; my $date_string = shift; - my $all_vars = shift; - $all_vars = 1; # set to 1 for fetching all genome positions. - my $snp_vars = shift; + #my $all_vars = shift; + my $all_vars = 1; # set to 1 for fetching all genome positions. + my $snp_vars = 0; #deactivated the option my $lowfreq_vars = shift; my @position_tables = @_; my $genes = {}; diff --git a/lib/TBtools.pm b/lib/TBtools.pm index 9ed5f82..74f8824 100644 --- a/lib/TBtools.pm +++ b/lib/TBtools.pm @@ -11,7 +11,7 @@ use MCE; use Exporter; use vars qw($VERSION @ISA @EXPORT); -$VERSION = 1.0.2; +$VERSION = 1.1.0; @ISA = qw(Exporter); @EXPORT = qw(parse_reference parse_fasta @@ -935,6 +935,7 @@ sub parse_variants { # parse a variant file. my $micovr = shift; my $mifreq = shift; my $miphred20 = shift; + my $snp_vars = shift; open(IN,"$CALL_OUT/$file") || die print $logprint "\t",timer(),"\tCan't open $file: TBtools line: ", __LINE__ , " \n"; ; while() { @@ -959,6 +960,7 @@ sub parse_variants { # parse a variant file. my $resistance = shift(@fields); my $phylo = shift(@fields); my $region = shift(@fields); + next if(($snp_vars == 1) && !($type eq "SNP")); my $unambiguous_base_call = 0; if(($covf >= $micovf) && ($covr >= $micovr) && ($freq1 >= $mifreq) && ($qual20 >= $miphred20)) { if(($allel1 =~ /[ACGTacgt]/) || ($allel1 =~ /GAP/)) { diff --git a/lib/TBvariants.pm b/lib/TBvariants.pm index d11bc86..c0d3bff 100644 --- a/lib/TBvariants.pm +++ b/lib/TBvariants.pm @@ -8,7 +8,7 @@ use TBtools; use Exporter; use vars qw($VERSION @ISA @EXPORT); -$VERSION = 1.0.1; +$VERSION = 1.1.0; @ISA = qw(Exporter); @EXPORT = qw(tbvariants); @@ -26,7 +26,7 @@ sub tbvariants { my $mifreq = shift; my $resi_list_master = shift; my $int_regions = shift; - my $all_vars = shift; + my $all_vars = 0; #deactivated the option my $snp_vars = shift; my $lowfreq_vars = shift; my @pos_files = @_;