diff --git a/README.md b/README.md index 1585852..92897aa 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This codebase is derived from [KevinKuchinski/FluViewer](https://github.com/Kevi ## Analysis Stages -0. **Read Normalization**: The provided reads are normalized and downsampled using a kmer-based approach from [bbmap](https://sourceforge.net/projects/bbmap) called `bbnorm`. This reduces any excessive coverage of certain genome regions. +0. **Read Normalization**: (Optional) The provided reads are normalized and downsampled using a kmer-based approach from [bbmap](https://sourceforge.net/projects/bbmap) called `bbnorm`. This reduces any excessive coverage of certain genome regions. 1. **Assemble Contigs**: The normalized/downsampled reads are assembled de novo into contigs with the [spades](https://github.com/ablab/spades) assembler. @@ -54,7 +54,7 @@ Each stage selects its inputs from the `outputs` of the previous stages's analys ```mermaid flowchart TD - forward_reads[Forward Reads] -- input_reads_fwd --> normalization(Read Normalization) + forward_reads[Forward Reads] -- input_reads_fwd --> normalization("Read Normalization (Optional)") reverse_reads[Reverse Reads] -- input_reads_rev --> normalization normalization -- normalized_reads_fwd --> assemble_contigs(Assemble Contigs) normalization -- normalized_reads_rev --> assemble_contigs @@ -140,13 +140,18 @@ optional arguments: -g, --disable-garbage-collection Disable garbage collection and retain intermediate analysis files --skip-depth-normalization - Skip read depth normalization (bbnorm) step + Skip read depth normalization (bbnorm) stage. --force Allow overwrite of existing files and directories. --log-level {info,debug} Log level (default=info) --version show program's version number and exit ``` +### Depth Normalization + +Depending on the library preparation method used, some libraries get much higher depth of coverage near the +ends of each segment. 
+The `normalize_depth` stage is intended to normalize the depth-of-coverage across each segment to a consistent level, but this stage is optional. If you would like to skip the `normalize_depth` stage, simply add the `--skip-depth-normalization` flag. If that flag is used, the `assemble_contigs` stage will start directly with the set of reads supplied with the `--forward-reads` (`-f`) and `--reverse-reads` (`-r`) flags. ## FluViewer Database diff --git a/fluviewer/cli_args.py b/fluviewer/cli_args.py index 418545b..449c803 100644 --- a/fluviewer/cli_args.py +++ b/fluviewer/cli_args.py @@ -12,23 +12,23 @@ def parse_args(): :rtype: dict """ parser = argparse.ArgumentParser(description='BCCDC-PHL/FluViewer: Influenza A virus consensus sequence generation and variant calling') - parser.add_argument('-f', '--forward-reads', type=Path, required=True, help='Path to FASTQ file containing forward reads') - parser.add_argument('-r', '--reverse-reads', type=Path, required=True, help='Path to FASTQ file containing reverse reads') - parser.add_argument('-d', '--db', type=Path, required=True, help='Path to FASTA file containing FluViewer database') - parser.add_argument('-o', '--outdir', type=Path, help='Output directory (default=FluViewer_)') - parser.add_argument('-n', '--output-name', type=str, required=True, help='Output name. 
Includes this name in output files, and in consensus sequence headers') - parser.add_argument('-i', '--min-identity', type=float, default=90, metavar="[0-100]", help='Minimum percent sequence identity between database reference sequences and contigs (default=90)') - parser.add_argument('-l', '--min-alignment-length', type=int, default=50, metavar="[32-]", help='Minimum length of alignment between database reference sequences and contigs (default=50)') - parser.add_argument('-D', '--min-depth', type=int, default=20, metavar="[1-]", help='Minimum read depth for base calling (default=20)') - parser.add_argument('-q', '--min-mapping-quality', type=int, default=20, metavar="[0-]", help='Minimum PHRED score for mapping quality and base quality during variant calling (default=20)') - parser.add_argument('-v', '--variant-threshold-calling', type=float, default=0.75, metavar="[0-1]", help='Variant allele fraction threshold for calling variants (default=0.75)') - parser.add_argument('-V', '--variant-threshold-masking', type=float, default=0.25, metavar="[0-1]", help='Variant allele fraction threshold for masking ambiguous variants (default=0.25)') - parser.add_argument('-N', '--target-depth', type=int, default=200, metavar="[1-]", help='Target depth for pre-normalization of reads (default=200)') - parser.add_argument('-L', '--coverage-limit', type=int, default=200, metavar="[1-]", help='Coverage depth limit for variant calling (default=200)') - parser.add_argument('-t', '--threads', type=int, default=1, metavar="[1-]", help='Threads used for contig/scaffold alignments (default=1)') - parser.add_argument('-M', '--max-memory', type=int, metavar="[1-]", help='Gigabytes of memory allocated for normalizing reads (default=max)') - parser.add_argument('-g', '--disable-garbage-collection', action='store_true', help='Disable garbage collection and retain intermediate analysis files') - parser.add_argument('--skip-depth-normalization', action='store_true', help='Skip read depth 
normalization (bbnorm) step') + parser.add_argument('-f', '--forward-reads', type=Path, required=True, help='Path to FASTQ file containing forward reads.') + parser.add_argument('-r', '--reverse-reads', type=Path, required=True, help='Path to FASTQ file containing reverse reads.') + parser.add_argument('-d', '--db', type=Path, required=True, help='Path to FASTA file containing FluViewer database.') + parser.add_argument('-o', '--outdir', type=Path, help='Output directory. (default=FluViewer_)') + parser.add_argument('-n', '--output-name', type=str, required=True, help='Output name. Includes this name in output files, and in consensus sequence headers.') + parser.add_argument('-i', '--min-identity', type=float, default=90, metavar="[0-100]", help='Minimum percent sequence identity between database reference sequences and contigs. (default=90)') + parser.add_argument('-l', '--min-alignment-length', type=int, default=50, metavar="[32-]", help='Minimum length of alignment between database reference sequences and contigs. (default=50)') + parser.add_argument('-D', '--min-depth', type=int, default=20, metavar="[1-]", help='Minimum read depth for base calling. (default=20)') + parser.add_argument('-q', '--min-mapping-quality', type=int, default=20, metavar="[0-]", help='Minimum PHRED score for mapping quality and base quality during variant calling. (default=20)') + parser.add_argument('-v', '--variant-threshold-calling', type=float, default=0.75, metavar="[0-1]", help='Variant allele fraction threshold for calling variants. (default=0.75)') + parser.add_argument('-V', '--variant-threshold-masking', type=float, default=0.25, metavar="[0-1]", help='Variant allele fraction threshold for masking ambiguous variants. (default=0.25)') + parser.add_argument('-N', '--target-depth', type=int, default=200, metavar="[1-]", help='Target depth for pre-normalization of reads. 
(default=200)') + parser.add_argument('-L', '--coverage-limit', type=int, default=200, metavar="[1-]", help='Coverage depth limit for variant calling. (default=200)') + parser.add_argument('-t', '--threads', type=int, default=1, metavar="[1-]", help='Threads used for contig/scaffold alignments. (default=1)') + parser.add_argument('-M', '--max-memory', type=int, metavar="[1-]", help='Gigabytes of memory allocated for normalizing reads. (default=max)') + parser.add_argument('-g', '--disable-garbage-collection', action='store_true', help='Disable garbage collection and retain intermediate analysis files.') + parser.add_argument('--skip-depth-normalization', action='store_true', help='Skip read depth normalization (bbnorm) stage.') parser.add_argument('--force', action='store_true', help='Allow overwrite of existing files and directories.') parser.add_argument('--log-level', default='info', choices=['info', 'debug'], help='Log level (default=info)') parser.add_argument('--version', action='version', version=__version__) diff --git a/fluviewer/fluviewer.py b/fluviewer/fluviewer.py index 0aac0ef..8cd2597 100644 --- a/fluviewer/fluviewer.py +++ b/fluviewer/fluviewer.py @@ -338,15 +338,16 @@ def main(): 'filtered_scaffold_blast_results': blast_scaffolds_analysis_summary['outputs']['filtered_scaffold_blast_results'], 'database': os.path.abspath(args.db), } - if not args.skip_depth_normalization: + + if args.skip_depth_normalization: current_analysis_stage_inputs.update({ - 'reads_fwd': normalize_depth_analysis_summary['outputs']['normalized_reads_fwd'], - 'reads_rev': normalize_depth_analysis_summary['outputs']['normalized_reads_rev'], + 'reads_fwd': os.path.abspath(args.forward_reads), + 'reads_rev': os.path.abspath(args.reverse_reads), }) else: current_analysis_stage_inputs.update({ - 'reads_fwd': os.path.abspath(args.forward_reads), - 'reads_rev': os.path.abspath(args.reverse_reads), + 'reads_fwd': normalize_depth_analysis_summary['outputs']['normalized_reads_fwd'], + 
'reads_rev': normalize_depth_analysis_summary['outputs']['normalized_reads_rev'], }) log.info(f'Beginning analysis stage: {current_analysis_stage}')