work on implementing skipping depth normalization

BCCDC-PHL · Jul 11, 2024 · db50363 · db50363
1 parent e3d2163
commit db50363
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ This codebase is derived from [KevinKuchinski/FluViewer](https://github.com/Kevi
 
 ## Analysis Stages
 
-0. **Read Normalization**: The provided reads are normalized and downsampled using a kmer-based approach from [bbmap](https://sourceforge.net/projects/bbmap) called `bbnorm`. This reduces any excessive coverage of certain genome regions.
+0. **Read Normalization**: (Optional) The provided reads are normalized and downsampled using a kmer-based approach from [bbmap](https://sourceforge.net/projects/bbmap) called `bbnorm`. This reduces any excessive coverage of certain genome regions.
 
 1. **Assemble Contigs**: The normalized/downsampled reads (or the original input reads, if read normalization was skipped) are assembled de novo into contigs with the [spades](https://github.com/ablab/spades) assembler.
 
@@ -54,7 +54,7 @@ Each stage selects its inputs from the `outputs` of the previous stages's analys
 
 ```mermaid
 flowchart TD
-  forward_reads[Forward Reads] -- input_reads_fwd --> normalization(Read Normalization)
+  forward_reads[Forward Reads] -- input_reads_fwd --> normalization("Read Normalization (Optional)")
   reverse_reads[Reverse Reads] -- input_reads_rev --> normalization
   normalization -- normalized_reads_fwd --> assemble_contigs(Assemble Contigs)
   normalization -- normalized_reads_rev --> assemble_contigs
@@ -140,13 +140,18 @@ optional arguments:
   -g, --disable-garbage-collection
                         Disable garbage collection and retain intermediate analysis files
   --skip-depth-normalization
-                        Skip read depth normalization (bbnorm) step
+                        Skip read depth normalization (bbnorm) stage.
   --force               Allow overwrite of existing files and directories.
   --log-level {info,debug}
                         Log level (default=info)
   --version             show program's version number and exit
 ```
 
+### Depth Normalization
+
+Depending on the library preparation method used, some libraries get much higher depth of coverage near the
+ends of each segment.
+The `normalize_depth` stage is intended to normalize the depth-of-coverage across each segment to a consistent level, but this stage is optional. If you would like to skip the `normalize_depth` stage, simply add the `--skip-depth-normalization` flag. If that flag is used, the `assemble_contigs` stage will start directly with the set of reads supplied with the `--forward-reads` (`-f`) and `--reverse-reads` (`-r`) flags.
 
 ## FluViewer Database
 

diff --git a/fluviewer/cli_args.py b/fluviewer/cli_args.py
@@ -12,23 +12,23 @@ def parse_args():
     :rtype: dict
     """
     parser = argparse.ArgumentParser(description='BCCDC-PHL/FluViewer: Influenza A virus consensus sequence generation and variant calling')
-    parser.add_argument('-f', '--forward-reads', type=Path, required=True, help='Path to FASTQ file containing forward reads')
-    parser.add_argument('-r', '--reverse-reads', type=Path, required=True, help='Path to FASTQ file containing reverse reads')
-    parser.add_argument('-d', '--db', type=Path, required=True, help='Path to FASTA file containing FluViewer database')
-    parser.add_argument('-o', '--outdir', type=Path, help='Output directory (default=FluViewer_<output-name>)')
-    parser.add_argument('-n', '--output-name', type=str, required=True, help='Output name. Includes this name in output files, and in consensus sequence headers')
-    parser.add_argument('-i', '--min-identity', type=float, default=90, metavar="[0-100]", help='Minimum percent sequence identity between database reference sequences and contigs (default=90)')
-    parser.add_argument('-l', '--min-alignment-length', type=int, default=50, metavar="[32-]", help='Minimum length of alignment between database reference sequences and contigs (default=50)')
-    parser.add_argument('-D', '--min-depth', type=int, default=20, metavar="[1-]", help='Minimum read depth for base calling (default=20)')
-    parser.add_argument('-q', '--min-mapping-quality', type=int, default=20, metavar="[0-]", help='Minimum PHRED score for mapping quality and base quality during variant calling (default=20)')
-    parser.add_argument('-v', '--variant-threshold-calling', type=float, default=0.75, metavar="[0-1]", help='Variant allele fraction threshold for calling variants (default=0.75)')
-    parser.add_argument('-V', '--variant-threshold-masking', type=float, default=0.25, metavar="[0-1]", help='Variant allele fraction threshold for masking ambiguous variants (default=0.25)')
-    parser.add_argument('-N', '--target-depth', type=int, default=200, metavar="[1-]", help='Target depth for pre-normalization of reads (default=200)')
-    parser.add_argument('-L', '--coverage-limit', type=int, default=200, metavar="[1-]", help='Coverage depth limit for variant calling (default=200)')
-    parser.add_argument('-t', '--threads', type=int, default=1, metavar="[1-]", help='Threads used for contig/scaffold alignments (default=1)')
-    parser.add_argument('-M', '--max-memory', type=int, metavar="[1-]", help='Gigabytes of memory allocated for normalizing reads (default=max)')
-    parser.add_argument('-g', '--disable-garbage-collection', action='store_true', help='Disable garbage collection and retain intermediate analysis files')
-    parser.add_argument('--skip-depth-normalization', action='store_true', help='Skip read depth normalization (bbnorm) step')
+    parser.add_argument('-f', '--forward-reads', type=Path, required=True, help='Path to FASTQ file containing forward reads.')
+    parser.add_argument('-r', '--reverse-reads', type=Path, required=True, help='Path to FASTQ file containing reverse reads.')
+    parser.add_argument('-d', '--db', type=Path, required=True, help='Path to FASTA file containing FluViewer database.')
+    parser.add_argument('-o', '--outdir', type=Path, help='Output directory. (default=FluViewer_<output-name>)')
+    parser.add_argument('-n', '--output-name', type=str, required=True, help='Output name. Includes this name in output files, and in consensus sequence headers.')
+    parser.add_argument('-i', '--min-identity', type=float, default=90, metavar="[0-100]", help='Minimum percent sequence identity between database reference sequences and contigs. (default=90)')
+    parser.add_argument('-l', '--min-alignment-length', type=int, default=50, metavar="[32-]", help='Minimum length of alignment between database reference sequences and contigs. (default=50)')
+    parser.add_argument('-D', '--min-depth', type=int, default=20, metavar="[1-]", help='Minimum read depth for base calling. (default=20)')
+    parser.add_argument('-q', '--min-mapping-quality', type=int, default=20, metavar="[0-]", help='Minimum PHRED score for mapping quality and base quality during variant calling. (default=20)')
+    parser.add_argument('-v', '--variant-threshold-calling', type=float, default=0.75, metavar="[0-1]", help='Variant allele fraction threshold for calling variants. (default=0.75)')
+    parser.add_argument('-V', '--variant-threshold-masking', type=float, default=0.25, metavar="[0-1]", help='Variant allele fraction threshold for masking ambiguous variants. (default=0.25)')
+    parser.add_argument('-N', '--target-depth', type=int, default=200, metavar="[1-]", help='Target depth for pre-normalization of reads. (default=200)')
+    parser.add_argument('-L', '--coverage-limit', type=int, default=200, metavar="[1-]", help='Coverage depth limit for variant calling. (default=200)')
+    parser.add_argument('-t', '--threads', type=int, default=1, metavar="[1-]", help='Threads used for contig/scaffold alignments. (default=1)')
+    parser.add_argument('-M', '--max-memory', type=int, metavar="[1-]", help='Gigabytes of memory allocated for normalizing reads. (default=max)')
+    parser.add_argument('-g', '--disable-garbage-collection', action='store_true', help='Disable garbage collection and retain intermediate analysis files.')
+    parser.add_argument('--skip-depth-normalization', action='store_true', help='Skip read depth normalization (bbnorm) stage.')
     parser.add_argument('--force', action='store_true', help='Allow overwrite of existing files and directories.')
     parser.add_argument('--log-level', default='info', choices=['info', 'debug'], help='Log level (default=info)')
     parser.add_argument('--version', action='version', version=__version__)

diff --git a/fluviewer/fluviewer.py b/fluviewer/fluviewer.py
@@ -338,15 +338,16 @@ def main():
         'filtered_scaffold_blast_results': blast_scaffolds_analysis_summary['outputs']['filtered_scaffold_blast_results'],
         'database': os.path.abspath(args.db),
     }
-    if not args.skip_depth_normalization:
+
+    if args.skip_depth_normalization:
         current_analysis_stage_inputs.update({
-            'reads_fwd': normalize_depth_analysis_summary['outputs']['normalized_reads_fwd'],
-            'reads_rev': normalize_depth_analysis_summary['outputs']['normalized_reads_rev'],
+            'reads_fwd': os.path.abspath(args.forward_reads),
+            'reads_rev': os.path.abspath(args.reverse_reads),
         })
     else:
         current_analysis_stage_inputs.update({
-            'reads_fwd': os.path.abspath(args.forward_reads),
-            'reads_rev': os.path.abspath(args.reverse_reads),
+            'reads_fwd': normalize_depth_analysis_summary['outputs']['normalized_reads_fwd'],
+            'reads_rev': normalize_depth_analysis_summary['outputs']['normalized_reads_rev'],
         })
 
     log.info(f'Beginning analysis stage: {current_analysis_stage}')