diff --git a/Dockerfile b/Dockerfile index 2f834dc..b1d4fd1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,4 +19,3 @@ COPY ./NanoCaller /opt/conda/envs/nanocaller_env/bin COPY ./NanoCaller_WGS /opt/conda/envs/nanocaller_env/bin RUN chmod +x /opt/conda/envs/nanocaller_env/bin/NanoCaller -RUN chmod +x /opt/conda/envs/nanocaller_env/bin/NanoCaller_WGS diff --git a/NanoCaller b/NanoCaller index 9d84c02..bcd1d5d 100755 --- a/NanoCaller +++ b/NanoCaller @@ -9,135 +9,54 @@ from intervaltree import Interval, IntervalTree from subprocess import PIPE, Popen from nanocaller_src.utils import * - -def run(args, pool): +def run(args): from nanocaller_src import snpCaller, indelCaller - - if not args.output: - args.output=os.getcwd() - - os.makedirs(args.output, exist_ok=True) - - end=None - if not args.end: - try: - with open(args.ref+'.fai','r') as file: - for line in file: - if line.split('\t')[0]==args.chrom: - - end=int(line.split('\t')[1]) - - if end==None: - print('%s: contig %s not found in reference.' %(str(datetime.datetime.now()), args.chrom), flush=True) - sys.exit(2) - - except FileNotFoundError: - print('%s: Index file .fai required for reference genome file' %(str(datetime.datetime.now())), flush=True) - sys.exit(2) - - else: - end=args.end - - if not args.start: - start=1 - else: - start=args.start - threshold=[float(args.neighbor_threshold.split(',')[0]), float(args.neighbor_threshold.split(',')[1])] dirname = os.path.dirname(__file__) - if args.exclude_bed in ['hg38', 'hg19', 'mm10', 'mm39']: - args.exclude_bed=os.path.join(dirname, 'nanocaller_src/release_data/bed_files/%s_centro_telo.bed.gz' %args.exclude_bed) + regions_list=get_regions_list(args) - if args.include_bed: - tbx = pysam.TabixFile(args.include_bed) - include_intervals=IntervalTree(Interval(int(row[1]), int(row[2]), "%s" % (row[1])) for row in tbx.fetch(args.chrom, parser=pysam.asBed())) - - include_intervals=IntervalTree(include_intervals.overlap(start,end)) - - if include_intervals: - start=max(start, min(x[0] for x in include_intervals)) - end=min(end, max(x[1] for x in include_intervals)) - - else: - print('%s: No overlap between include_bed file and start/end coordinates' %(str(datetime.datetime.now())),flush=True) - return + if args.exclude_bed in ['hg38', 'hg19', 'mm10', 'mm39']: + args.exclude_bed=os.path.join(dirname, 'nanocaller_src/release_data/bed_files/%s_centro_telo.bed.gz' %args.exclude_bed) - in_dict={'chrom':args.chrom, 'start':start, 'end':end, 'sam_path':args.bam, 'fasta_path':args.ref, \ + snp_vcf=None + if args.mode in ['snps', 'all']: + snp_time=time.time() + chunks_list=get_chunks(regions_list, args.cpu) + in_dict={'chunks_list': chunks_list, 'regions_list': regions_list, 'sam_path':args.bam, 'fasta_path':args.ref, \ 'mincov':args.mincov, 'maxcov':args.maxcov, 'min_allele_freq':args.min_allele_freq, 'min_nbr_sites':args.min_nbr_sites, \ - 'threshold':threshold, 'snp_model':args.snp_model, 'cpu':args.cpu, 'vcf_path':args.output,'prefix':args.prefix,'sample':args.sample, \ - 'seq':args.sequencing, 'supplementary':args.supplementary, 'include_bed':args.include_bed, 'exclude_bed':args.exclude_bed} + 'threshold':threshold, 'snp_model':args.snp_model, 'cpu':args.cpu, \ + 'vcf_path':args.output, 'prefix':args.prefix, 'sample':args.sample, \ + 'seq':args.sequencing, 'supplementary':args.supplementary, \ + 'exclude_bed':args.exclude_bed, 'suppress_progress':args.suppress_progress_bar} - snp_vcf='' - if args.mode in ['snps', 'snps_unphased', 'both']: - snp_time=time.time() - snp_vcf=snpCaller.test_model(in_dict, pool) - print('\n%s: SNP calling completed for contig %s. Time taken= %.4f\n' %(str(datetime.datetime.now()), in_dict['chrom'], time.time()-snp_time),flush=True) - - if snp_vcf and args.mode in ['snps', 'both']: - enable_whatshap = '--distrust-genotypes --include-homozygous' if args.enable_whatshap else '' - - print('\n%s: ------WhatsHap SNP phasing log------\n' %(str(datetime.datetime.now())),flush=True) - - run_cmd("whatshap phase %s.vcf.gz %s -o %s.phased.preclean.vcf -r %s --ignore-read-groups --chromosome %s %s" %(snp_vcf,in_dict['sam_path'], snp_vcf, in_dict['fasta_path'], in_dict['chrom'] , enable_whatshap), verbose=True) - - - run_cmd("bcftools view -e 'GT=\"0\\0\"' %s.phased.preclean.vcf|bgziptabix %s.phased.vcf.gz" %(snp_vcf,snp_vcf)) - - print('\n%s: ------SNP phasing completed------\n' %(str(datetime.datetime.now())),flush=True) + snp_vcf=snpCaller.call_manager(in_dict) + print('\n%s: SNP calling completed. Time taken= %.4f\n' %(str(datetime.datetime.now()), time.time()-snp_time),flush=True) - - if args.mode=='both' or args.phase_bam: - print('\n%s: ------WhatsHap BAM phasing log------\n' %(str(datetime.datetime.now())),flush=True) - - run_cmd("whatshap haplotag --ignore-read-groups --ignore-linked-read -o %s.phased.bam --reference %s %s.phased.vcf.gz %s --regions %s:%d:%d --tag-supplementary" %(snp_vcf,in_dict['fasta_path'], snp_vcf, in_dict['sam_path'], args.chrom,start,end), verbose=True) - - run_cmd('samtools index %s.phased.bam' %snp_vcf ) - - print('\n%s: ------BAM phasing completed-----\n' %(str(datetime.datetime.now())),flush=True) - - else: - return - - if args.mode in ['indels','both']: - - sam_path= '%s.phased.bam' %snp_vcf if args.mode=='both' else args.bam - in_dict={'chrom':args.chrom, 'start':start, 'end':end, 'sam_path':sam_path, 'fasta_path':args.ref, \ - 'mincov':args.mincov, 'maxcov':args.maxcov, 'min_allele_freq':args.min_allele_freq, 'min_nbr_sites':args.min_nbr_sites, \ - 'threshold':threshold, 'snp_model':args.snp_model,'indel_model':args.indel_model, 'cpu':args.cpu, 'vcf_path':args.output,'prefix':args.prefix,'sample':args.sample, 'seq':args.sequencing, \ - 'del_t':args.del_threshold,'ins_t':args.ins_threshold,'impute_indel_phase':args.impute_indel_phase, 'supplementary':args.supplementary, 'include_bed':args.include_bed\ - , 'exclude_bed':args.exclude_bed,'win_size':args.win_size,'small_win_size':args.small_win_size} - ind_time=time.time() - indel_vcf=indelCaller.test_model(in_dict, pool) + if args.mode in ['indels', 'all'] or args.phase: + indel_phase_time=time.time() + chunks_list=get_chunks(regions_list, args.cpu, max_chunk_size=100000) - print('%s: Post processing' %(str(datetime.datetime.now())),flush=True) + in_dict={'chunks_list': chunks_list,'mode':args.mode, 'snp_vcf':snp_vcf, 'regions_list': regions_list,\ + 'sam_path':args.bam, 'fasta_path':args.ref, 'mincov':args.mincov,\ + 'maxcov':args.maxcov, 'indel_model':args.indel_model, 'cpu':args.cpu,\ + 'vcf_path':args.output, 'prefix':args.prefix, 'sample':args.sample,\ + 'seq':args.sequencing, 'del_t':args.del_threshold, 'ins_t':args.ins_threshold,\ + 'impute_indel_phase':args.impute_indel_phase, 'supplementary':args.supplementary,\ + 'exclude_bed':args.exclude_bed, 'win_size':args.win_size, 'small_win_size':args.small_win_size, + 'enable_whatshap':args.enable_whatshap, 'suppress_progress':args.suppress_progress_bar} - run_cmd('samtools faidx %s %s > %s/%s.fa' %(args.ref,args.chrom,args.output,args.chrom)) + files=indelCaller.call_manager(in_dict) + print('\n%s: %s completed. Time taken= %.4f\n' %(str(datetime.datetime.now()), 'Phasing' if args.mode=='snps' else 'Indel calling', time.time()-indel_phase_time),flush=True) - remove_path('%s/ref.sdf' %args.output) - - run_cmd('rtg RTG_MEM=4G format -f fasta %s/%s.fa -o %s/ref.sdf' % (args.output,args.chrom,args.output)) - - remove_path('%s.vcf.gz' %indel_vcf) - - run_cmd('rtg RTG_MEM=4G vcfdecompose -i %s.raw.vcf.gz --break-mnps -o - -t %s/ref.sdf|rtg RTG_MEM=4G vcffilter -i - --non-snps-only -o %s.vcf.gz' %(indel_vcf,args.output,indel_vcf)) - print('%s: Indel calling completed for contig %s. Time taken= %.4f' %(str(datetime.datetime.now()), in_dict['chrom'], time.time()-ind_time),flush=True) - - if args.mode=='both': - if not args.keep_bam and not args.phase_bam: - os.remove('%s.phased.bam' %snp_vcf) - - final_path=os.path.join(args.output,'%s.final.vcf.gz' %args.prefix) - run_cmd('bcftools concat %s.phased.vcf.gz %s.vcf.gz -a -d all |bgziptabix %s' %(snp_vcf, indel_vcf, final_path)) - - if __name__ == '__main__': + t=time.time() @@ -162,63 +81,65 @@ if __name__ == '__main__': requiredNamed = parser.add_argument_group('Required Arguments') preset_group=parser.add_argument_group("Preset") config_group = parser.add_argument_group('Configurations') - region_group=parser.add_argument_group("Variant Calling Regions") + region_group=parser.add_argument_group(title="Variant Calling Regions", description="Use only one of these options to specify regions for variant calling: --regions or --bed option or --wgs_contigs. If none is provided then whole genome variant calling is assumed and all contigs in the BAM file are used.") snp_group=parser.add_argument_group("SNP Calling") indel_group=parser.add_argument_group("Indel Calling") out_group=parser.add_argument_group("Output Options") phase_group=parser.add_argument_group("Phasing") - config_group.add_argument("-mode", "--mode", help="NanoCaller mode to run, options are 'snps', 'snps_unphased', 'indels' and 'both'. 'snps_unphased' mode quits NanoCaller without using WhatsHap for phasing.", type=str, default='both') - config_group.add_argument("-seq", "--sequencing", help="Sequencing type, options are 'ont', 'ul_ont', 'ul_ont_extreme', and 'pacbio'. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and CLR reads, use 'pacbio'.", type=str, default='ont') + config_group.add_argument("--mode", help="NanoCaller mode to run. 'snps' mode quits NanoCaller without using WhatsHap for phasing. In this mode, if you want NanoCaller to phase SNPs and BAM files, use --phase argument additionally.", type=str, default='all', choices=['snps', 'indels', 'all']) + config_group.add_argument("--sequencing", help="Sequencing type. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and CLR reads, use 'pacbio'.", type=str, default='ont', choices=['ont', 'ul_ont', 'ul_ont_extreme', 'pacbio']) - config_group.add_argument("-cpu", "--cpu", help="Number of CPUs to use", type=int, default=1) - - config_group.add_argument("-mincov", "--mincov", help="Minimum coverage to call a variant", type=int, default=8) - config_group.add_argument("-maxcov", "--maxcov", help="Maximum coverage of reads to use. If sequencing depth at a candidate site exceeds maxcov then reads are downsampled.", type=int, default=160) + config_group.add_argument("--cpu", help="Number of CPUs to use.", type=int, default=1) + config_group.add_argument("--mincov", help="Minimum coverage to call a variant", type=int, default=4) + config_group.add_argument("--maxcov", help="Maximum coverage of reads to use. If sequencing depth at a candidate site exceeds maxcov then reads are downsampled.", type=int, default=160) + config_group.add_argument("--suppress_progress_bar", help="Do not show progress bar.", default=False, action='store_true') #output options - out_group.add_argument('-keep_bam','--keep_bam', help='Keep phased bam files.', default=False, action='store_true') - out_group.add_argument("-o", "--output", help="VCF output path, default is current working directory", type=str) - out_group.add_argument("-prefix", "--prefix", help="VCF file prefix", type=str, default='variant_calls') - out_group.add_argument("-sample", "--sample", help="VCF file sample name", type=str, default='SAMPLE') + out_group.add_argument("--output", help="VCF output path, default is current working directory", type=str) + out_group.add_argument("--prefix", help="VCF file prefix", type=str, default='variant_calls') + out_group.add_argument("--sample", help="VCF file sample name", type=str, default='SAMPLE') + #region - region_group.add_argument("-include_bed", "--include_bed", help="Only call variants inside the intervals specified in the bgzipped and tabix indexed BED file. If any other flags are used to specify a region, intersect the region with intervals in the BED file, e.g. if -chom chr1 -start 10000000 -end 20000000 flags are set, call variants inside the intervals specified by the BED file that overlap with chr1:10000000-20000000. Same goes for the case when whole genome variant calling flag is set.", type=str, default=None) - region_group.add_argument("-exclude_bed", "--exclude_bed", help="Path to bgzipped and tabix indexed BED file containing intervals to ignore for variant calling. BED files of centromere and telomere regions for the following genomes are included in NanoCaller: hg38, hg19, mm10 and mm39. To use these BED files use one of the following options: ['hg38', 'hg19', 'mm10', 'mm39'].", type=str, default=None) - region_group.add_argument("-start", "--start", help="start, default is 1", type=int) - region_group.add_argument("-end", "--end", help="end, default is the end of contig", type=int) + region_group.add_argument( "--regions", nargs='*', help='A space/whitespace separated list of regions specified as "CONTIG_NAME" or "CONTIG_NAME:START-END". If you want to use "CONTIG_NAME:START-END" format then specify both start and end coordinates. For example: chr3 chr6:28000000-35000000 chr22.') + + region_group.add_argument("--bed", help="A BED file specifying regions for variant calling.", type=str, default=None) + + region_group.add_argument('--wgs_contigs', help="""Preset list of chromosomes to use for variant calling on human genomes. "chr1-22XY" option will assume human reference genome with "chr" prefix present in the chromosome notation, and run NanoCaller on chr1 to chr22, chrX and chrY. "1-22XY" option will assume no "chr" prefix is present in the chromosome notation and run NanoCaller on chromosomes 1-22, X and Y.""", type=str, default=None, choices=["chr1-22XY", "1-22XY"]) + + region_group.add_argument("--exclude_bed", help="Path to bgzipped and tabix indexed BED file containing intervals to ignore for variant calling. BED files of centromere and telomere regions for the following genomes are included in NanoCaller: hg38, hg19, mm10 and mm39.", type=str, default=None, choices=['hg38', 'hg19', 'mm10', 'mm39']) #preset - preset_group.add_argument("-p", "--preset", help="Apply recommended preset values for SNP and Indel calling parameters, options are 'ont', 'ul_ont', 'ul_ont_extreme', 'ccs' and 'clr'. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and CLR reads, use 'ccs'and 'clr' respectively. Presets are described in detail here: github.com/WGLab/NanoCaller/blob/master/docs/Usage.md#preset-options.", type=str) + preset_group.add_argument("--preset", help="Apply recommended preset values for SNP and Indel calling parameters. 'ont' works well for all types of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and CLR reads, use 'ccs'and 'clr' respectively. Presets are described in detail here: github.com/WGLab/NanoCaller/blob/master/docs/Usage.md#preset-options.", type=str, choices=[ 'ont', 'ul_ont', 'ul_ont_extreme', 'ccs', 'clr']) #required - requiredNamed.add_argument("-bam", "--bam", help="Bam file, should be phased if 'indel' mode is selected", required=True) - requiredNamed.add_argument("-ref", "--ref", help="Reference genome file with .fai index", required=True) - requiredNamed.add_argument("-chrom", "--chrom", help="Chromosome",required=True) + requiredNamed.add_argument("--bam", help="Bam file, should be phased if 'indel' mode is selected", required=True) + requiredNamed.add_argument("--ref", help="Reference genome file with .fai index", required=True) #snp - snp_group.add_argument("-snp_model", "--snp_model", help="NanoCaller SNP model to be used", default='ONT-HG002') - snp_group.add_argument("-min_allele_freq", "--min_allele_freq", help="minimum alternative allele frequency", type=float, default=0.15) - snp_group.add_argument("-min_nbr_sites", "--min_nbr_sites", help="minimum number of nbr sites", type=int, default =1) - snp_group.add_argument("-nbr_t", "--neighbor_threshold", help="SNP neighboring site thresholds with lower and upper bounds seperated by comma, for Nanopore reads '0.4,0.6' is recommended, for PacBio CCS anc CLR reads '0.3,0.7' and '0.3,0.6' are recommended respectively", type=str, default='0.4,0.6') - snp_group.add_argument("-sup", "--supplementary", help="Use supplementary reads", default=False, action='store_true') + snp_group.add_argument("--snp_model", help="NanoCaller SNP model to be used", default='ONT-HG002') + snp_group.add_argument("--min_allele_freq", help="minimum alternative allele frequency", type=float, default=0.15) + snp_group.add_argument("--min_nbr_sites", help="minimum number of nbr sites", type=int, default =1) + snp_group.add_argument("--neighbor_threshold", help="SNP neighboring site thresholds with lower and upper bounds seperated by comma, for Nanopore reads '0.4,0.6' is recommended, for PacBio CCS anc CLR reads '0.3,0.7' and '0.3,0.6' are recommended respectively", type=str, default='0.4,0.6') + snp_group.add_argument("--supplementary", help="Use supplementary reads, not fully supported at the moment", default=False, action='store_true') #indel - indel_group.add_argument("-indel_model", "--indel_model", help="NanoCaller indel model to be used", default='ONT-HG002') - indel_group.add_argument("-ins_t", "--ins_threshold", help="Insertion Threshold",type=float,default=0.4) - indel_group.add_argument("-del_t", "--del_threshold", help="Deletion Threshold",type=float,default=0.6) - indel_group.add_argument("-win_size", "--win_size", help="Size of the sliding window in which the number of indels is counted to determine indel candidate site. Only indels longer than 2bp are counted in this window. Larger window size can increase recall, but use a maximum of 50 only", type=int, default=40) - indel_group.add_argument("-small_win_size", "--small_win_size", help="Size of the sliding window in which indel frequency is determined for small indels", type=int, default=4) + indel_group.add_argument("--indel_model", help="NanoCaller indel model to be used", default='ONT-HG002') + indel_group.add_argument("--ins_threshold", help="Insertion Threshold",type=float,default=0.4) + indel_group.add_argument("--del_threshold", help="Deletion Threshold",type=float,default=0.6) + indel_group.add_argument("--win_size", help="Size of the sliding window in which the number of indels is counted to determine indel candidate site. Only indels longer than 2bp are counted in this window. Larger window size can increase recall, but use a maximum of 50 only", type=int, default=40) + indel_group.add_argument("--small_win_size", help="Size of the sliding window in which indel frequency is determined for small indels", type=int, default=4) - indel_group.add_argument('-impute_indel_phase','--impute_indel_phase', help='Infer read phase by rudimentary allele clustering if the no or insufficient phasing information is available, can be useful for datasets without SNPs or regions with poor phasing quality', default=False, action='store_true') + indel_group.add_argument('--impute_indel_phase', help='Infer read phase by rudimentary allele clustering if the no or insufficient phasing information is available, can be useful for datasets without SNPs or regions with poor phasing quality', default=False, action='store_true') #phasing - phase_group.add_argument('-phase_bam','--phase_bam', help='Phase bam files if snps mode is selected. This will phase bam file without indel calling.', default=False, action='store_true') + phase_group.add_argument('--phase', help='Phase SNPs and BAM files if snps mode is selected.', default=False, action='store_true') - phase_group.add_argument("-enable_whatshap", "--enable_whatshap", help="Allow WhatsHap to change SNP genotypes when phasing using --distrust-genotypes and --include-homozygous flags (this is not the same as regenotyping), considerably increasing the time needed for phasing. It has a negligible effect on SNP calling accuracy for Nanopore reads, but may make a small improvement for PacBio reads. By default WhatsHap will only phase SNP calls produced by NanoCaller, but not change their genotypes.", default=False, action='store_true') + phase_group.add_argument("--enable_whatshap", help="Allow WhatsHap to change SNP genotypes when phasing using --distrust-genotypes and --include-homozygous flags (this is not the same as regenotyping), increasing the time needed for phasing. It has a negligible effect on SNP calling accuracy for Nanopore reads, but may make a small improvement for PacBio reads. By default WhatsHap will only phase SNP calls produced by NanoCaller, but not change their genotypes.", default=False, action='store_true') args = parser.parse_args() @@ -229,9 +150,6 @@ if __name__ == '__main__': for x in sys.argv: if '--' in x: set_flags.append(x.replace('-','')) - - elif '-' in x: - set_flags.append(flag_map(x.replace('-',''))) if args.preset: for p in preset_dict[args.preset]: @@ -252,27 +170,12 @@ if __name__ == '__main__': print('\n%s: Starting NanoCaller.\n\nNanoCaller command and arguments are saved in the following file: %s\n' %(str(datetime.datetime.now()), os.path.join(args.output,'args')), flush=True) - pool = mp.Pool(processes=args.cpu) - - run(args, pool) - pool.close() - pool.join() - - - final_file_name={'snps_unphased':'snps','snps':'snps.phased', 'both':'final', 'indels':'indels'} - - final_output=os.path.join(args.output,'%s.%s.vcf.gz' %(args.prefix, final_file_name[args.mode])) - - print('\n\n-----------------------------------------------------------\n\n',) - print ('\n%s: Variant calling output can be found here: %s' %(str(datetime.datetime.now()), final_output)) - - print('\n%s: Printing variant calling summary statistics' %str(datetime.datetime.now())) - stats=run_cmd('rtg vcfstats %s' %final_output, output=True, verbose=True, error=True) + if args.regions and args.bed: + print('\n%s: Please use either --regions or --bed but not both.' %str(datetime.datetime.now(), flush=True)) + sys.exit(2) - print('\n%s: Saving variant calling summary statistics to: %s' %(str(datetime.datetime.now()), os.path.join(args.output,'%s.summary' %args.prefix))) - with open(os.path.join(args.output,'%s.summary' %args.prefix), 'w') as stats_file: - stats_file.write(stats) + run(args) elapsed=time.time()-t diff --git a/NanoCaller_WGS b/NanoCaller_WGS deleted file mode 100755 index 40a8f36..0000000 --- a/NanoCaller_WGS +++ /dev/null @@ -1,290 +0,0 @@ -#!/usr/bin/env python - -from warnings import simplefilter -simplefilter(action='ignore', category=FutureWarning) - -import time, argparse, os, shutil, sys, pysam, datetime, re -import multiprocessing as mp -from intervaltree import Interval, IntervalTree -from subprocess import PIPE, Popen -from nanocaller_src.utils import * - -if __name__ == '__main__': - - t=time.time() - - preset_dict={'ont':{'sequencing':'ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False}, - - 'ul_ont': {'sequencing':'ul_ont', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False}, - - 'ul_ont_extreme':{'sequencing':'ul_ont_extreme', 'snp_model':'ONT-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.4,0.6', 'ins_threshold':0.4,'del_threshold':0.6, 'enable_whatshap':False}, - - 'ccs':{'sequencing':'pacbio', 'snp_model': 'CCS-HG002', 'indel_model':'CCS-HG002', 'neighbor_threshold':'0.3,0.7', 'ins_threshold':0.4,'del_threshold':0.4, 'enable_whatshap':True}, - - 'clr':{'sequencing':'pacbio', 'snp_model':'CLR-HG002', 'indel_model':'ONT-HG002', 'neighbor_threshold':'0.3,0.6', 'ins_threshold':0.6,'del_threshold':0.6, 'win_size':10, 'small_win_size':2, 'enable_whatshap':True} - } - - flag_dict={"seq":"sequencing", "p":"preset", "o":"output", "sup":"supplementary","nbr_t":"neighbor_threshold","ins_t":"ins_threshold", "del_t":"del_threshold"} - flag_map=lambda x: flag_dict[x] if x in flag_dict else x - - - - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - requiredNamed = parser.add_argument_group('Required Arguments') - preset_group=parser.add_argument_group("Preset") - config_group = parser.add_argument_group('Configurations') - region_group=parser.add_argument_group("Variant Calling Regions") - - snp_group=parser.add_argument_group("SNP Calling") - indel_group=parser.add_argument_group("Indel Calling") - out_group=parser.add_argument_group("Output Options") - phase_group=parser.add_argument_group("Phasing") - - config_group.add_argument("-mode", "--mode", help="NanoCaller mode to run, options are 'snps', 'snps_unphased', 'indels' and 'both'. 'snps_unphased' mode quits NanoCaller without using WhatsHap for phasing.", type=str, default='both') - - config_group.add_argument("-seq", "--sequencing", help="Sequencing type, options are 'ont', 'ul_ont', 'ul_ont_extreme', and 'pacbio'. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and CLR reads, use 'pacbio'.", type=str, default='ont') - - config_group.add_argument("-cpu", "--cpu", help="Number of CPUs to use", type=int, default=1) - - config_group.add_argument("-mincov", "--mincov", help="Minimum coverage to call a variant", type=int, default=8) - config_group.add_argument("-maxcov", "--maxcov", help="Maximum coverage of reads to use. If sequencing depth at a candidate site exceeds maxcov then reads are downsampled.", type=int, default=160) - - #output options - out_group.add_argument('-keep_bam','--keep_bam', help='Keep phased bam files.', default=False, action='store_true') - out_group.add_argument("-o", "--output", help="VCF output path, default is current working directory", type=str) - out_group.add_argument("-prefix", "--prefix", help="VCF file prefix", type=str, default='variant_calls') - out_group.add_argument("-sample", "--sample", help="VCF file sample name", type=str, default='SAMPLE') - - #region - region_group.add_argument("-chrom", "--chrom", nargs='*', help='A space/whitespace separated list of contigs, e.g. chr3 chr6 chr22.') - region_group.add_argument("-include_bed", "--include_bed", help="Only call variants inside the intervals specified in the bgzipped and tabix indexed BED file. If any other flags are used to specify a region, intersect the region with intervals in the BED file, e.g. if -chom chr1 -start 10000000 -end 20000000 flags are set, call variants inside the intervals specified by the BED file that overlap with chr1:10000000-20000000. Same goes for the case when whole genome variant calling flag is set.", type=str, default=None) - region_group.add_argument("-exclude_bed", "--exclude_bed", help="Path to bgzipped and tabix indexed BED file containing intervals to ignore for variant calling. BED files of centromere and telomere regions for the following genomes are included in NanoCaller: hg38, hg19, mm10 and mm39. To use these BED files use one of the following options: ['hg38', 'hg19', 'mm10', 'mm39'].", type=str, default=None) - region_group.add_argument('-wgs_contigs_type','--wgs_contigs_type', \ - help="""Options are "with_chr", "without_chr" and "all",\ - "with_chr" option will assume \ - human genome and run NanoCaller on chr1-22, "without_chr" will \ - run on chromosomes 1-22 if the BAM and reference genome files \ - use chromosome names without "chr". "all" option will run \ - NanoCaller on each contig present in reference genome FASTA file.""", \ - type=str, default='all') - - - #preset - preset_group.add_argument("-p", "--preset", help="Apply recommended preset values for SNP and Indel calling parameters, options are 'ont', 'ul_ont', 'ul_ont_extreme', 'ccs' and 'clr'. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and CLR reads, use 'ccs'and 'clr' respectively. Presets are described in detail here: github.com/WGLab/NanoCaller/blob/master/docs/Usage.md#preset-options.", type=str) - - #required - requiredNamed.add_argument("-bam", "--bam", help="Bam file, should be phased if 'indel' mode is selected", required=True) - requiredNamed.add_argument("-ref", "--ref", help="Reference genome file with .fai index", required=True) - - #snp - snp_group.add_argument("-snp_model", "--snp_model", help="NanoCaller SNP model to be used", default='ONT-HG002') - snp_group.add_argument("-min_allele_freq", "--min_allele_freq", help="minimum alternative allele frequency", type=float, default=0.15) - snp_group.add_argument("-min_nbr_sites", "--min_nbr_sites", help="minimum number of nbr sites", type=int, default =1) - snp_group.add_argument("-nbr_t", "--neighbor_threshold", help="SNP neighboring site thresholds with lower and upper bounds seperated by comma, for Nanopore reads '0.4,0.6' is recommended, for PacBio CCS anc CLR reads '0.3,0.7' and '0.3,0.6' are recommended respectively", type=str, default='0.4,0.6') - snp_group.add_argument("-sup", "--supplementary", help="Use supplementary reads", default=False, action='store_true') - - - #indel - indel_group.add_argument("-indel_model", "--indel_model", help="NanoCaller indel model to be used", default='ONT-HG002') - indel_group.add_argument("-ins_t", "--ins_threshold", help="Insertion Threshold",type=float,default=0.4) - indel_group.add_argument("-del_t", "--del_threshold", help="Deletion Threshold",type=float,default=0.6) - indel_group.add_argument("-win_size", "--win_size", help="Size of the sliding window in which the number of indels is counted to determine indel candidate site. Only indels longer than 2bp are counted in this window. Larger window size can increase recall, but use a maximum of 50 only", type=int, default=40) - indel_group.add_argument("-small_win_size", "--small_win_size", help="Size of the sliding window in which indel frequency is determined for small indels", type=int, default=4) - - indel_group.add_argument('-impute_indel_phase','--impute_indel_phase', help='Infer read phase by rudimentary allele clustering if the no or insufficient phasing information is available, can be useful for datasets without SNPs or regions with poor phasing quality', default=False, action='store_true') - - #phasing - phase_group.add_argument('-phase_bam','--phase_bam', help='Phase bam files if snps mode is selected. This will phase bam file without indel calling.', default=False, action='store_true') - - phase_group.add_argument("-enable_whatshap", "--enable_whatshap", help="Allow WhatsHap to change SNP genotypes when phasing using --distrust-genotypes and --include-homozygous flags (this is not the same as regenotyping), considerably increasing the time needed for phasing. It has a negligible effect on SNP calling accuracy for Nanopore reads, but may make a small improvement for PacBio reads. By default WhatsHap will only phase SNP calls produced by NanoCaller, but not change their genotypes.", default=False, action='store_true') - - - - - - args = parser.parse_args() - - set_flags=[] - - for x in sys.argv: - if '--' in x: - set_flags.append(x.replace('-','')) - - elif '-' in x: - set_flags.append(flag_map(x.replace('-',''))) - - if args.preset: - for p in preset_dict[args.preset]: - if p not in set_flags: - vars(args)[p]=preset_dict[args.preset][p] - - - if not args.output: - args.output=os.getcwd() - - if args.phase_bam: - args.keep_bam=True - - make_and_remove_path(os.path.join(args.output, 'intermediate_files')) - - log_path=os.path.join(args.output,'logs' ) - make_and_remove_path(log_path) - - remove_path(os.path.join(args.output,'args')) - - with open(os.path.join(args.output,'args'),'w') as file: - file.write('Command: python %s\n\n\n' %(' '.join(sys.argv))) - file.write('------Parameters Used For Variant Calling------\n') - for k in vars(args): - file.write('{}: {}\n'.format(k,vars(args)[k]) ) - - - print('\n%s: Starting NanoCaller.\n\nNanoCaller command and arguments are saved in the following file: %s\n' %(str(datetime.datetime.now()), os.path.join(args.output,'args')), flush=True) - - if args.chrom: - chrom_list= args.chrom - - else: - if args.wgs_contigs_type=='with_chr': - chrom_list=['chr%d' %d for d in range(1,23)] - - elif args.wgs_contigs_type == 'without_chr': - chrom_list=['%d' %d for d in range(1,23)] - - elif args.wgs_contigs_type == 'all': - chrom_list=[] - - try: - with open(args.ref+'.fai','r') as file: - for line in file: - chrom_list.append(line.split('\t')[0]) - - except FileNotFoundError: - print('%s: index file .fai required for reference genome file.\n' %str(datetime.datetime.now()), flush=True) - sys.exit(2) - - if args.include_bed: - stream=run_cmd('zcat %s|cut -f 1|uniq' %args.include_bed, output=True) - bed_chroms=stream.split() - - chrom_list=[chrom for chrom in chrom_list if chrom in bed_chroms] - - args_dict=vars(args) - chrom_lengths={} - with open(args.ref+'.fai','r') as file: - for line in file: - chrom_lengths[line.split('\t')[0]]=int(line.split('\t')[1]) - - bam_chrom_list=out=run_cmd('samtools idxstats %s|cut -f 1' %args.bam, output=True).split('\n') - - - job_dict={} - - remove_path(os.path.join(args.output,'wg_commands')) - with open(os.path.join(args.output,'wg_commands'),'w') as wg_commands: - job_counter=0 - for chrom in chrom_list: - cmd='' - - for x in args_dict: - if x in ['chrom','wgs_contigs_type','start','end','output','cpu','prefix'] or args_dict[x] is None: - pass - - elif x in ['supplementary', 'enable_whatshap','keep_bam','phase_bam','impute_indel_phase']: - if args_dict[x]==True: - cmd+=' --%s ' %x - - else: - cmd+= '--%s %s ' %(x, args_dict[x]) - - dirname = os.path.dirname(__file__) - - try: - chr_end=chrom_lengths[chrom] - - except KeyError: - print('Contig %s not found in reference. Ignoring it.' %chrom,flush=True) - continue - - if chrom not in bam_chrom_list: - print('Contig %s not found in BAM file. Ignoring it.' %chrom,flush=True) - continue - - for mbase in range(1,chr_end,10000000): - job_id='%s_%d_%d' %(chrom, mbase, min(chr_end,mbase+10000000-1)) - out_path=os.path.join(args.output, 'intermediate_files', job_id) - job_command='%s -chrom %s %s -cpu 1 --output %s -start %d -end %d -prefix %s > %s/%s 2>&1' %(os.path.join(dirname,'NanoCaller'), chrom, cmd, out_path ,mbase, min(chr_end,mbase+10000000-1),job_id, log_path,job_id) - job_dict[job_id]=job_command - wg_commands.write('%s\n' %job_command) - - job_counter+=1 - - if job_counter==0: - print('VARIANT CALLING FAILED due to lack of suitable contigs. Please check if the contig names specified are consistent and present in reference genome, BAM file and any --include_bed file.\n', flush=True) - sys.exit(2) - - print('%s: Commands for running NanoCaller on contigs in whole genome are saved in the file: %s.\n' %(str(datetime.datetime.now()), os.path.join(args.output,'wg_commands')), flush=True) - - - - print('Running %d jobs using %d workers in parallel.\n\nIMPORTANT: Logs for each parallel job generated by NanoCaller are saved in the file: %s, check this directory for additional information for any errors in running the jobs.\nA log file created by parallel command is saved in the file: %s, which contains exit codes of each parallel job.\n' %(job_counter, args.cpu, log_path, os.path.join(args.output,'parallel_run_log')), flush=True) - - remove_path(os.path.join(args.output,'parallel_run_log')) - run_cmd('cat %s|parallel -j %d --joblog %s' %(os.path.join(args.output,'wg_commands'), args.cpu, os.path.join(args.output,'parallel_run_log')), verbose=True) - - out_path=os.path.join(args.output, 'intermediate_files') - - bad_runs=run_cmd('grep -L "Total Time Elapsed" %s/*' %log_path, output=True) - - if bad_runs: - failed_job_file=os.path.join(args.output,'failed_jobs_commands') - failed_job_file_cmb_logs=os.path.join(args.output,'failed_jobs_combined_logs') - failed_jobs_names=re.findall('%s/(.+?)\n' %log_path,bad_runs) - failed_jobs_logs=re.findall('(.+?)\n',bad_runs) - - run_cmd('grep -L "Total Time Elapsed" %s/*| while read file; do printf "\n\n\n#### Log File: $file ####\n"; cat $file; printf "%%0.s-" {1..100}; done > %s' %(log_path, failed_job_file_cmb_logs)) - - print('Number of jobs failed = %d\nCombined logs of failed jobs are written in this file: %s\nThe commands of these jobs are stored in the following file: %s\n' %(len(failed_jobs_names),failed_job_file_cmb_logs, failed_job_file),flush=True) - - with open(failed_job_file,'w') as fail_job: - for job,log in zip(failed_jobs_names,failed_jobs_logs): - fail_job.write('%s\n' %job_dict[job]) - - - - final_logs='' - if args.mode in ['snps_unphased','snps','both']: - final_logs+=run_cmd('ls -1 %s/*/*snps.vcf.gz|bcftools concat -f - -a|bcftools sort|bgziptabix %s/%s.snps.vcf.gz' %(out_path, args.output, args.prefix),error=True) - - if args.mode!='snps_unphased': - final_logs+=run_cmd('ls -1 %s/*/*snps.phased.vcf.gz|bcftools concat -f - -a|bcftools sort|bgziptabix %s/%s.snps.phased.vcf.gz' %(out_path, args.output, args.prefix),error=True) - - if args.mode in ['indels','both']: - final_logs+=run_cmd('ls -1 %s/*/*indels.vcf.gz|bcftools concat -f - -a|bcftools sort|bgziptabix %s/%s.indels.vcf.gz' %(out_path, args.output, args.prefix),error=True) - - if args.mode=='both': - final_logs+=run_cmd('ls -1 %s/*/*final.vcf.gz|bcftools concat -f - -a|bcftools sort|bgziptabix %s/%s.final.vcf.gz' %(out_path, args.output, args.prefix),error=True) - - - if 'ls: cannot access' in final_logs: - print('VARIANT CALLING FAILED. Please check any errors printed above, or in the log files here: %s.\n' %log_path, flush=True) - - elapsed=time.time()-t - - final_file_name={'snps_unphased':'snps','snps':'snps.phased', 'both':'final', 'indels':'indels'} - - final_output=os.path.join(args.output,'%s.%s.vcf.gz' %(args.prefix, final_file_name[args.mode])) - - print('\n\n-----------------------------------------------------------\n\n',) - print ('\n%s: Variant calling output can be found here: %s' %(str(datetime.datetime.now()), final_output)) - - print('\n%s: Printing variant calling summary statistics' %str(datetime.datetime.now())) - stats=run_cmd('rtg vcfstats %s' %final_output, output=True, verbose=True, error=True) - - print('\n%s: Saving variant calling summary statistics to: %s' %(str(datetime.datetime.now()), os.path.join(args.output,'%s.summary' %args.prefix))) - with open(os.path.join(args.output,'%s.summary' %args.prefix), 'w') as stats_file: - stats_file.write(stats) - - elapsed=time.time()-t - print ('\n%s: Total Time Elapsed: %.2f seconds' %(str(datetime.datetime.now()), elapsed), flush=True) diff --git a/README.md b/README.md index 0d3e96f..dc78159 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,8 @@ NanoCaller is a computational method that integrates long reads in deep convolut NanoCaller is distributed under the [MIT License by Wang Genomics Lab](https://wglab.mit-license.org/). ## Latest Updates +_**v3.0.0** (June 7 2022)_ : A major update in API with single entry point for running NanoCaller. Major changes in parallelization routine with GNU parallel no longer used for whole genome variant calling. + _**v2.0.0** (Feb 2 2022)_ : A major update in API and installation instructions, with release of bioconda recipe for NanoCaller. Added support for indel calling in case of poor or non-existent phasing. _**v1.0.0** (Aug 8 2021)_ : First post-production release with citeable DOI: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5176764.svg)](https://doi.org/10.5281/zenodo.5176764) @@ -13,8 +15,6 @@ _**v0.4.1** (Aug 3 2021)_ : Fixed a bug causing slower runtime in whole genome v _**v0.4.0** (June 2 2021)_ : Added NanoCaller models trained on ONT reads basecalled with Guppy v4.2.2 and Bonito v0.30, as well as R10.3 reads. Added new NanoCaller models trained with long CCS reads (15-20kb library selection). Improved indel calling with rolling window for candidate selection which helps with indels in low complexity regions. -## Citing NanoCaller -Please cite: Ahsan, M.U., Liu, Q., Fang, L. et al. NanoCaller for accurate detection of SNPs and indels in difficult-to-map regions from long-read sequencing by haplotype-aware deep neural networks. Genome Biol 22, 261 (2021). https://doi.org/10.1186/s13059-021-02472-2. ## Installation NanoCaller can be installed using Docker or Conda. The easiest way to install is from the bioconda channel: @@ -24,20 +24,20 @@ NanoCaller can be installed using Docker or Conda. The easiest way to install is or using Docker: ``` -VERSION="2.0.0" +VERSION="3.0.0" docker pull genomicslab/nanocaller:${VERSION} ``` Please refer to [Installation](docs/Install.md) for instructions regarding installing NanoCaller through other methods. ## Usage -General usage of NanoCaller is described in [Usage](docs/Usage.md). For a comprehensive case study of variant calling on Nanopore reads, see [ONT Case Study](docs/ONT%20Case%20Study.md), where we describe end-to-end variant calling pipeline for using NanoCaller, where we start with aligning FASTQ files of HG002, calls variants using NanoCaller, and evaluate performances on various genomic regions. +General usage of NanoCaller is described in [Usage](docs/Usage.md). Some quick usage examples: -## Example -An example of NanoCaller usage is provided in [sample](sample). The results are stored in [test output](sample/test_run) and were created using the following command: +- `NanoCaller --bam YOU_BAM --ref YOU_REF --cpu 10` will run NanoCaller on whole genome using 10 parallel processes. +- `NanoCaller --bam YOU_BAM --ref YOU_REF --cpu 10 --regions chr22:20000000-21000000 chr21` will NanoCaller on chr21 and chr22:20000000-21000000 only. +- `NanoCaller --bam YOU_BAM --ref YOU_REF --cpu 10 --mode snps` will only call SNPs. -`NanoCaller-bam HG002.nanopore.chr22.sample.bam -p ont -o test_run -chrom chr22 -start 20000000 -end 21000000 -ref chr22_ref.fa -cpu 4 > log` +For a comprehensive case study of variant calling on Nanopore reads, see [ONT Case Study](docs/ONT%20Case%20Study.md), where we describe end-to-end variant calling pipeline for using NanoCaller, where we start with aligning FASTQ files of HG002, calls variants using NanoCaller, and evaluate performances on various genomic regions. -which is also in the file [sample_call](sample/sample_call). This example should take about 10-15 minutes to run. ## Trained models Trained models for [ONT](https://github.com/WGLab/NanoCaller/tree/master/nanocaller_src/release_data/ONT_models) data, [CLR](https://github.com/WGLab/NanoCaller/tree/master/nanocaller_src/release_data/clr_models) data and [HIFI](https://github.com/WGLab/NanoCaller/tree/master/nanocaller_src/release_data/hifi_models) data can be found [here](https://github.com/WGLab/NanoCaller/tree/master/nanocaller_src/release_data). These models are trained on chr1-22 of the genomes stated below, unless mentioned othewise. @@ -79,3 +79,6 @@ You can specify SNP and indel models using `--snp_model` and `--indel_model` par | CCS-HG002 | PacBio CCS | HG002 | 56 | v4.2.1 | \- | | NanoCaller1 | ONT R9.4.1 | HG001 | 34 | v3.3.2 | Guppy2.3.8 | | NanoCaller3 | PacBio CCS | HG001 | 29 | v3.3.2 | \- | + +## Citing NanoCaller +Please cite: Ahsan, M.U., Liu, Q., Fang, L. et al. NanoCaller for accurate detection of SNPs and indels in difficult-to-map regions from long-read sequencing by haplotype-aware deep neural networks. Genome Biol 22, 261 (2021). https://doi.org/10.1186/s13059-021-02472-2. \ No newline at end of file diff --git a/docs/Install.md b/docs/Install.md index 5685dca..d7c2b0c 100644 --- a/docs/Install.md +++ b/docs/Install.md @@ -1,5 +1,5 @@ # Installation -There are three ways to install and run NanoCaller, via Docker, Singularity or conda. +There are three ways to install and run NanoCaller, via Docker, Singularity or Conda. NanoCaller has been developed and tested to work with Linux OS; we do not recommend using Windows or Mac OS. However, if you use Windows or Mac OS and have Docker installed on your machine, you can run NanoCaller inside a Docker container. NanoCaller does not require a GPU or any other special hardware to run. @@ -8,9 +8,6 @@ Please check the [NanoCaller Docker Hub repository](https://hub.docker.com/repos ## Conda Installation -You can install NanoCaller in conda using: -`conda install -c bioconda nanocaller` - If you do not have Anaconda, you will need to install it first. Here, we show how to install Miniconda, a minimal installation of Anaconda, which is much smaller and has a faster installation: ``` @@ -19,13 +16,28 @@ bash Miniconda3-latest-Linux-x86_64.sh ``` Go through all the prompts (installation in `$HOME` is recommended). The installation should take about 10 minutes, including the installation of Miniconda. +### Bioconda +You can install NanoCaller in conda using Bioconda recipe: +`conda install -c bioconda nanocaller` -It is recommened that you install NanoCaller in a new conda environment to avoid any package conflict, in the following way: +It is recommened that you install NanoCaller in a new conda environment to avoid any package conflict and use mamba for fast installation, in the following way: +``` +conda create -n nanocaller_env -c conda-forge mamba +conda activate nanocaller_env +mamba install -c bioconda nanocaller ``` -conda create -n nanocaller_env -c bioconda nanocaller + +### Manual Installation +You can obtain the latest NanoCaller version from github that has not yet been pushed to bioconda via manual installation. + +``` +git clone https://github.com/WGLab/NanoCaller.git +conda env create -f NanoCaller/environment.yml +chmod +x NanoCaller/NanoCaller conda activate nanocaller_env ``` +Then you can run NanoCaller using `PATH_TO_NANOCALLER_REPO/NanoCaller --help`. ## Docker Installation @@ -34,7 +46,7 @@ For instructions regarding Docker installation, please visit [Docker website](ht ### 1) via Docker Hub (preferred) You can pull NanoCaller docker images from Docker Hub by specifiying a version number. ``` -VERSION="2.0.0" +VERSION="3.0.0" docker run genomicslab/nanocaller:${VERSION} NanoCaller --help ``` @@ -44,13 +56,13 @@ If you want to build an image for the latest commit of NanoCaller Github reposit ``` git clone https://github.com/WGLab/NanoCaller.git docker build -t nanocaller NanoCaller -docker run nanocaller NanoCaller --help +docker run nanocaller NanoCaller --help ``` ## Singularity For instructions regarding Singularity installation, please visit [Singularity website] (https://sylabs.io/guides/3.7/user-guide/quick_start.html). ``` -VERSION="2.0.0" +VERSION="3.0.0" singularity pull docker://genomicslab/nanocaller:${VERSION} singularity exec -e --pwd /app nanocaller_${VERSION}.sif NanoCaller --help ``` \ No newline at end of file diff --git a/docs/ONT Case Study.md b/docs/ONT Case Study.md index d82a0bc..ce2a3e0 100644 --- a/docs/ONT Case Study.md +++ b/docs/ONT Case Study.md @@ -39,13 +39,13 @@ GM24385_2_Guppy_4.2.2_prom.fastq.gz GM24385_3_Guppy_4.2.2_prom.fastq.gz -t $CPU samtools index HG002.Guppy_4.2.2_prom.bam -@ $CPU # run nanocaller -VERSION=2.0.0 -docker run -it -v ${PWD}:'/mnt/' genomicslab/nanocaller:${VERSION} NanoCaller_WGS \ --bam /mnt/HG002.Guppy_4.2.2_prom.bam -ref /mnt/GRCh38.fa -prefix HG002 -p ont \ --o /mnt/calls -cpu $CPU --exclude_bed hg38 +VERSION=3.0.0 +docker run -it -v ${PWD}:'/mnt/' genomicslab/nanocaller:${VERSION} NanoCaller \ +--bam /mnt/HG002.Guppy_4.2.2_prom.bam --ref /mnt/GRCh38.fa --prefix HG002 --preset ont \ +--output /mnt/calls --cpu $CPU --exclude_bed hg38 --wgs_contigs chr1-22XY -# If you want to run NanoCaller without docker, run the following command `NanoCaller_WGS -bam HG002.Guppy_4.2.2_prom.bam -ref GRCh38.fa -prefix HG002 -p ont -o calls --exclude_bed hg38 -cpu $CPU` +# If you want to run NanoCaller without docker, run the following command `NanoCaller --bam HG002.Guppy_4.2.2_prom.bam --ref GRCh38.fa --prefix HG002 --preset ont --output calls --exclude_bed hg38 --cpu $CPU --wgs_contigs chr1-22XY` # run `conda install -c bioconda bedtools` to install bedtools to create BED files for variant calling evaluation in difficult-to-map genomic regions. diff --git a/docs/Usage.md b/docs/Usage.md index be1186e..b3ec335 100644 --- a/docs/Usage.md +++ b/docs/Usage.md @@ -1,407 +1,150 @@ # Usage -### Using Docker to run NanoCaller +## NanoCaller Modes -Please check the [NanoCaller Docker Hub repository](https://hub.docker.com/repository/docker/genomicslab/nanocaller) for the most up to date version of NanoCaller docker image. - -#### Whole Genome Variant Calling +NanoCaller can be run in three modes (`snps indels all`) that are specified using `--mode` parameter. +- `snps`: this is the fastest run option which only procudes SNP calls abd can additionally output phased VCF and BAM files using WhatsHap if `--phase` parameter is also used +- `indels`: in this mode, NanoCaller expects a phased BAM file with HP tags to label read haplotypes. This mode will produce only indels shorter than 50bp, and may also produce any SNP calls that are within complex indel blocks consiting of several small variants in a short distance. +- `all`: the default mode which runs `snps` mode, phases BAM file and then runs `indels` mode on phased BAM files. -For whole genome variant calling, or calling variants on several chromosomes, use `NanoCaller_WGS.py` to call variants. Assuming all your input files are in a folder `YOUR_INPUT_DIR`, and you want to use `YOUR_OUTPUT_DIR` to store the results. -``` -VERSION=0.4.0 -docker run \ --v 'YOUR_INPUT_DIR':'/input/' \ --v 'YOUR_WORKING_DIR':'/output/' \ -genomicslab/nanocaller:${VERSION} \ -NanoCaller_WGS \ --bam /input/YOUR_BAM \ --ref /input/YOUR_REF \ --o /output \ --cpu NUMBER_OF_CPUS_TO_USE --p PRESET -``` +## Running NanoCaller with Docker -#### Single Chromosome Variant Calling +Please check the [NanoCaller Docker Hub repository](https://hub.docker.com/repository/docker/genomicslab/nanocaller) for the most up to date version of NanoCaller docker image. -For calling variants on single chromosomes, use `NanoCaller.py` to call variants. Assuming all your input files are in a folder `YOUR_INPUT_DIR`, and you want to use `YOUR_OUTPUT_DIR` to store the results. +NanoCaller can call variants from whole genome or several chromosomes using a single command `NanoCaller`. Assuming all your input files are in a folder `YOUR_INPUT_DIR`, and you want to use `YOUR_OUTPUT_DIR` to store the results. You can use `--regions` or `--bed` to specify regions for variant calling, otherwise all contigs in the input BAM file will be used. ``` -VERSION=0.4.0 +VERSION=3.0.0 docker run \ -v 'YOUR_INPUT_DIR':'/input/' \ -v 'YOUR_WORKING_DIR':'/output/' \ genomicslab/nanocaller:${VERSION} \ NanoCaller \ --chrom CHROMOSOME \ --bam /input/YOUR_BAM \ --ref /input/YOUR_REF \ --o /output \ --cpu NUMBER_OF_CPUS_TO_USE --p PRESET +--bam /input/YOUR_BAM \ +--ref /input/YOUR_REF \ +--output /output \ +--cpu NUMBER_OF_CPUS_TO_USE +--preset PRESET ``` +## Running NanoCaller with Bioconda Installation +NanoCaller can call variants from whole genome or several chromosomes using a single command `NanoCaller`. Assuming all your input files are in a folder `YOUR_INPUT_DIR`, and you want to use `YOUR_OUTPUT_DIR` to store the results. You can use `--regions` or `--bed` to specify regions for variant calling, otherwise all contigs in the input BAM file will be used. -### Running NanoCaller without Docker -#### Whole Genome Variant Calling - -For whole genome variant calling, or calling variants on several chromosomes, use `NanoCaller_WGS.py` to call variants. Assuming NanoCaller respository is cloned and located at `PATH_TO_NANOCALLER_REPOSITORY` directory, run the following command. -``` -NanoCaller_WGS --bam YOUR_BAM \ --ref YOUR_REF \ --o OUTPUT_DIRECTORY \ --cpu NUMBER_OF_CPUS_TO_USE --p PRESET -``` - -Another way to run NanoCaller for whole genome variant calling is to use `NanoCaller` or `NanoCaller_WGS` on each chromosome separately by setting `chrom` argument. This option is suitable if you have a large computing cluster with a lot of computing resources. For instance, on a Sun Grid Engine, you can submit a separate job for each chromosome like this, using 16 CPUs per job: - -``` -for i in {1..22};do echo "NanoCaller --chrom chr$i --bam YOUR_BAM \ --ref YOUR_REF \ --o OUTPUT_DIRECTORY \ --cpu 16 --p PRESET" |qsub -V -cwd -pe smp 16 -N chr$i -e chr$i.log -o chr$i.log; done ``` - - - -#### Single Chromosome Variant Calling - -For calling variants on single chromosomes, use `NanoCaller` to call variants. - - Assuming NanoCaller respository is cloned and located at `PATH_TO_NANOCALLER_REPOSITORY` directory, run the following command. -``` -NanoCaller_WGS --chrom CHROMOSOME \ --bam YOUR_BAM \ --ref YOUR_REF \ --o OUTPUT_DIRECTORY \ --cpu NUMBER_OF_CPUS_TO_USE --p PRESET +NanoCaller \ +--bam YOUR_BAM \ +--ref YOUR_REF \ +--output OUTPUT_DIRECTORY \ +--cpu NUMBER_OF_CPUS_TO_USE +--preset PRESET ``` ## General Usage Options -### Single Chromosome Variant Calling ``` -usage: NanoCaller [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU] - [-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam] [-o OUTPUT] - [-prefix PREFIX] [-sample SAMPLE] [-include_bed INCLUDE_BED] - [-exclude_bed EXCLUDE_BED] [-start START] [-end END] - [-p PRESET] -bam BAM -ref REF -chrom CHROM - [-snp_model SNP_MODEL] [-min_allele_freq MIN_ALLELE_FREQ] - [-min_nbr_sites MIN_NBR_SITES] [-nbr_t NEIGHBOR_THRESHOLD] - [-sup] [-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD] - [-del_t DEL_THRESHOLD] [-win_size WIN_SIZE] - [-small_win_size SMALL_WIN_SIZE] [-impute_indel_phase] - [-phase_bam] [-enable_whatshap] +usage: NanoCaller [-h] [--mode {snps,indels,all}] [--sequencing {ont,ul_ont,ul_ont_extreme,pacbio}] [--cpu CPU] [--mincov MINCOV] [--maxcov MAXCOV] + [--suppress_progress_bar] [--output OUTPUT] [--prefix PREFIX] [--sample SAMPLE] [--regions [REGIONS [REGIONS ...]]] [--bed BED] + [--wgs_contigs {chr1-22XY,1-22XY}] [--exclude_bed {hg38,hg19,mm10,mm39}] [--preset {ont,ul_ont,ul_ont_extreme,ccs,clr}] --bam BAM --ref + REF [--snp_model SNP_MODEL] [--min_allele_freq MIN_ALLELE_FREQ] [--min_nbr_sites MIN_NBR_SITES] [--neighbor_threshold NEIGHBOR_THRESHOLD] + [--supplementary] [--indel_model INDEL_MODEL] [--ins_threshold INS_THRESHOLD] [--del_threshold DEL_THRESHOLD] [--win_size WIN_SIZE] + [--small_win_size SMALL_WIN_SIZE] [--impute_indel_phase] [--phase] [--enable_whatshap] optional arguments: -h, --help show this help message and exit Required Arguments: - -bam BAM, --bam BAM Bam file, should be phased if 'indel' mode is selected - (default: None) - -ref REF, --ref REF Reference genome file with .fai index (default: None) - -chrom CHROM, --chrom CHROM - Chromosome (default: None) + --bam BAM Bam file, should be phased if 'indel' mode is selected (default: None) + --ref REF Reference genome file with .fai index (default: None) Preset: - -p PRESET, --preset PRESET - Apply recommended preset values for SNP and Indel - calling parameters, options are 'ont', 'ul_ont', - 'ul_ont_extreme', 'ccs' and 'clr'. 'ont' works well - for any type of ONT sequencing datasets. However, use - 'ul_ont' if you have several ultra-long ONT reads up - to 100kbp long, and 'ul_ont_extreme' if you have - several ultra-long ONT reads up to 300kbp long. For - PacBio CCS (HiFi) and CLR reads, use 'ccs'and 'clr' - respectively. Presets are described in detail here: gi - thub.com/WGLab/NanoCaller/blob/master/docs/Usage.md#pr - eset-options. (default: None) + --preset {ont,ul_ont,ul_ont_extreme,ccs,clr} + Apply recommended preset values for SNP and Indel calling parameters. 'ont' works well for all types of ONT sequencing datasets. + However, use 'ul_ont' if you have several ultra-long ONT reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra- + long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and CLR reads, use 'ccs'and 'clr' respectively. Presets are described in + detail here: github.com/WGLab/NanoCaller/blob/master/docs/Usage.md#preset-options. (default: None) Configurations: - -mode MODE, --mode MODE - NanoCaller mode to run, options are 'snps', - 'snps_unphased', 'indels' and 'both'. 'snps_unphased' - mode quits NanoCaller without using WhatsHap for - phasing. (default: both) - -seq SEQUENCING, --sequencing SEQUENCING - Sequencing type, options are 'ont', 'ul_ont', - 'ul_ont_extreme', and 'pacbio'. 'ont' works well for - any type of ONT sequencing datasets. However, use - 'ul_ont' if you have several ultra-long ONT reads up - to 100kbp long, and 'ul_ont_extreme' if you have - several ultra-long ONT reads up to 300kbp long. For - PacBio CCS (HiFi) and CLR reads, use 'pacbio'. - (default: ont) - -cpu CPU, --cpu CPU Number of CPUs to use (default: 1) - -mincov MINCOV, --mincov MINCOV - Minimum coverage to call a variant (default: 8) - -maxcov MAXCOV, --maxcov MAXCOV - Maximum coverage of reads to use. If sequencing depth - at a candidate site exceeds maxcov then reads are - downsampled. (default: 160) + --mode {snps,indels,all} + NanoCaller mode to run. 'snps' mode quits NanoCaller without using WhatsHap for phasing. In this mode, if you want NanoCaller to + phase SNPs and BAM files, use --phase argument additionally. (default: all) + --sequencing {ont,ul_ont,ul_ont_extreme,pacbio} + Sequencing type. 'ont' works well for any type of ONT sequencing datasets. However, use 'ul_ont' if you have several ultra-long ONT + reads up to 100kbp long, and 'ul_ont_extreme' if you have several ultra-long ONT reads up to 300kbp long. For PacBio CCS (HiFi) and + CLR reads, use 'pacbio'. (default: ont) + --cpu CPU Number of CPUs to use. (default: 1) + --mincov MINCOV Minimum coverage to call a variant (default: 4) + --maxcov MAXCOV Maximum coverage of reads to use. If sequencing depth at a candidate site exceeds maxcov then reads are downsampled. (default: 160) + --suppress_progress_bar + Do not show progress bar. (default: False) Variant Calling Regions: - -include_bed INCLUDE_BED, --include_bed INCLUDE_BED - Only call variants inside the intervals specified in - the bgzipped and tabix indexed BED file. If any other - flags are used to specify a region, intersect the - region with intervals in the BED file, e.g. if -chom - chr1 -start 10000000 -end 20000000 flags are set, call - variants inside the intervals specified by the BED - file that overlap with chr1:10000000-20000000. Same - goes for the case when whole genome variant calling - flag is set. (default: None) - -exclude_bed EXCLUDE_BED, --exclude_bed EXCLUDE_BED - Path to bgzipped and tabix indexed BED file containing - intervals to ignore for variant calling. BED files of - centromere and telomere regions for the following - genomes are included in NanoCaller: hg38, hg19, mm10 - and mm39. To use these BED files use one of the - following options: ['hg38', 'hg19', 'mm10', 'mm39']. - (default: None) - -start START, --start START - start, default is 1 (default: None) - -end END, --end END end, default is the end of contig (default: None) - -SNP Calling: - -snp_model SNP_MODEL, --snp_model SNP_MODEL - NanoCaller SNP model to be used (default: ONT-HG002) - -min_allele_freq MIN_ALLELE_FREQ, --min_allele_freq MIN_ALLELE_FREQ - minimum alternative allele frequency (default: 0.15) - -min_nbr_sites MIN_NBR_SITES, --min_nbr_sites MIN_NBR_SITES - minimum number of nbr sites (default: 1) - -nbr_t NEIGHBOR_THRESHOLD, --neighbor_threshold NEIGHBOR_THRESHOLD - SNP neighboring site thresholds with lower and upper - bounds seperated by comma, for Nanopore reads - '0.4,0.6' is recommended, for PacBio CCS anc CLR reads - '0.3,0.7' and '0.3,0.6' are recommended respectively - (default: 0.4,0.6) - -sup, --supplementary - Use supplementary reads (default: False) + Use only one of these options to specify regions for variant calling: --regions or --bed option or --wgs_contigs. If none is provided then whole genome + variant calling is assumed and all contigs in the BAM file are used. -Indel Calling: - -indel_model INDEL_MODEL, --indel_model INDEL_MODEL - NanoCaller indel model to be used (default: ONT-HG002) - -ins_t INS_THRESHOLD, --ins_threshold INS_THRESHOLD - Insertion Threshold (default: 0.4) - -del_t DEL_THRESHOLD, --del_threshold DEL_THRESHOLD - Deletion Threshold (default: 0.6) - -win_size WIN_SIZE, --win_size WIN_SIZE - Size of the sliding window in which the number of - indels is counted to determine indel candidate site. - Only indels longer than 2bp are counted in this - window. Larger window size can increase recall, but - use a maximum of 50 only (default: 40) - -small_win_size SMALL_WIN_SIZE, --small_win_size SMALL_WIN_SIZE - Size of the sliding window in which indel frequency is - determined for small indels (default: 4) - -impute_indel_phase, --impute_indel_phase - Infer read phase by rudimentary allele clustering if - the no or insufficient phasing information is - available, can be useful for datasets without SNPs or - regions with poor phasing quality (default: False) - -Output Options: - -keep_bam, --keep_bam - Keep phased bam files. (default: False) - -o OUTPUT, --output OUTPUT - VCF output path, default is current working directory + --regions [REGIONS [REGIONS ...]] + A space/whitespace separated list of regions specified as "CONTIG_NAME" or "CONTIG_NAME:START-END". If you want to use + "CONTIG_NAME:START-END" format then specify both start and end coordinates. For example: chr3 chr6:28000000-35000000 chr22. (default: None) - -prefix PREFIX, --prefix PREFIX - VCF file prefix (default: variant_calls) - -sample SAMPLE, --sample SAMPLE - VCF file sample name (default: SAMPLE) - -Phasing: - -phase_bam, --phase_bam - Phase bam files if snps mode is selected. This will - phase bam file without indel calling. (default: False) - -enable_whatshap, --enable_whatshap - Allow WhatsHap to change SNP genotypes when phasing - using --distrust-genotypes and --include-homozygous - flags (this is not the same as regenotyping), - considerably increasing the time needed for phasing. - It has a negligible effect on SNP calling accuracy for - Nanopore reads, but may make a small improvement for - PacBio reads. By default WhatsHap will only phase SNP - calls produced by NanoCaller, but not change their - genotypes. (default: False) -``` - -### Whole Genome Variant Calling -``` -usage: NanoCaller_WGS [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU] - [-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam] - [-o OUTPUT] [-prefix PREFIX] [-sample SAMPLE] - [-chrom [CHROM [CHROM ...]]] [-include_bed INCLUDE_BED] - [-exclude_bed EXCLUDE_BED] - [-wgs_contigs_type WGS_CONTIGS_TYPE] [-p PRESET] -bam - BAM -ref REF [-snp_model SNP_MODEL] - [-min_allele_freq MIN_ALLELE_FREQ] - [-min_nbr_sites MIN_NBR_SITES] - [-nbr_t NEIGHBOR_THRESHOLD] [-sup] - [-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD] - [-del_t DEL_THRESHOLD] [-win_size WIN_SIZE] - [-small_win_size SMALL_WIN_SIZE] [-impute_indel_phase] - [-phase_bam] [-enable_whatshap] - -optional arguments: - -h, --help show this help message and exit - -Required Arguments: - -bam BAM, --bam BAM Bam file, should be phased if 'indel' mode is selected - (default: None) - -ref REF, --ref REF Reference genome file with .fai index (default: None) - -Preset: - -p PRESET, --preset PRESET - Apply recommended preset values for SNP and Indel - calling parameters, options are 'ont', 'ul_ont', - 'ul_ont_extreme', 'ccs' and 'clr'. 'ont' works well - for any type of ONT sequencing datasets. However, use - 'ul_ont' if you have several ultra-long ONT reads up - to 100kbp long, and 'ul_ont_extreme' if you have - several ultra-long ONT reads up to 300kbp long. For - PacBio CCS (HiFi) and CLR reads, use 'ccs'and 'clr' - respectively. Presets are described in detail here: gi - thub.com/WGLab/NanoCaller/blob/master/docs/Usage.md#pr - eset-options. (default: None) - -Configurations: - -mode MODE, --mode MODE - NanoCaller mode to run, options are 'snps', - 'snps_unphased', 'indels' and 'both'. 'snps_unphased' - mode quits NanoCaller without using WhatsHap for - phasing. (default: both) - -seq SEQUENCING, --sequencing SEQUENCING - Sequencing type, options are 'ont', 'ul_ont', - 'ul_ont_extreme', and 'pacbio'. 'ont' works well for - any type of ONT sequencing datasets. However, use - 'ul_ont' if you have several ultra-long ONT reads up - to 100kbp long, and 'ul_ont_extreme' if you have - several ultra-long ONT reads up to 300kbp long. For - PacBio CCS (HiFi) and CLR reads, use 'pacbio'. - (default: ont) - -cpu CPU, --cpu CPU Number of CPUs to use (default: 1) - -mincov MINCOV, --mincov MINCOV - Minimum coverage to call a variant (default: 8) - -maxcov MAXCOV, --maxcov MAXCOV - Maximum coverage of reads to use. If sequencing depth - at a candidate site exceeds maxcov then reads are - downsampled. (default: 160) - -Variant Calling Regions: - -chrom [CHROM [CHROM ...]], --chrom [CHROM [CHROM ...]] - A space/whitespace separated list of contigs, e.g. - chr3 chr6 chr22. (default: None) - -include_bed INCLUDE_BED, --include_bed INCLUDE_BED - Only call variants inside the intervals specified in - the bgzipped and tabix indexed BED file. If any other - flags are used to specify a region, intersect the - region with intervals in the BED file, e.g. if -chom - chr1 -start 10000000 -end 20000000 flags are set, call - variants inside the intervals specified by the BED - file that overlap with chr1:10000000-20000000. Same - goes for the case when whole genome variant calling - flag is set. (default: None) - -exclude_bed EXCLUDE_BED, --exclude_bed EXCLUDE_BED - Path to bgzipped and tabix indexed BED file containing - intervals to ignore for variant calling. BED files of - centromere and telomere regions for the following - genomes are included in NanoCaller: hg38, hg19, mm10 - and mm39. To use these BED files use one of the - following options: ['hg38', 'hg19', 'mm10', 'mm39']. - (default: None) - -wgs_contigs_type WGS_CONTIGS_TYPE, --wgs_contigs_type WGS_CONTIGS_TYPE - Options are "with_chr", "without_chr" and "all", - "with_chr" option will assume human genome and run - NanoCaller on chr1-22, "without_chr" will run on - chromosomes 1-22 if the BAM and reference genome files - use chromosome names without "chr". "all" option will - run NanoCaller on each contig present in reference - genome FASTA file. (default: with_chr) + --bed BED A BED file specifying regions for variant calling. (default: None) + --wgs_contigs {chr1-22XY,1-22XY} + Preset list of chromosomes to use for variant calling on human genomes. "chr1-22XY" option will assume human reference genome with + "chr" prefix present in the chromosome notation, and run NanoCaller on chr1 to chr22, chrX and chrY. "1-22XY" option will assume no + "chr" prefix is present in the chromosome notation and run NanoCaller on chromosomes 1-22, X and Y. (default: None) + --exclude_bed {hg38,hg19,mm10,mm39} + Path to bgzipped and tabix indexed BED file containing intervals to ignore for variant calling. BED files of centromere and + telomere regions for the following genomes are included in NanoCaller: hg38, hg19, mm10 and mm39. (default: None) SNP Calling: - -snp_model SNP_MODEL, --snp_model SNP_MODEL + --snp_model SNP_MODEL NanoCaller SNP model to be used (default: ONT-HG002) - -min_allele_freq MIN_ALLELE_FREQ, --min_allele_freq MIN_ALLELE_FREQ + --min_allele_freq MIN_ALLELE_FREQ minimum alternative allele frequency (default: 0.15) - -min_nbr_sites MIN_NBR_SITES, --min_nbr_sites MIN_NBR_SITES + --min_nbr_sites MIN_NBR_SITES minimum number of nbr sites (default: 1) - -nbr_t NEIGHBOR_THRESHOLD, --neighbor_threshold NEIGHBOR_THRESHOLD - SNP neighboring site thresholds with lower and upper - bounds seperated by comma, for Nanopore reads - '0.4,0.6' is recommended, for PacBio CCS anc CLR reads - '0.3,0.7' and '0.3,0.6' are recommended respectively - (default: 0.4,0.6) - -sup, --supplementary - Use supplementary reads (default: False) + --neighbor_threshold NEIGHBOR_THRESHOLD + SNP neighboring site thresholds with lower and upper bounds seperated by comma, for Nanopore reads '0.4,0.6' is recommended, for + PacBio CCS anc CLR reads '0.3,0.7' and '0.3,0.6' are recommended respectively (default: 0.4,0.6) + --supplementary Use supplementary reads, not fully supported at the moment (default: False) Indel Calling: - -indel_model INDEL_MODEL, --indel_model INDEL_MODEL + --indel_model INDEL_MODEL NanoCaller indel model to be used (default: ONT-HG002) - -ins_t INS_THRESHOLD, --ins_threshold INS_THRESHOLD + --ins_threshold INS_THRESHOLD Insertion Threshold (default: 0.4) - -del_t DEL_THRESHOLD, --del_threshold DEL_THRESHOLD + --del_threshold DEL_THRESHOLD Deletion Threshold (default: 0.6) - -win_size WIN_SIZE, --win_size WIN_SIZE - Size of the sliding window in which the number of - indels is counted to determine indel candidate site. - Only indels longer than 2bp are counted in this - window. Larger window size can increase recall, but - use a maximum of 50 only (default: 40) - -small_win_size SMALL_WIN_SIZE, --small_win_size SMALL_WIN_SIZE - Size of the sliding window in which indel frequency is - determined for small indels (default: 4) - -impute_indel_phase, --impute_indel_phase - Infer read phase by rudimentary allele clustering if - the no or insufficient phasing information is - available, can be useful for datasets without SNPs or - regions with poor phasing quality (default: False) + --win_size WIN_SIZE Size of the sliding window in which the number of indels is counted to determine indel candidate site. Only indels longer than 2bp + are counted in this window. Larger window size can increase recall, but use a maximum of 50 only (default: 40) + --small_win_size SMALL_WIN_SIZE + Size of the sliding window in which indel frequency is determined for small indels (default: 4) + --impute_indel_phase Infer read phase by rudimentary allele clustering if the no or insufficient phasing information is available, can be useful for + datasets without SNPs or regions with poor phasing quality (default: False) Output Options: - -keep_bam, --keep_bam - Keep phased bam files. (default: False) - -o OUTPUT, --output OUTPUT - VCF output path, default is current working directory - (default: None) - -prefix PREFIX, --prefix PREFIX - VCF file prefix (default: variant_calls) - -sample SAMPLE, --sample SAMPLE - VCF file sample name (default: SAMPLE) + --output OUTPUT VCF output path, default is current working directory (default: None) + --prefix PREFIX VCF file prefix (default: variant_calls) + --sample SAMPLE VCF file sample name (default: SAMPLE) Phasing: - -phase_bam, --phase_bam - Phase bam files if snps mode is selected. This will - phase bam file without indel calling. (default: False) - -enable_whatshap, --enable_whatshap - Allow WhatsHap to change SNP genotypes when phasing - using --distrust-genotypes and --include-homozygous - flags (this is not the same as regenotyping), - considerably increasing the time needed for phasing. - It has a negligible effect on SNP calling accuracy for - Nanopore reads, but may make a small improvement for - PacBio reads. By default WhatsHap will only phase SNP - calls produced by NanoCaller, but not change their - genotypes. (default: False) - + --phase Phase SNPs and BAM files if snps mode is selected. (default: False) + --enable_whatshap Allow WhatsHap to change SNP genotypes when phasing using --distrust-genotypes and --include-homozygous flags (this is not the same + as regenotyping), increasing the time needed for phasing. It has a negligible effect on SNP calling accuracy for Nanopore reads, + but may make a small improvement for PacBio reads. By default WhatsHap will only phase SNP calls produced by NanoCaller, but not + change their genotypes. (default: False) ``` + ## Understanding NanoCaller Output + Depending upon which mode is run, NanoCaller will produce the following files: -- PREFIX.snps.vcf.gz contains unphased SNP calls made by NanoCaller using a deep learning model. NanoCaller modes that produce this file are: `snps_unphased`, `snps` and `both`. -- PREFIX.snps.phased.vcf.gz contains SNP calls from PREFIX.snps.vcf.gz that are phase with WhatsHap. By default they have the same genotype as in the PREFIX.snps.vcf.gz file, unless `--enable_whatshap` flag is set which can allow WhatsHap to change genotypes. NanoCaller modes that produce this file are: `snps` and `both`. -- PREFIX.indels.vcf.gz contains indel calls made by NanoCaller using multiple sequence alignment. Some of these calls might be indels combined with nearby substitutions or multi-nucleotide substitutions. NanoCaller modes that produce this file are: `indels` and `both`. -- PREFIX.final.vcf.gz contains SNP calls from PREFIX.snps.phased.vcf.gz and indel calls from PREFIX.indels.vcf.gz. NanoCaller mode that produce this file is: `both`. +- PREFIX.snps.vcf.gz contains unphased SNP calls made by NanoCaller using a deep learning model. +- PREFIX.snps.phased.vcf.gz contains SNP calls from PREFIX.snps.vcf.gz that are phased with WhatsHap if `all` mode is selected or `--phase` option was selected with `snps` mode. By default SNP calls in this file have the same genotype as in PREFIX.snps.vcf.gz file, unless `--enable_whatshap` flag is set which can allow WhatsHap to change genotypes. +- PREFIX.indels.vcf.gz contains indel calls made by NanoCaller using multiple sequence alignment. Some of these indel calls might be combined with nearby substitutions or multi-nucleotide substitutions. +- PREFIX.vcf.gz contains SNP calls from PREFIX.snps.phased.vcf.gz and indel calls from PREFIX.indels.vcf.gz. +- intermediate_phase_files directory will contain phased BAM files if `all` mode is selected or `--phase` option was selected with `snps` mode. ## Preset Options -Users can select recommended NanoCaller settings for various sequencing types by using preset option `-p` or `--preset`. There are five presets, which are described below with their equivalent parameter settings. Any parameters included in the presets can be overwritten by specifying the parameter. +Users can select recommended NanoCaller settings for various sequencing types by using preset option `--preset`. There are five presets, which are described below with their equivalent parameter settings. Any parameters included in the presets can be overwritten by specifying the parameter. ### ONT reads Options are: `ont`, `ul_ont` and `ul_ont_extreme`. These presets use ONT trained models for SNP and indell calling, and only differ in `--sequencing` argument. With `ont`, `ul_ont` and `ul_ont_extreme` options, NanoCaller uses haplotype information from up to 50kbp, 100kbp and 300kbp bases away, respectively, for SNP calling. There is no difference in indel calling between these options. Therefore, use `ul_ont` if you have a significant number of reads that are up to 100kbp long, and `ul_ont_extreme` if you have a significant number of reads that are up to 300kbp long. @@ -470,12 +213,7 @@ Preset `clr` is equivalent to: Some important options to keep in mind when using NanoCaller: - `--sequencing` argument is important to set because NanoCaller has slightly different settings for generating inputs for each type of sequencing. If you use a preset option `--preset`, then `--sequencing` is set automatically. - `--exclude_bed` argument can be used to speed up NanoCaller. For instance, by setting it to `hg38`, you can tell NanoCaller to skip centromere and telomere regions which have incredibly high number of candidate variants due to poor alignment. -- `--mode` argument can be used to select which types of variants you want to call. The fastest option is `snps_unphased` which only makes SNP calls. -- `--phase_bam` flag can be set to phase reads if you are only interested in calling SNPs, and phasing input BAM file with WhatsHap. This argument only works when you use `--mode snps`, and it is recommended to use it with NanoCaller's single variant calling routine using `NanoCaller.py` to phase reads from the entire chromosome together. If you use `--mode both` then reads are automatically phased for indel calling and you can select to keep phased BAM files them using `--keep_bam`. - `-nbr_t` option is sensitive to sequencing type so choose this accordingly. - `-ins_t` and `del_t` are insertion and deletion frequency thresholds are per haplotype. Default values are slightly higher due to high error in ONT reads, but these thresholds can be lowered for CCS reads. - CLR reads have incredibly low insertion and deletion freqencies in a pileup due highly variable placement of indels by aligners. To detect indels on CLR reads, you would need to set low frequency threhsolds, which leads to a huge increase in runtime. -- `--keep_bam` flag can be used to save phased BAM files created by NanoCaller for indel calling. By default, we delete these BAM files in order to not use up too much storage. -- `--enable_whatshap` flag can improve SNP calling performance for PacBio reads. - -We recommend using `NanoCaller_WGS.py` for whole genome variant calling, and `NanoCaller.py` for single chromosome variant calling, although `NanoCaller_WGS.py` can also be used for single chromosome. `NanoCaller_WGS.py` breaks genome into 10Mb chunks, uses GNU parallel to run `NanoCaller.py` on each chunk independently using 1 CPU, and then combines the results at the end. We do not break genome into chunks smaller than 10Mb so that phasing can be done accurately. Lets say you can use 20 CPUs, and if you use `NanoCaller_WGS.py` for whole human genome of 3000Mb, NanoCaller will create 300 jobs for 20 CPUs to run. But if you use it for a chromosome of size 100Mb, then only 10 jobs will be created for 20 CPUs, and using more than 10 CPUs is not going to improve runtime and the rest of the CPUs will be left idle. If you run `NanoCaller.py` on the same 100Mb chromosome with 20 CPUs, `NanoCaller.py` will use python's multiprocessing module to generate features for SNPs in chunks of 200Kb using 20 CPUs by running 500 jobs, use only a single CPU to run WhatsHap for phasing the entire chromosome, followed by generating features for indels in chunks of 50Kb using 20 CPUs by running 2000 jobs. This method is more aggressive in terms of utilizing computing resources by breaking the chromosome into very small chunks for feature generation, creates fewer intermediate files and folder, but can have bottleneck issues with using WhatsHap with 1 CPU only. +- `--enable_whatshap` flag can improve SNP calling performance for PacBio reads. \ No newline at end of file diff --git a/environment.yml b/environment.yml index 8b4554e..f56b981 100644 --- a/environment.yml +++ b/environment.yml @@ -8,12 +8,12 @@ dependencies: - biopython - muscle=3.8 - numpy - - parallel - pysam - python>=3.8 - rtg-tools - - samtools=1.10 + - samtools>=1.10 - tensorflow>=2.2 - - whatshap=1.0 + - whatshap>=1.0 - vcflib - intervaltree=3.* + - tqdm \ No newline at end of file diff --git a/nanocaller_src/generate_SNP_pileups.py b/nanocaller_src/generate_SNP_pileups.py index f674d06..1f62ce0 100644 --- a/nanocaller_src/generate_SNP_pileups.py +++ b/nanocaller_src/generate_SNP_pileups.py @@ -1,8 +1,6 @@ -import sys, pysam, time, os, copy, argparse, random +import pysam, random from collections import Counter import numpy as np -import multiprocessing as mp -from pysam import VariantFile from intervaltree import Interval, IntervalTree def get_cnd_pos(v_pos,cnd_pos, seq='ont'): @@ -86,68 +84,15 @@ def get_cnd_pos(v_pos,cnd_pos, seq='ont'): return ls_total_1, ls_total_2 -def get_nbr(dct): - base_to_num_map={'*':4,'A':0,'G':1,'T':2,'C':3,'N':4} - - chrom=dct['chrom'] - start=max(dct['start']-50000,1) - end=dct['end']+50000 - - sam_path=dct['sam_path'] - fasta_path=dct['fasta_path'] - samfile = pysam.Samfile(sam_path, "rb") - fastafile=pysam.FastaFile(fasta_path) - - rlist=[s for s in fastafile.fetch(chrom,start-1,end-1)] - output_seq={} - - if dct['supplementary']: - flag=0x4|0x100|0x200|0x400 - else: - flag=0x4|0x100|0x200|0x400|0x800 - - for pcol in samfile.pileup(chrom,start-1,end-1,min_base_quality=0, flag_filter=flag,truncate=True): - n=pcol.get_num_aligned() - r=rlist[pcol.pos+1-start] - - if r in 'AGTC' and n>=dct['mincov']: - seq=''.join([x[0] for x in pcol.get_query_sequences( mark_matches=False, mark_ends=False,add_indels=True)]).upper() - alt_freq=max([x[1] for x in Counter(seq).items() if (x[0]!=r and x[0] in 'AGTC')]+[0])/n - - if dct['threshold'][0]<=alt_freq and alt_freq=dct['mincov'] and pcol.pos+1>=start and pcol.pos+1<=end: - alt_freq=max([x[1] for x in Counter(seq).items() if (x[0]!=r and x[0] in 'AGTC')]+[0])/n - - if dct['min_allele_freq']<=alt_freq: - pileup_dict[pcol.pos+1]={n:base_to_num_map[s] for (n,s) in zip(name,seq)} - output[pcol.pos+1]=(n,alt_freq) + + alt_freq=max([x[1] for x in Counter(seq).items() if (x[0]!=r and x[0] in 'AGTC')]+[0])/n + + + + if n>=dct['mincov']: + # add to neighbor site list + if dct['threshold'][0]<=alt_freq and alt_freq=start and v_pos<=end and dct['min_allele_freq']<=alt_freq: + if v_pos not in pileup_dict: + pileup_dict[v_pos]={n:base_to_num_map[s] for (n,s) in zip(name,seq)} + output[v_pos]=(n,alt_freq) + + nbr_sites=np.array(nbr_sites) + pileup_list=[] pos_list=output.keys() @@ -212,7 +167,7 @@ def ex_bed(tree, pos): for v_pos in pos_list: np.random.seed(812) - ls_total_1, ls_total_2 = get_cnd_pos(v_pos, cnd_pos, dct['seq']) + ls_total_1, ls_total_2 = get_cnd_pos(v_pos, nbr_sites, dct['seq']) nbr_dict={} @@ -231,7 +186,7 @@ def ex_bed(tree, pos): for i,name in enumerate(sample): for j,nb_pos in enumerate(ls_total_1): try: - tmp_mat[i][j]=cnd_seq[nb_pos][name] + tmp_mat[i][j]=pileup_dict[nb_pos][name] except KeyError: pass @@ -240,11 +195,11 @@ def ex_bed(tree, pos): for j,nb_pos in enumerate(ls_total_2): try: - tmp_mat[i][j +len(ls_total_1)+1]=cnd_seq[nb_pos][name] + tmp_mat[i][j +len(ls_total_1)+1]=pileup_dict[nb_pos][name] except KeyError: pass for nb_pos in ls_total_1+ls_total_2: - rlist.append((nb_pos, cnd_seq[nb_pos]['ref'])) + rlist.append((nb_pos, base_to_num_map[ref_dict[nb_pos]])) rlist=sorted(rlist, key=lambda x:x[0]) total_rlist=np.array([x[1] for x in rlist]) diff --git a/nanocaller_src/generate_indel_pileups.py b/nanocaller_src/generate_indel_pileups.py index 12bbb70..4903002 100644 --- a/nanocaller_src/generate_indel_pileups.py +++ b/nanocaller_src/generate_indel_pileups.py @@ -3,16 +3,11 @@ import multiprocessing as mp from pysam import VariantFile from subprocess import Popen, PIPE, STDOUT -from Bio import pairwise2 from intervaltree import Interval, IntervalTree import collections +import parasail - - -def pairwise(x,y): - alignments = pairwise2.align.globalms(x, y, 2, -1.0, -0.9, -0.1) - - return alignments +sub_mat=parasail.matrix_create('AGTC',20,-10) def msa(seq_list, ref, v_pos, mincov, maxcov): mapping={'A':0,'G':1,'T':2,'C':3,'-':4} @@ -66,64 +61,72 @@ def msa(seq_list, ref, v_pos, mincov, maxcov): tmp_mat=np.copy(alt_mat) tmp_mat[:,4]=tmp_mat[:,4]-0.01 - cns=''.join([rev_base_map[x] for x in np.argmax(tmp_mat,axis=1)]) + cns=''.join([rev_base_map[x] for x in np.argmax(tmp_mat,axis=1)]).replace('-','') ref_seq=ref_real_0.replace('-','') alt_mat-=ref_real_0_mat - final_mat=np.dstack([alt_mat, ref_real_0_mat])[:128,:,:] - return (1,1,final_mat.transpose(1,0,2),cns,ref_seq) + + final_mat=np.dstack([alt_mat, ref_real_0_mat])[:128,:,:].transpose(1,0,2) + if final_mat.shape[1]<128: + final_mat=np.hstack((final_mat,np.zeros((5, 128-final_mat.shape[1], 2)))) + + return (1,1,final_mat,cns,ref_seq) -def allele_prediction(alt,ref_seq,max_range): - res=pairwise(alt.replace('-',''),ref_seq.replace('-','')) - try: - flag=False - best=None - score=1e6 - for pair in res: - current=sum([m.start() for m in re.finditer('-', pair[0])])+sum([m.start() for m in re.finditer('-', pair[1])]) - if current =max_range+10 and flag==False: - break - if ref_new[i]=='-' or alt_new[i]=='-': - flag=True - if ref_new[i:i+10]==alt_new[i:i+10] and flag and ref_count>=max_range+10: - break - i+=1 - s2=alt_new[:i].replace('-','') - s1=ref_new[:i].replace('-','') +def allele_prediction(alt, ref_seq, max_range): + global sub_mat + cigar=parasail.nw_trace(alt, ref_seq, 9, 1, sub_mat).cigar + cigar_op=[(x & 0x7, x >> 4) for x in cigar.seq] + indel=False + ref_cnt=[0]*10 + alt_cnt=[0]*10 - s1=s1[0]+'.'+s1[1:]+'|' - s2=s2[0]+'.'+s2[1:]+'|' - except IndexError: - return (None,None) - if s1==s2: - return (None,None) - i=-1 + mis_match_cnt_before_indel=False + mis_match_cnt_after_indel=(0, 0) + for op, cnt in cigar_op: + if op==8 or op==7: + ref_cnt[op]+=cnt + alt_cnt[op]+=cnt + + if indel: + mis_match_cnt_after_indel[op-7]+=cnt + else: + mis_match_cnt_before_indel=True + + if op==1: + alt_cnt[op]+=cnt + mis_match_cnt_after_indel=[0, 0] + indel=True - while s1[i]==s2[i] and s1[i]!='.' and s2[i]!='.': - i-=1 - - ref_out=s1[:i+1].replace('.','') - allele_out=s2[:i+1].replace('.','') + if op==2: + ref_cnt[op]+=cnt + mis_match_cnt_after_indel=[0, 0] + indel=True + + if indel==False and sum(ref_cnt)>=max_range+10: + if ref_cnt[8]: + out_len=sum(ref_cnt) if op==8 else sum(ref_cnt)-cnt + return ref_seq[:out_len], alt[:out_len] + else: + return (None, None) + + + if indel==True: + if sum(mis_match_cnt_after_indel)>20: + break + + ref_out_len=sum(ref_cnt) if op==8 else sum(ref_cnt)-cnt + alt_out_len=sum(alt_cnt) if op==8 else sum(alt_cnt)-cnt - return (ref_out,allele_out) + if not mis_match_cnt_before_indel: + ref_out_len+=1 + alt_out_len+=1 + + return ref_seq[:ref_out_len], alt[:alt_out_len] -def get_indel_testing_candidates(dct): +def get_indel_testing_candidates(dct, chunk): mapping={'A':0,'G':1,'T':2,'C':3,'-':4} rev_base_map={0:'A',1:'G',2:'T',3:'C',4:'-'} @@ -135,10 +138,11 @@ def get_indel_testing_candidates(dct): if dct['seq']=='pacbio': window_after=260 - chrom=dct['chrom'] - start=dct['start'] - end=dct['end'] - sam_path=dct['sam_path'] + chrom=chunk['chrom'] + start=chunk['start'] + end=chunk['end'] + + sam_path=chunk['sam_path'] fasta_path=dct['fasta_path'] samfile = pysam.Samfile(sam_path, "rb") fastafile=pysam.FastaFile(fasta_path) @@ -149,27 +153,8 @@ def get_indel_testing_candidates(dct): mincov,maxcov=dct['mincov'],dct['maxcov'] ins_t,del_t=dct['ins_t'],dct['del_t'] - include_intervals, exclude_intervals=None, None + exclude_intervals = None - if dct['include_bed']: - tbx = pysam.TabixFile(dct['include_bed']) - include_intervals=IntervalTree(Interval(int(row[1]), int(row[2]), "%s" % (row[1])) for row in tbx.fetch(chrom, parser=pysam.asBed())) - - def in_bed(tree,pos): - return tree.overlaps(pos) - - include_intervals=IntervalTree(include_intervals.overlap(start,end)) - - if not include_intervals: - return [],[],[],[],[] - - else: - start=max(start, min(x[0] for x in include_intervals)) - end=min(end, max(x[1] for x in include_intervals)) - - else: - def in_bed(tree, pos): - return True if dct['exclude_bed']: tbx = pysam.TabixFile(dct['exclude_bed']) @@ -186,15 +171,18 @@ def ex_bed(tree, pos): def ex_bed(tree, pos): return False - ref_dict={j:s.upper() if s in 'AGTC' else '' for j,s in zip(range(max(1,start-200),end+400+1),fastafile.fetch(chrom,max(1,start-200)-1,end+400)) } + ref_dict={j:s.upper() if s in 'AGTC' else 'N' for j,s in zip(range(max(1,start-200),end+400+1),fastafile.fetch(chrom,max(1,start-200)-1,end+400)) } chrom_length=fastafile.get_reference_length(chrom) hap_dict={1:[],2:[]} - - for pread in samfile.fetch(chrom, max(0,dct['start']-100000), dct['end']+1000): + phase_dict={} + for pread in samfile.fetch(chrom, max(0, start-100000), end + 1000): if pread.has_tag('HP'): + phase_dict[pread.qname]=pread.get_tag('PS') hap_dict[pread.get_tag('HP')].append(pread.qname) + else: + phase_dict[pread.qname]=None hap_reads_0=set(hap_dict[1]) hap_reads_1=set(hap_dict[2]) @@ -204,7 +192,7 @@ def ex_bed(tree, pos): else: flag=0x4|0x100|0x200|0x400|0x800 - output_pos,output_data_0,output_data_1,output_data_total,alleles=[],[],[],[],[] + output_pos,output_data_0,output_data_1,output_data_total,alleles, output_phase=[], [], [], [], [], [] del_queue_0,del_queue_1=collections.deque(window_size*[set()], window_size),collections.deque(window_size*[set()], window_size) ins_queue_0,ins_queue_1=collections.deque(window_size*[set()], window_size),collections.deque(window_size*[set()], window_size) @@ -226,7 +214,7 @@ def ex_bed(tree, pos): flag_filter=flag,truncate=True): v_pos=pcol.pos+1 - if in_bed(include_intervals, v_pos) and not ex_bed(exclude_intervals, v_pos): + if not ex_bed(exclude_intervals, v_pos): read_names=pcol.get_query_names() read_names_0=set(read_names) & hap_reads_0 read_names_1=set(read_names) & hap_reads_1 @@ -336,7 +324,9 @@ def ex_bed(tree, pos): ref=''.join([ref_dict[p] for p in range(v_pos-window_before,min(chrom_length,v_pos+window_after+1))]) - + if 'N' in ref: + continue + for pread in pcol.pileups: dt=pread.alignment.query_sequence[max(0,pread.query_position_or_next-window_before):pread.query_position_or_next+window_after] d_tot[pread.alignment.qname]=dt @@ -362,6 +352,7 @@ def ex_bed(tree, pos): output_data_0.append(data_0) output_data_1.append(data_1) output_data_total.append(data_total) + output_phase.append(phase_dict[next(iter(d['hap0'].keys()))]) tp=max_range[variants[v_pos]] @@ -369,13 +360,12 @@ def ex_bed(tree, pos): allele_prediction(alt_1, ref_seq_1, max_range[variants[v_pos]]), \ allele_prediction(alt_total, ref_seq_total, max_range[variants[v_pos]])]) - + if len(output_pos)==0: - return (output_pos,output_data_0,output_data_1,output_data_total,alleles) - - output_pos=np.array(output_pos) + return (output_pos, output_data_0, output_data_1, output_data_total, alleles, output_phase) + output_data_0=np.array(output_data_0) output_data_1=np.array(output_data_1) output_data_total=np.array(output_data_total) - - return (output_pos,output_data_0,output_data_1,output_data_total,alleles) \ No newline at end of file + + return (output_pos, output_data_0, output_data_1, output_data_total, alleles, output_phase) \ No newline at end of file diff --git a/nanocaller_src/indelCaller.py b/nanocaller_src/indelCaller.py index 0917b31..847a4dc 100644 --- a/nanocaller_src/indelCaller.py +++ b/nanocaller_src/indelCaller.py @@ -1,12 +1,13 @@ -import time,os,copy,argparse,subprocess, datetime - +import time, os, datetime, queue +from tqdm import tqdm import numpy as np import multiprocessing as mp import tensorflow as tf from .model_architect_indel import * -from Bio import pairwise2 from .generate_indel_pileups import get_indel_testing_candidates from .utils import * +from multiprocessing import current_process + rev_gt_map={0:'hom-ref', 1:'hom-alt', 2:'het-ref', 3:'het-alt'} rev_base_map={0:'A',1:'G',2:'T',3:'C',4:'-'} @@ -33,148 +34,292 @@ def get_indel_model(indel_model): return indel_model_path -def test_model(params,pool): - print('%s: Indel calling started.' %(str(datetime.datetime.now())), flush=True) - - rev_allele_map={0:'N',1:'D',2:'I'} - - vcf_path,prefix=params['vcf_path'], params['prefix'] - chrom,start,end=params['chrom'], params['start'],params['end'] + +def indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list): + cur_p = current_process() + curr_vcf_path=os.path.join(params['intermediate_indel_files_dir'],'%s.%d.indel.vcf' %(params['prefix'], cur_p._identity[0])) + indel_files_list.append(curr_vcf_path) model_path=get_indel_model(params['indel_model']) - if model_path==None: print('Invalid indel model name or path', flush=True) - return + sys.exit(1) indel_model=Indel_model() - indel_model.load_weights(model_path) + indel_model.load_weights(model_path).expect_partial() batch_size=100 - neg_file=open(os.path.join(vcf_path,'%s.indel_stats' %prefix),'w') - neg_file.write('pos,ref\n') + with open(curr_vcf_path,'w') as f: + while len(indel_dict)>0 or not job_Q.empty(): + try: + job=job_Q.get(block=False) + chunk=job[1] + chrom=chunk['chrom'] + pos, x0_test, x1_test, x2_test, alleles_seq, phase = get_indel_testing_candidates(params, chunk) + + prev=0 + prev_len=0 + + if len(pos)>0: + for batch in range(int(np.ceil(len(x0_test)/batch_size))): + batch_pos = pos[batch*batch_size:min((batch+1)*batch_size,len(pos))] + + batch_x0 = x0_test[batch*batch_size:min((batch+1)*batch_size,len(x0_test))] + batch_x1 = x1_test[batch*batch_size:min((batch+1)*batch_size,len(x1_test))] + batch_x2 = x2_test[batch*batch_size:min((batch+1)*batch_size,len(x2_test))] + batch_alleles_seq = alleles_seq[batch*batch_size:min((batch+1)*batch_size,len(alleles_seq))] + batch_phase = phase[batch*batch_size:min((batch+1)*batch_size,len(phase))] + + batch_x_all=np.hstack([batch_x0, batch_x1, batch_x2]) + + batch_prob_all= indel_model(batch_x_all) + + batch_pred_all=np.argmax(batch_prob_all,axis=1) + + qual_all=-10*np.log10(1e-6+1-batch_prob_all) + + for j in range(len(batch_pred_all)): + q=min(999,qual_all[j,batch_pred_all[j]]) + if batch_pos[j]>prev: + + if batch_prob_all[j,0]<=0.95: + + q=-100*np.log10(1e-6+batch_prob_all[j,0]) + allele0_data, allele1_data,allele_total_data= batch_alleles_seq[j] + + if batch_pred_all[j]==1 and allele_total_data[0]: + gq=-100*np.log10(1+1e-6-batch_prob_all[j,1]) + s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1/1:%.2f\n' %(chrom, batch_pos[j], allele_total_data[0], allele_total_data[1], q, gq) + f.write(s) + prev=batch_pos[j]+max(len(allele_total_data[0]), len(allele_total_data[1])) + + else: + if allele0_data[0] and allele1_data[0]: + + if allele0_data[0]==allele1_data[0] and allele0_data[1]==allele1_data[1]: + gq=-100*np.log10(1+1e-6-batch_prob_all[j,1]) + s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1/1:%.2f\n' %(chrom, batch_pos[j], allele0_data[0], allele0_data[1], q, gq) + f.write(s) + prev=batch_pos[j]+max(len(allele0_data[0]), len(allele0_data[1])) + + else: + ref1,alt1=allele0_data + ref2,alt2=allele1_data + + l=min(len(ref1),len(ref2)) + + if len(ref1)>len(ref2): + ref=ref1 + alt2=alt2+ref1[l:] + + else: + ref=ref2 + alt1=alt1+ref2[l:] + gq=-100*np.log10(1+1e-6-batch_prob_all[j,3]) + if batch_phase[j]: + s='%s\t%d\t.\t%s\t%s,%s\t%.2f\tPASS\t.\tGT:GQ:PS\t1|2:%.2f:%d\n' %(chrom, batch_pos[j], ref, alt1, alt2, q, gq, batch_phase[j]) + else: + s='%s\t%d\t.\t%s\t%s,%s\t%.2f\tPASS\t.\tGT:GQ\t1|2:%.2f\n' %(chrom, batch_pos[j], ref, alt1, alt2, q, gq) + f.write(s) + prev=batch_pos[j]+max(len(ref), len(alt1),len(alt2)) + + elif allele0_data[0]: + gq=-100*np.log10(1+1e-6-batch_prob_all[j,2]) + if batch_phase[j]: + s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ:PS\t0|1:%.2f:%d\n' %(chrom, batch_pos[j], allele0_data[0], allele0_data[1], q, gq, batch_phase[j]) + else: + s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t0|1:%.2f\n' %(chrom, batch_pos[j], allele0_data[0], allele0_data[1], q, gq) + f.write(s) + prev=batch_pos[j]+max(len(allele0_data[0]), len(allele0_data[1])) + + elif allele1_data[0]: + gq=-100*np.log10(1+1e-6-batch_prob_all[j,2]) + if batch_phase[j]: + s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ:PS\t1|0:%.2f:%d\n' %(chrom, batch_pos[j], allele1_data[0], allele1_data[1], q, gq, batch_phase[j]) + else: + s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1|0:%.2f\n' %(chrom, batch_pos[j], allele1_data[0], allele1_data[1], q, gq) + f.write(s) + prev=batch_pos[j]+max(len(allele1_data[0]), len(allele1_data[1])) + + batch_x0,batch_x1,batch_x2,batch_x=None,None,None,None + pos, x0_test, x1_test, x2_test=None,None,None,None + + f.flush() + os.fsync(f.fileno()) + counter_Q.put(1) + + except queue.Empty: + continue + +def phase_run(contig_dict, params, indel_dict, job_Q, counter_Q, phased_snp_files_list): + contig = contig_dict['name'] + start, end = contig_dict['start'], contig_dict['end'] + phase_dir = params['intermediate_phase_files_dir'] + input_snp_vcf = params['snp_vcf'] + input_contig_vcf = os.path.join(phase_dir, '%s.snps.unphased.vcf' %contig) + raw_output_contig_vcf = os.path.join(phase_dir, '%s.snps.phased.raw.vcf' %contig) + output_contig_vcf = os.path.join(phase_dir, '%s.snps.phased.vcf.gz' %contig) + phased_bam = os.path.join(phase_dir, '%s.phased.bam' %contig) + phased_snp_files_list.append(output_contig_vcf) - outfile=os.path.join(vcf_path,'%s.indels' %prefix) + enable_whatshap = '--distrust-genotypes --include-homozygous' if params['enable_whatshap'] else '' - with open('%s.raw.vcf' %outfile,'w') as f: + # extract region from VCF + run_cmd('bcftools view %s -r %s -o %s' %(input_snp_vcf, contig, input_contig_vcf)) + + #phase VCF + run_cmd("whatshap phase %s %s -o %s -r %s --ignore-read-groups --chromosome %s %s" %(input_contig_vcf, params['sam_path'], raw_output_contig_vcf, params['fasta_path'], contig , enable_whatshap)) - f.write('##fileformat=VCFv4.2\n') - f.write('##FILTER=\n') - c='##contig=\n' %chrom - f.write('##contig=\n' %chrom) - - f.write('##FORMAT=\n') - f.write('##FORMAT=\n') - f.write('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT %s\n' %params['sample']) - - in_dict_list=[] - - for mbase in range(start, end, 50000): - d = copy.deepcopy(params) - d['start']=mbase - d['end']=min(end,mbase+50000) - in_dict_list.append(d) - - result=pool.imap_unordered(get_indel_testing_candidates, in_dict_list) - - total_regions=len(in_dict_list) - completed=0 - for res in result: - pos, x0_test, x1_test, x2_test, alleles_seq=res - completed+=1 - - prev=0 - prev_len=0 - - if len(pos)==0: - continue - for batch in range(int(np.ceil(len(x0_test)/batch_size))): - batch_pos = pos[batch*batch_size:min((batch+1)*batch_size,len(pos))] + run_cmd("bcftools view -e 'GT=\"0\\0\"' %s|bgziptabix %s" %(raw_output_contig_vcf, output_contig_vcf)) + + run_cmd("whatshap haplotag --ignore-read-groups --ignore-linked-read --reference %s %s %s --regions %s:%d-%d --tag-supplementary -o - | samtools view -b -1 --write-index -o %s" %(params['fasta_path'], output_contig_vcf, params['sam_path'], contig, start, end, phased_bam), verbose=True) + run_cmd("touch -c %s.csi" %phased_bam) + if params['mode']=='snps': + counter_Q.put(1) + else: + indel_jobs=indel_dict[contig] + for chunk in indel_jobs: + chunk['sam_path']=phased_bam - batch_x0 = x0_test[batch*batch_size:min((batch+1)*batch_size,len(x0_test))] - batch_x1 = x1_test[batch*batch_size:min((batch+1)*batch_size,len(x1_test))] - batch_x2 = x2_test[batch*batch_size:min((batch+1)*batch_size,len(x2_test))] - batch_alleles_seq = alleles_seq[batch*batch_size:min((batch+1)*batch_size,len(alleles_seq))] + job_Q.put(('indel', chunk)) - batch_x_all=np.hstack([batch_x0, batch_x1, batch_x2]) - - batch_prob_all= indel_model(batch_x_all) - - batch_pred_all=np.argmax(batch_prob_all,axis=1) - - qual_all=-10*np.log10(1e-6+1-batch_prob_all) + indel_dict.pop(contig) + +def caller(params, job_Q, counter_Q, indel_dict, phased_snp_files_list, indel_files_list): + while len(indel_dict)>0 or not job_Q.empty(): + try: + job=job_Q.get(block=False) + if job[0]=='phase': + phase_run(job[1], params, indel_dict, job_Q, counter_Q, phased_snp_files_list) + + elif job[0]=='indel': + job_Q.put(job) + indel_run(params, indel_dict, job_Q, counter_Q, indel_files_list) - for j in range(len(batch_pred_all)): - neg_file.write('%s,%d,%s,%.4f,%.4f,%.4f,%.4f\n' %(chrom,batch_pos[j], rev_gt_map[batch_pred_all[j]], batch_prob_all[j,0], batch_prob_all[j,1], batch_prob_all[j,2], batch_prob_all[j,3])) - - q=min(999,qual_all[j,batch_pred_all[j]]) - if batch_pos[j]>prev: - - if batch_prob_all[j,0]<=0.95: - - q=-100*np.log10(1e-6+batch_prob_all[j,0]) - allele0_data, allele1_data,allele_total_data= batch_alleles_seq[j] - - if batch_pred_all[j]==1 and allele_total_data[0]: - gq=-100*np.log10(1+1e-6-batch_prob_all[j,1]) - s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1/1:%.2f\n' %(chrom, batch_pos[j], allele_total_data[0], allele_total_data[1], q, gq) - f.write(s) - prev=batch_pos[j]+max(len(allele_total_data[0]), len(allele_total_data[1])) - - else: - if allele0_data[0] and allele1_data[0]: - - if allele0_data[0]==allele1_data[0] and allele0_data[1]==allele1_data[1]: - gq=-100*np.log10(1+1e-6-batch_prob_all[j,1]) - s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1/1:%.2f\n' %(chrom, batch_pos[j], allele0_data[0], allele0_data[1], q, gq) - f.write(s) - prev=batch_pos[j]+max(len(allele0_data[0]), len(allele0_data[1])) + except queue.Empty: + continue - else: - ref1,alt1=allele0_data - ref2,alt2=allele1_data - - l=min(len(ref1),len(ref2)) - - if len(ref1)>len(ref2): - ref=ref1 - alt2=alt2+ref1[l:] - - else: - ref=ref2 - alt1=alt1+ref2[l:] - gq=-100*np.log10(1+1e-6-batch_prob_all[j,3]) - s='%s\t%d\t.\t%s\t%s,%s\t%.2f\tPASS\t.\tGT:GQ\t1|2:%.2f\n' %(chrom, batch_pos[j], ref, alt1, alt2, q, gq) - f.write(s) - prev=batch_pos[j]+max(len(ref), len(alt1),len(alt2)) - - elif allele0_data[0]: - gq=-100*np.log10(1+1e-6-batch_prob_all[j,2]) - s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t0|1:%.2f\n' %(chrom, batch_pos[j], allele0_data[0], allele0_data[1], q, gq) - f.write(s) - prev=batch_pos[j]+max(len(allele0_data[0]), len(allele0_data[1])) - - elif allele1_data[0]: - gq=-100*np.log10(1+1e-6-batch_prob_all[j,2]) - s='%s\t%d\t.\t%s\t%s\t%.2f\tPASS\t.\tGT:GQ\t1|0:%.2f\n' %(chrom, batch_pos[j], allele1_data[0], allele1_data[1], q, gq) - f.write(s) - prev=batch_pos[j]+max(len(allele1_data[0]), len(allele1_data[1])) - - batch_x0,batch_x1,batch_x2,batch_x=None,None,None,None - pos, x0_test, x1_test, x2_test=None,None,None,None - - print('%s: (%d/%d) regions completed.' %(str(datetime.datetime.now()), completed, total_regions),flush=True) - f.flush() - os.fsync(f.fileno()) - neg_file.flush() - os.fsync(neg_file.fileno()) - +def progress_bar(counter_Q, total_indel_jobs, mode, suppress_progress): + if not suppress_progress: + pbar = tqdm(total = total_indel_jobs) + if mode=='snps': + pbar.set_description("SNPs and BAM Phasing Progress") + + else: + pbar.set_description("Indel Calling Progress") + + for item in iter(counter_Q.get, None): + pbar.update() + +def call_manager(params): + pmanager = mp.Manager() + + indel_dict=pmanager.dict() + job_Q=pmanager.Queue() + counter_Q=pmanager.Queue() + phased_snp_files_list=pmanager.list() + indel_files_list=pmanager.list() + + contigs_list={} + for x in params['regions_list']: + if x[0] not in contigs_list: + contigs_list[x[0]]={'name':x[0], 'start':x[1], 'end':x[2]} + else: + contigs_list[x[0]]['start']=min(x[1], contigs_list[x[0]]['start']) + contigs_list[x[0]]['end']=max(x[2], contigs_list[x[0]]['end']) + + if params['mode']=='indels' or params['mode']=='all': + params['intermediate_indel_files_dir']=os.path.join(params['vcf_path'], 'intermediate_indel_files') + make_and_remove_path(params['intermediate_indel_files_dir']) + + if params['mode']=='snps' or params['mode']=='all': + params['intermediate_phase_files_dir']=os.path.join(params['vcf_path'], 'intermediate_phase_files') + make_and_remove_path(params['intermediate_phase_files_dir']) + + + total_phase_jobs=0 + total_indel_jobs=0 + + + if params['mode']=='indels': + for chunk in params['chunks_list']: + chunk['sam_path']=params['sam_path'] + job_Q.put(('indel', chunk)) + total_indel_jobs+=1 + + else: + for contig in contigs_list: + job_Q.put(('phase', contigs_list[contig])) + total_phase_jobs+=1 - neg_file.close() + if params['mode']=='all': + for chunk in params['chunks_list']: + if chunk['chrom'] not in indel_dict: + indel_dict[chunk['chrom']]=pmanager.list() + indel_dict[chunk['chrom']].append(chunk) + total_indel_jobs+=1 + + total_jobs=total_phase_jobs if params['mode']=='snps' else total_indel_jobs + + tqdm_proc = mp.Process(target=progress_bar, args=(counter_Q, total_jobs, params['mode'], params['suppress_progress'])) + tqdm_proc.start() + + shared_var = (params, job_Q, counter_Q, indel_dict, phased_snp_files_list, indel_files_list) + + handlers = [] + for hid in range(params['cpu']): + p = mp.Process(target=caller, args=shared_var); + p.start(); + handlers.append(p); + + + for job in handlers: + job.join() - run_cmd("bcftools sort %s.raw.vcf|bgziptabix %s.raw.vcf.gz" %(outfile, outfile)) + counter_Q.put(None) + tqdm_proc.join() + + output_files={'snps':None, 'indels':None, 'final':None} + + if params['mode']=='snps' or params['mode']=='all': + output_files['snps']=os.path.join(params['vcf_path'],'%s.snps.phased.vcf.gz' %params['prefix']) + + phased_snps_files=' '.join(phased_snp_files_list) + + run_cmd('bcftools concat %s --threads %d|bgziptabix %s' %(phased_snps_files, params['cpu'], output_files['snps'])) + + if params['mode']=='indels' or params['mode']=='all': + raw_indel_vcf=os.path.join(params['intermediate_indel_files_dir'],'%s.raw.indel.vcf' %params['prefix']) + output_files['indels']=os.path.join(params['vcf_path'],'%s.indels.vcf.gz' %params['prefix']) + + with open(raw_indel_vcf,'wb') as outfile: + outfile.write(b'##fileformat=VCFv4.2\n') + outfile.write(b'##FILTER=\n') + + #loop through contigs + for chrom in contigs_list: + outfile.write(b'##contig=\n' %bytes(chrom.encode('utf-8'))) + + outfile.write(b'##FORMAT=\n') + outfile.write(b'##FORMAT=\n') + outfile.write(b'##FORMAT=\n') + outfile.write(b'#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n' %bytes(params['sample'].encode('utf-8'))) - return outfile + for int_file in indel_files_list: + with open(int_file,'rb') as fd: + shutil.copyfileobj(fd, outfile) + + print('\n%s: Compressing and indexing indel calls.' %str(datetime.datetime.now())) + run_cmd('bcftools norm -c w --fasta-ref %s --threads %d %s|bcftools sort|bgziptabix %s' %(params['fasta_path'], params['cpu'], raw_indel_vcf, output_files['indels']), error=True) + + if params['mode']=='all': + final_vcf=os.path.join(params['vcf_path'],'%s.vcf.gz' %params['prefix']) + output_files['final']=final_vcf + + run_cmd('bcftools concat --threads %d %s %s -a | bgziptabix %s' %(params['cpu'], output_files['snps'], output_files['indels'], output_files['final']), error=True) + + + return output_files \ No newline at end of file diff --git a/nanocaller_src/snpCaller.py b/nanocaller_src/snpCaller.py index 9b1f4c3..7d366b3 100644 --- a/nanocaller_src/snpCaller.py +++ b/nanocaller_src/snpCaller.py @@ -1,6 +1,5 @@ import time,os,copy,argparse,subprocess, sys, pysam, datetime - - +from tqdm import tqdm import numpy as np import multiprocessing as mp import tensorflow as tf @@ -8,7 +7,9 @@ from .generate_SNP_pileups import get_snp_testing_candidates from intervaltree import Interval, IntervalTree from .utils import * +from multiprocessing import current_process +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" num_to_base_map={0:'A',1:'G',2:'T',3:'C'} snp_model_dict={'NanoCaller1':'release_data/ONT_models/SNPs/NanoCaller1_beta/model-rt-1', @@ -50,26 +51,11 @@ def get_SNP_model(snp_model): train_coverage=0 return snp_model_path, train_coverage - -def test_model(params,pool): - print('%s: SNP calling started.' %(str(datetime.datetime.now())), flush=True) - - vcf_path,prefix= params['vcf_path'], params['prefix'] - chrom,start,end=params['chrom'], params['start'],params['end'] - - - coverage=get_coverage(params, pool) - - print('%s: Coverage=%.2fx.' %(str(datetime.datetime.now()), coverage), flush=True) - - if coverage==0: - print('%s: No coverage found for the contig %s.' %(str(datetime.datetime.now()), chrom), flush=True) - return - if params['maxcov'] < coverage: - coverage=params['maxcov'] - - print('%s: Coverage exceeds maxcov parameter. Downsampling to --maxcov parameter=%.2fx.' %(str(datetime.datetime.now()), coverage), flush=True) +def caller(params, chunks_Q, counter_Q, snp_files): + cur_p = current_process() + curr_vcf_path=os.path.join(params['intermediate_snp_files_dir'],'%s.%d.snps.vcf' %(params['prefix'], cur_p._identity[0])) + snp_files.append(curr_vcf_path) n_input=[5,41,5] @@ -77,115 +63,156 @@ def test_model(params,pool): if model_path==None: print('Invalid SNP model name or path', flush=True) - return + sys.exit(1) train_coverage=coverage if train_coverage==0 else train_coverage snp_model=SNP_model() - snp_model.load_weights(model_path) + snp_model.load_weights(model_path).expect_partial() batch_size=1000 - neg_file=open(os.path.join(vcf_path,'%s.snp_stats' %prefix),'w') - neg_file.write('pos,ref,prob_GT,prob_A,prob_G,prob_T,prob_C,DP,freq\n') - - with open(os.path.join(vcf_path,'%s.snps.vcf' %prefix),'w') as f: - - f.write('##fileformat=VCFv4.2\n') - f.write('##FILTER=\n') - c='##contig=\n' %chrom - f.write('##contig=\n' %chrom) - f.write('##FORMAT=\n') - f.write('##FORMAT=\n') - f.write('##FORMAT=\n') - f.write('#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT %s\n' %params['sample']) - - in_dict_list=[] - - for mbase in range(start,end,200000): - d = copy.deepcopy(params) - d['start']=mbase - d['end']=min(end,mbase+200000) - in_dict_list.append(d) - - result=pool.imap_unordered(get_snp_testing_candidates, in_dict_list) - - total_regions=len(in_dict_list) - completed=0 - - for res in result: - - pos,test_ref,x_test,dp,freq=res - - completed+=1 - if len(pos)==0: - continue - - - test_ref=test_ref.astype(np.float16) + + with open(curr_vcf_path,'w') as f: + while not chunks_Q.empty(): + chunk = chunks_Q.get(block=False) + chrom=chunk['chrom'] + pos, test_ref, x_test, dp, freq = get_snp_testing_candidates(params, chunk) - - x_test=x_test.astype(np.float32) + if len(pos)>0: + test_ref=test_ref.astype(np.float16) + x_test=x_test.astype(np.float32) + + coverage=np.mean(dp) + + x_test[:,1:,:,:4]=x_test[:,1:,:,:4]*(train_coverage/coverage) + + for batch in range(int(np.ceil(len(x_test)/batch_size))): + batch_freq=freq[batch*batch_size:min((batch+1)*batch_size,len(freq))] + batch_dp=dp[batch*batch_size:min((batch+1)*batch_size,len(dp))] + batch_pos = pos[batch*batch_size:min((batch+1)*batch_size,len(pos))] + batch_x = x_test[batch*batch_size:min((batch+1)*batch_size,len(x_test))] + + batch_ref = test_ref[batch*batch_size:min((batch+1)*batch_size, len(test_ref))] + + batch_coverage=np.mean(batch_dp) + + batch_x[:,1:,:,:4]=batch_x[:,1:,:,:4]*(train_coverage/batch_coverage) + + batch_prob_A, batch_prob_G, batch_prob_T, batch_prob_C, batch_prob_GT = snp_model([batch_x, batch_ref[:,0][:,np.newaxis], batch_ref[:,1][:,np.newaxis], batch_ref[:,2][:,np.newaxis], batch_ref[:,3][:,np.newaxis]]) + + batch_pred_GT=np.argmax(batch_prob_GT,axis=1) + + batch_probs=np.hstack([batch_prob_A[:,1][:,np.newaxis], batch_prob_G[:,1][:,np.newaxis], batch_prob_T[:,1][:,np.newaxis], batch_prob_C[:,1][:,np.newaxis]]) + + + batch_pred=np.argsort(batch_probs,axis=1) + + batch_ref_vec=batch_ref + + batch_ref=np.argmax(batch_ref,1) + + batch_pred_GT=np.sum(batch_probs>=0.5,axis=1) + + sort_probs=np.sort(batch_probs,axis=1) + + for j in range(len(batch_pred_GT)): + + if batch_pred_GT[j]>=2: # if het + pred1,pred2=batch_pred[j,-1],batch_pred[j,-2] + if pred1==batch_ref[j]: + s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred2], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','0/1', batch_dp[j], batch_freq[j]) + f.write(s) + + elif pred2==batch_ref[j] and batch_probs[j,pred2]>=0.5: + s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom,batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','1/0', batch_dp[j], batch_freq[j]) + f.write(s) + + elif pred2!=batch_ref[j] and pred1!=batch_ref[j] and batch_probs[j,pred2]>=0.5: + s='%s\t%d\t.\t%s\t%s,%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %\ + (chrom,batch_pos[j],num_to_base_map[batch_ref[j]],num_to_base_map[pred1],num_to_base_map[pred2],min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','1/2', batch_dp[j], batch_freq[j]) + f.write(s) + + elif batch_pred_GT[j]==1 and batch_ref[j]!=batch_pred[j,-1] and batch_probs[j,batch_pred[j,-1]]>=0.5: + pred1=batch_pred[j,-1] + s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred1])),'PASS','1/1', batch_dp[j], batch_freq[j]) + f.write(s) - x_test[:,1:,:,:4]=x_test[:,1:,:,:4]*(train_coverage/coverage) - - for batch in range(int(np.ceil(len(x_test)/batch_size))): - batch_freq=freq[batch*batch_size:min((batch+1)*batch_size,len(freq))] - batch_dp=dp[batch*batch_size:min((batch+1)*batch_size,len(dp))] - batch_pos = pos[batch*batch_size:min((batch+1)*batch_size,len(pos))] - batch_x = x_test[batch*batch_size:min((batch+1)*batch_size,len(x_test))] - - batch_ref = test_ref[batch*batch_size:min((batch+1)*batch_size, len(test_ref))] - batch_prob_A, batch_prob_G, batch_prob_T, batch_prob_C, batch_prob_GT = snp_model([batch_x, batch_ref[:,0][:,np.newaxis], batch_ref[:,1][:,np.newaxis], batch_ref[:,2][:,np.newaxis], batch_ref[:,3][:,np.newaxis]]) - - batch_pred_GT=np.argmax(batch_prob_GT,axis=1) - - batch_probs=np.hstack([batch_prob_A[:,1][:,np.newaxis], batch_prob_G[:,1][:,np.newaxis], batch_prob_T[:,1][:,np.newaxis], batch_prob_C[:,1][:,np.newaxis]]) - - - batch_pred=np.argsort(batch_probs,axis=1) - - batch_ref_vec=batch_ref - - batch_ref=np.argmax(batch_ref,1) - - batch_pred_GT=np.sum(batch_probs>=0.5,axis=1) - - sort_probs=np.sort(batch_probs,axis=1) - - for j in range(len(batch_pred_GT)): - - if batch_pred_GT[j]>=2: # if het - pred1,pred2=batch_pred[j,-1],batch_pred[j,-2] - if pred1==batch_ref[j]: - s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred2], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','0/1', batch_dp[j], batch_freq[j]) - f.write(s) - - elif pred2==batch_ref[j] and batch_probs[j,pred2]>=0.5: - s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom,batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','1/0', batch_dp[j], batch_freq[j]) - f.write(s) - - elif pred2!=batch_ref[j] and pred1!=batch_ref[j] and batch_probs[j,pred2]>=0.5: - s='%s\t%d\t.\t%s\t%s,%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %\ - (chrom,batch_pos[j],num_to_base_map[batch_ref[j]],num_to_base_map[pred1],num_to_base_map[pred2],min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred2])),'PASS','1/2', batch_dp[j], batch_freq[j]) - f.write(s) - - elif batch_pred_GT[j]==1 and batch_ref[j]!=batch_pred[j,-1] and batch_probs[j,batch_pred[j,-1]]>=0.5: - pred1=batch_pred[j,-1] - s='%s\t%d\t.\t%s\t%s\t%.3f\t%s\t.\tGT:DP:FQ\t%s:%d:%.4f\n' %(chrom, batch_pos[j], num_to_base_map[batch_ref[j]], num_to_base_map[pred1], min(999,-100*np.log10(1e-10+ 1-batch_probs[j,pred1])),'PASS','1/1', batch_dp[j], batch_freq[j]) - f.write(s) - - - neg_file.write('%d,%s,%.4f,%.4f,%.4f,%.4f,%.4f,%d,%.4f\n' %(batch_pos[j],num_to_base_map[batch_ref[j]], batch_prob_GT[j,0], batch_probs[j,0], batch_probs[j,1], batch_probs[j,2], batch_probs[j,3], batch_dp[j], batch_freq[j])) - - - print('%s: (%d/%d) regions completed.' %(str(datetime.datetime.now()), completed, total_regions),flush=True) f.flush() os.fsync(f.fileno()) - neg_file.flush() - os.fsync(neg_file.fileno()) + counter_Q.put(1) + +def progress_bar(counter_Q, total_snp_jobs, suppress_progress): + if not suppress_progress: + pbar = tqdm(total = total_snp_jobs) + pbar.set_description("SNP Calling Progress") + for item in iter(counter_Q.get, None): + pbar.update() + + +def call_manager(params): + + pmanager = mp.Manager() + chunks_Q = pmanager.Queue() + snp_files = pmanager.list() + counter_Q = pmanager.Queue() + + chrom_list=set(x[0] for x in params['regions_list']) + + total_snp_jobs=0 + for chunk in params['chunks_list']: + chunks_Q.put(chunk) + total_snp_jobs+=1 + + + params['intermediate_snp_files_dir']=os.path.join(params['vcf_path'], 'intermediate_snp_files') + make_and_remove_path(params['intermediate_snp_files_dir']) + + + tqdm_proc = mp.Process(target=progress_bar, args=(counter_Q, total_snp_jobs, params['suppress_progress'])) + tqdm_proc.start() + + + shared_var = (params, chunks_Q, counter_Q, snp_files) + handlers = [] + for hid in range(params['cpu']): + p = mp.Process(target=caller, args=shared_var); + p.start(); + handlers.append(p); + + + for job in handlers: + job.join() + + + counter_Q.put(None) + tqdm_proc.join() + + output_file_path=os.path.join(params['vcf_path'],'%s.snps.vcf.gz' %params['prefix']) + + temp_output_file_path=os.path.join(params['intermediate_snp_files_dir'],'combined.snps.vcf') + + print('\n%s: Combining SNP calls.' %str(datetime.datetime.now())) + + with open(temp_output_file_path,'wb') as outfile: + outfile.write(b'##fileformat=VCFv4.2\n') + outfile.write(b'##FILTER=\n') + + #loop through contigs + for chrom in chrom_list: + outfile.write(b'##contig=\n' %bytes(chrom.encode('utf-8'))) + + outfile.write(b'##FORMAT=\n') + outfile.write(b'##FORMAT=\n') + outfile.write(b'##FORMAT=\n') + outfile.write(b'#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n' %bytes(params['sample'].encode('utf-8'))) + + for int_file in snp_files: + with open(int_file,'rb') as fd: + shutil.copyfileobj(fd, outfile) + + print('\n%s: Compressing and indexing SNP calls.' %str(datetime.datetime.now())) + + run_cmd("bcftools sort %s|bgziptabix %a" %(temp_output_file_path, output_file_path), error=True) - output_file=os.path.join(vcf_path,'%s.snps' %prefix) - run_cmd("bcftools sort %s.vcf|bgziptabix %s.vcf.gz" %(output_file, output_file)) + return output_file_path - return output_file - diff --git a/nanocaller_src/utils.py b/nanocaller_src/utils.py index f86a1ba..a41fe9b 100644 --- a/nanocaller_src/utils.py +++ b/nanocaller_src/utils.py @@ -1,14 +1,82 @@ from subprocess import PIPE, Popen -import os, shutil +import os, shutil, pysam -def run_cmd(cmd, verbose=False, output=False,error=False): +def get_regions_list(args): + regions_list=[] + if args.wgs_contigs: + sam_file=pysam.Samfile(args.bam) + + for contig in list(range(1,23)) + ['X','Y']: + contig='chr'+str(contig) if args.wgs_contigs=='chr1-22XY' else str(contig) + if sam_file.is_valid_reference_name(contig): + regions_list.append((contig, 1, sam_file.get_reference_length(contig))) + + elif args.regions: + sam_file=pysam.Samfile(args.bam) + for r in args.regions: + r2=r.split(':') + if len(r2)==1: + r2=r2[0] + if sam_file.is_valid_reference_name(r2): + regions_list.append((r2, 1, sam_file.get_reference_length(r2))) + else: + print('\n%s: Contig %s not present in the BAM file.' %(str(datetime.datetime.now()), r2), flush=True) + + elif len(r2)==2: + cord=r2[1].split('-') + if len(cord)==2: + regions_list.append((r2[0], int(cord[0]), int(cord[1]))) + else: + print('\n%s: Invalid region %s.' %(str(datetime.datetime.now()), r), flush=True) + + else: + print('\n%s: Invalid region %s.' %(str(datetime.datetime.now()), r), flush=True) + + elif args.bed: + sam_file=pysam.Samfile(args.bam) + with open(args.bed, 'r') as bed_file: + for line in bed_file: + line=line.rstrip('\n').split() + if sam_file.is_valid_reference_name(line[0]): + regions_list.append((line[0], int(line[1]), int(line[2]))) + else: + print('\n%s: Contig %s not present in the BAM file.' %(str(datetime.datetime.now()), line[0]), flush=True) + else: + sam_file=pysam.Samfile(args.bam) + regions_list=[(r, 1, sam_file.get_reference_length(r)) for r in sam_file.references] + + if len(regions_list)==0: + print('\n%s: No valid regions found.' %str(datetime.datetime.now()), flush=True) + sys.exit(2) + + return regions_list + + + +def get_chunks(regions_list, cpu, max_chunk_size=500000, min_chunk_size=10000): + chunks_list=[] + total_bases=sum(region[2]-region[1]+1 for region in regions_list) + + + chunksize=min(max_chunk_size, max(min_chunk_size, total_bases//cpu+1)) + + for region in regions_list: + contig=region[0] + start=region[1] + end=region[2] + for chunk in range(start, end, chunksize): + chunks_list.append({'chrom':contig, 'start': chunk, 'end' : min(end, chunk + chunksize)}) + + return chunks_list + +def run_cmd(cmd, verbose=False, output=False, error=False): stream=Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE) stdout, stderr = stream.communicate() stdout=stdout.decode('utf-8') stderr=stderr.decode('utf-8') - if stderr: + if stderr and error: print(stderr, flush=True) if verbose: